net/unix/af_unix.c

   1 /*
   2  * NET4:        Implementation of BSD Unix domain sockets.
   3  *
   4  * Authors:     Alan Cox, <alan@lxorguk.ukuu.org.uk>
   5  *
   6  *              This program is free software; you can redistribute it and/or
   7  *              modify it under the terms of the GNU General Public License
   8  *              as published by the Free Software Foundation; either version
   9  *              2 of the License, or (at your option) any later version.
  10  *
  11  * Fixes:
  12  *              Linus Torvalds  :       Assorted bug cures.
  13  *              Niibe Yutaka    :       async I/O support.
  14  *              Carsten Paeth   :       PF_UNIX check, address fixes.
  15  *              Alan Cox        :       Limit size of allocated blocks.
  16  *              Alan Cox        :       Fixed the stupid socketpair bug.
  17  *              Alan Cox        :       BSD compatibility fine tuning.
  18  *              Alan Cox        :       Fixed a bug in connect when interrupted.
  19  *              Alan Cox        :       Sorted out a proper draft version of
  20  *                                      file descriptor passing hacked up from
  21  *                                      Mike Shaver's work.
  22  *              Marty Leisner   :       Fixes to fd passing
  23  *              Nick Nevin      :       recvmsg bugfix.
  24  *              Alan Cox        :       Started proper garbage collector
  25  *              Heiko EiBfeldt  :       Missing verify_area check
  26  *              Alan Cox        :       Started POSIXisms
  27  *              Andreas Schwab  :       Replace inode by dentry for proper
  28  *                                      reference counting
  29  *              Kirk Petersen   :       Made this a module
  30  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
  31  *                                      Lots of bug fixes.
  32  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
  33  *                                      by the above two patches.
  34  *           Andrea Arcangeli   :       If possible we block in connect(2)
  35  *                                      if the max backlog of the listen socket
  36  *                                      has been reached. This won't break
  37  *                                      old apps and it will avoid a huge amount
  38  *                                      of socks hashed (this is for unix_gc()
  39  *                                      performance reasons).
  40  *                                      Security fix that limits the max
  41  *                                      number of socks to 2*max_files and
  42  *                                      the number of skb queueable in the
  43  *                                      dgram receiver.
  44  *              Artur Skawina   :       Hash function optimizations
  45  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
  46  *            Malcolm Beattie   :       Set peercred for socketpair
  47  *           Michal Ostrowski   :       Module initialization cleanup.
  48  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
  49  *                                      the core infrastructure is doing that
  50  *                                      for all net proto families now (2.5.69+)
  51  *
  52  *
  53  * Known differences from reference BSD that was tested:
  54  *
  55  *      [TO FIX]
  56  *      ECONNREFUSED is not returned from one end of a connected() socket to the
  57  *              other the moment one end closes.
  58  *      fstat() doesn't return st_dev=0, nor does it give the blksize as high water mark
  59  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
  60  *      [NOT TO FIX]
  61  *      accept() returns a path name even if the connecting socket has closed
  62  *              in the meantime (BSD loses the path and gives up).
  63  *      accept() returns 0 length path for an unbound connector. BSD returns 16
  64  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
  65  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
  66  *      BSD af_unix apparently has connect forgetting to block properly.
  67  *              (need to check this with the POSIX spec in detail)
  68  *
  69  * Differences from 2.0.0-11-... (ANK)
  70  *      Bug fixes and improvements.
  71  *              - client shutdown killed server socket.
  72  *              - removed all useless cli/sti pairs.
  73  *
  74  *      Semantic changes/extensions.
  75  *              - generic control message passing.
  76  *              - SCM_CREDENTIALS control message.
  77  *              - "Abstract" (not FS based) socket bindings.
  78  *                Abstract names are sequences of bytes (not zero terminated)
  79  *                started by 0, so that this name space does not intersect
  80  *                with BSD names.
  81  */
  82
  83 #include <linux/module.h>
  84 #include <linux/kernel.h>
  85 #include <linux/signal.h>
  86 #include <linux/sched.h>
  87 #include <linux/errno.h>
  88 #include <linux/string.h>
  89 #include <linux/stat.h>
  90 #include <linux/dcache.h>
  91 #include <linux/namei.h>
  92 #include <linux/socket.h>
  93 #include <linux/un.h>
  94 #include <linux/fcntl.h>
  95 #include <linux/termios.h>
  96 #include <linux/sockios.h>
  97 #include <linux/net.h>
  98 #include <linux/in.h>
  99 #include <linux/fs.h>
 100 #include <linux/slab.h>
 101 #include <asm/uaccess.h>
 102 #include <linux/skbuff.h>
 103 #include <linux/netdevice.h>
 104 #include <net/net_namespace.h>
 105 #include <net/sock.h>
 106 #include <net/tcp_states.h>
 107 #include <net/af_unix.h>
 108 #include <linux/proc_fs.h>
 109 #include <linux/seq_file.h>
 110 #include <net/scm.h>
 111 #include <linux/init.h>
 112 #include <linux/poll.h>
 113 #include <linux/rtnetlink.h>
 114 #include <linux/mount.h>
 115 #include <net/checksum.h>
 116 #include <linux/security.h>
 117 #include <linux/freezer.h>
 118
/*
 * Global AF_UNIX socket hash table, twice UNIX_HASH_SIZE buckets wide.
 * Buckets at UNIX_HASH_SIZE and above are handed out by
 * unix_sockets_unbound(); the lookup helpers in this file index the table
 * with a name hash xor'ed with the socket type, or with the low bits of an
 * inode number.  All chains are protected by unix_table_lock.
 */
struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
EXPORT_SYMBOL_GPL(unix_socket_table);
DEFINE_SPINLOCK(unix_table_lock);
EXPORT_SYMBOL_GPL(unix_table_lock);
/*
 * Number of AF_UNIX socks currently accounted: bumped in unix_create1()
 * and dropped again in unix_sock_destructor() (or on allocation failure).
 */
static atomic_long_t unix_nr_socks;
 124
 125
 126 static struct hlist_head *unix_sockets_unbound(void *addr)
 127 {
 128         unsigned long hash = (unsigned long)addr;
 129
 130         hash ^= hash >> 16;
 131         hash ^= hash >> 8;
 132         hash %= UNIX_HASH_SIZE;
 133         return &unix_socket_table[UNIX_HASH_SIZE + hash];
 134 }
 135
/*
 * True when the hash stored in the socket's address is below
 * UNIX_HASH_SIZE.  Dereferences unix_sk(sk)->addr unconditionally, so it
 * must only be used on a socket that already has an address.
 * NOTE(review): judging by the name this separates abstract names from
 * filesystem ones; the code that assigns filesystem hashes is not in view.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
 137
 138 #ifdef CONFIG_SECURITY_NETWORK
 139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 140 {
 141         memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
 142 }
 143
 144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 145 {
 146         scm->secid = *UNIXSID(skb);
 147 }
 148 #else
 149 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 150 { }
 151
 152 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
 153 { }
 154 #endif /* CONFIG_SECURITY_NETWORK */
 155
 156 /*
 157  *  SMP locking strategy:
 158  *    hash table is protected with spinlock unix_table_lock
 159  *    each socket state is protected by separate spin lock.
 160  */
 161
 162 static inline unsigned int unix_hash_fold(__wsum n)
 163 {
 164         unsigned int hash = (__force unsigned int)n;
 165
 166         hash ^= hash>>16;
 167         hash ^= hash>>8;
 168         return hash&(UNIX_HASH_SIZE-1);
 169 }
 170
/*
 * Accessor for a socket's peer pointer.  Expands to an lvalue, so it is
 * used both to read the peer and to assign it (e.g. unix_release_sock()
 * stores NULL through it).
 */
#define unix_peer(sk) (unix_sk(sk)->peer)
 172
 173 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
 174 {
 175         return unix_peer(osk) == sk;
 176 }
 177
 178 static inline int unix_may_send(struct sock *sk, struct sock *osk)
 179 {
 180         return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
 181 }
 182
 183 static inline int unix_recvq_full(struct sock const *sk)
 184 {
 185         return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
 186 }
 187
 188 struct sock *unix_peer_get(struct sock *s)
 189 {
 190         struct sock *peer;
 191
 192         unix_state_lock(s);
 193         peer = unix_peer(s);
 194         if (peer)
 195                 sock_hold(peer);
 196         unix_state_unlock(s);
 197         return peer;
 198 }
 199 EXPORT_SYMBOL_GPL(unix_peer_get);
 200
 201 static inline void unix_release_addr(struct unix_address *addr)
 202 {
 203         if (atomic_dec_and_test(&addr->refcnt))
 204                 kfree(addr);
 205 }
 206
 207 /*
 208  *      Check unix socket name:
 209  *              - should be not zero length.
 210  *              - if started by not zero, should be NULL terminated (FS object)
 211  *              - if started by zero, it is abstract name.
 212  */
 213
 214 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
 215 {
 216         if (len <= sizeof(short) || len > sizeof(*sunaddr))
 217                 return -EINVAL;
 218         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
 219                 return -EINVAL;
 220         if (sunaddr->sun_path[0]) {
 221                 /*
 222                  * This may look like an off by one error but it is a bit more
 223                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
 224                  * sun_path[108] doesn't as such exist.  However in kernel space
 225                  * we are guaranteed that it is a valid memory location in our
 226                  * kernel address buffer.
 227                  */
 228                 ((char *)sunaddr)[len] = 0;
 229                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
 230                 return len;
 231         }
 232
 233         *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
 234         return len;
 235 }
 236
/*
 * Unlink @sk from whatever hash chain it is on.  Does no locking itself;
 * the callers in this file hold unix_table_lock around it.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
 241
/*
 * Link @sk onto the chain @list.  Warns if @sk is still hashed somewhere.
 * Does no locking itself; the callers in this file hold unix_table_lock
 * around it.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        WARN_ON(!sk_unhashed(sk));
        sk_add_node(sk, list);
}
 247
/* Locked variant of __unix_remove_socket(): takes unix_table_lock itself. */
static inline void unix_remove_socket(struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_remove_socket(sk);
        spin_unlock(&unix_table_lock);
}
 254
/* Locked variant of __unix_insert_socket(): takes unix_table_lock itself. */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_insert_socket(list, sk);
        spin_unlock(&unix_table_lock);
}
 261
 262 static struct sock *__unix_find_socket_byname(struct net *net,
 263                                               struct sockaddr_un *sunname,
 264                                               int len, int type, unsigned int hash)
 265 {
 266         struct sock *s;
 267
 268         sk_for_each(s, &unix_socket_table[hash ^ type]) {
 269                 struct unix_sock *u = unix_sk(s);
 270
 271                 if (!net_eq(sock_net(s), net))
 272                         continue;
 273
 274                 if (u->addr->len == len &&
 275                     !memcmp(u->addr->name, sunname, len))
 276                         goto found;
 277         }
 278         s = NULL;
 279 found:
 280         return s;
 281 }
 282
 283 static inline struct sock *unix_find_socket_byname(struct net *net,
 284                                                    struct sockaddr_un *sunname,
 285                                                    int len, int type,
 286                                                    unsigned int hash)
 287 {
 288         struct sock *s;
 289
 290         spin_lock(&unix_table_lock);
 291         s = __unix_find_socket_byname(net, sunname, len, type, hash);
 292         if (s)
 293                 sock_hold(s);
 294         spin_unlock(&unix_table_lock);
 295         return s;
 296 }
 297
 298 static struct sock *unix_find_socket_byinode(struct inode *i)
 299 {
 300         struct sock *s;
 301
 302         spin_lock(&unix_table_lock);
 303         sk_for_each(s,
 304                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
 305                 struct dentry *dentry = unix_sk(s)->path.dentry;
 306
 307                 if (dentry && dentry->d_inode == i) {
 308                         sock_hold(s);
 309                         goto found;
 310                 }
 311         }
 312         s = NULL;
 313 found:
 314         spin_unlock(&unix_table_lock);
 315         return s;
 316 }
 317
 318 static inline int unix_writable(struct sock *sk)
 319 {
 320         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
 321 }
 322
 323 static void unix_write_space(struct sock *sk)
 324 {
 325         struct socket_wq *wq;
 326
 327         rcu_read_lock();
 328         if (unix_writable(sk)) {
 329                 wq = rcu_dereference(sk->sk_wq);
 330                 if (wq_has_sleeper(wq))
 331                         wake_up_interruptible_sync_poll(&wq->wait,
 332                                 POLLOUT | POLLWRNORM | POLLWRBAND);
 333                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 334         }
 335         rcu_read_unlock();
 336 }
 337
 338 /* When dgram socket disconnects (or changes its peer), we clear its receive
 339  * queue of packets arrived from previous peer. First, it allows to do
 340  * flow control based only on wmem_alloc; second, sk connected to peer
 341  * may receive messages only from that peer. */
 342 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
 343 {
 344         if (!skb_queue_empty(&sk->sk_receive_queue)) {
 345                 skb_queue_purge(&sk->sk_receive_queue);
 346                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
 347
 348                 /* If one link of bidirectional dgram pipe is disconnected,
 349                  * we signal error. Messages are lost. Do not make this,
 350                  * when peer was not connected to us.
 351                  */
 352                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
 353                         other->sk_err = ECONNRESET;
 354                         other->sk_error_report(other);
 355                 }
 356         }
 357 }
 358
 359 static void unix_sock_destructor(struct sock *sk)
 360 {
 361         struct unix_sock *u = unix_sk(sk);
 362
 363         skb_queue_purge(&sk->sk_receive_queue);
 364
 365         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 366         WARN_ON(!sk_unhashed(sk));
 367         WARN_ON(sk->sk_socket);
 368         if (!sock_flag(sk, SOCK_DEAD)) {
 369                 printk(KERN_INFO "Attempt to release alive unix socket: %p\n", sk);
 370                 return;
 371         }
 372
 373         if (u->addr)
 374                 unix_release_addr(u->addr);
 375
 376         atomic_long_dec(&unix_nr_socks);
 377         local_bh_disable();
 378         sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 379         local_bh_enable();
 380 #ifdef UNIX_REFCNT_DEBUG
 381         printk(KERN_DEBUG "UNIX %p is destroyed, %ld are still alive.\n", sk,
 382                 atomic_long_read(&unix_nr_socks));
 383 #endif
 384 }
 385
/*
 * Tear down a unix socket.
 *
 * @sk:      the sock being released
 * @embrion: non-zero when @sk was found queued on a listening socket that
 *           is itself being released (see the recursive call below);
 *           unix_release() passes 0.  A non-zero value makes the peer see
 *           ECONNRESET even if @sk has no unread data.
 *
 * The sock is unhashed, orphaned and marked closed, its peer (if any) is
 * notified and un-referenced, the receive queue is drained, the bound
 * path is released and finally the caller's reference on @sk is dropped.
 */
static void unix_release_sock(struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct path path;
        struct sock *skpair;
        struct sk_buff *skb;
        int state;

        unix_remove_socket(sk);

        /* Clear state */
        unix_state_lock(sk);
        sock_orphan(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Take over the path reference; it is put after the lock is gone. */
        path         = u->path;
        u->path.dentry = NULL;
        u->path.mnt = NULL;
        /* Remember the old state: a listener needs its queue unwound. */
        state = sk->sk_state;
        sk->sk_state = TCP_CLOSE;
        unix_state_unlock(sk);

        wake_up_interruptible_all(&u->peer_wait);

        skpair = unix_peer(sk);

        if (skpair != NULL) {
                /* Only connection-oriented peers are shut down and told. */
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_lock(skpair);
                        /* No more writes */
                        skpair->sk_shutdown = SHUTDOWN_MASK;
                        /* Unread data (or an embryo) means data was lost. */
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                skpair->sk_err = ECONNRESET;
                        unix_state_unlock(skpair);
                        skpair->sk_state_change(skpair);
                        sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
                }
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /*
                 * On a listening socket each queued skb stands for a
                 * not-yet-accepted connection; release its sock as an
                 * embryo.
                 */
                if (state == TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }

        if (path.dentry)
                path_put(&path);

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What the above comment does talk about? --ANK(980817)
         */

        if (unix_tot_inflight)
                unix_gc();              /* Garbage collect fds */
}
 456
 457 static void init_peercred(struct sock *sk)
 458 {
 459         put_pid(sk->sk_peer_pid);
 460         if (sk->sk_peer_cred)
 461                 put_cred(sk->sk_peer_cred);
 462         sk->sk_peer_pid  = get_pid(task_tgid(current));
 463         sk->sk_peer_cred = get_current_cred();
 464 }
 465
 466 static void copy_peercred(struct sock *sk, struct sock *peersk)
 467 {
 468         put_pid(sk->sk_peer_pid);
 469         if (sk->sk_peer_cred)
 470                 put_cred(sk->sk_peer_cred);
 471         sk->sk_peer_pid  = get_pid(peersk->sk_peer_pid);
 472         sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
 473 }
 474
 475 static int unix_listen(struct socket *sock, int backlog)
 476 {
 477         int err;
 478         struct sock *sk = sock->sk;
 479         struct unix_sock *u = unix_sk(sk);
 480         struct pid *old_pid = NULL;
 481
 482         err = -EOPNOTSUPP;
 483         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
 484                 goto out;       /* Only stream/seqpacket sockets accept */
 485         err = -EINVAL;
 486         if (!u->addr)
 487                 goto out;       /* No listens on an unbound socket */
 488         unix_state_lock(sk);
 489         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
 490                 goto out_unlock;
 491         if (backlog > sk->sk_max_ack_backlog)
 492                 wake_up_interruptible_all(&u->peer_wait);
 493         sk->sk_max_ack_backlog  = backlog;
 494         sk->sk_state            = TCP_LISTEN;
 495         /* set credentials so connect can copy them */
 496         init_peercred(sk);
 497         err = 0;
 498
 499 out_unlock:
 500         unix_state_unlock(sk);
 501         put_pid(old_pid);
 502 out:
 503         return err;
 504 }
 505
/*
 * Forward declarations of the socket operations, needed because the
 * proto_ops tables below reference them before they are defined.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static unsigned int unix_dgram_poll(struct file *, struct socket *,
                                    poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t);
static int unix_seqpacket_recvmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t, int);
 532
 533 static void unix_set_peek_off(struct sock *sk, int val)
 534 {
 535         struct unix_sock *u = unix_sk(sk);
 536
 537         mutex_lock(&u->readlock);
 538         sk->sk_peek_off = val;
 539         mutex_unlock(&u->readlock);
 540 }
 541
 542
/*
 * Operations for SOCK_STREAM unix sockets (selected in unix_create()).
 * Connection oriented: uses the stream connect/accept/listen handlers,
 * unix_poll and the stream send/receive paths.  Socket options, mmap and
 * sendpage are not supported (sock_no_* stubs).
 */
static const struct proto_ops unix_stream_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_stream_sendmsg,
        .recvmsg =      unix_stream_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 564
/*
 * Operations for SOCK_DGRAM unix sockets (unix_create() also maps
 * SOCK_RAW onto these).  Connectionless: accept and listen are rejected
 * via the sock_no_* stubs, connect/poll/sendmsg/recvmsg use the dgram
 * handlers.
 */
static const struct proto_ops unix_dgram_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_dgram_connect,
        .socketpair =   unix_socketpair,
        .accept =       sock_no_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_dgram_sendmsg,
        .recvmsg =      unix_dgram_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 586
/*
 * Operations for SOCK_SEQPACKET unix sockets (selected in unix_create()).
 * A hybrid: connection setup (connect/accept/listen) is shared with the
 * stream table, polling is shared with the dgram table, and
 * sendmsg/recvmsg have seqpacket-specific entry points.
 */
static const struct proto_ops unix_seqpacket_ops = {
        .family =       PF_UNIX,
        .owner =        THIS_MODULE,
        .release =      unix_release,
        .bind =         unix_bind,
        .connect =      unix_stream_connect,
        .socketpair =   unix_socketpair,
        .accept =       unix_accept,
        .getname =      unix_getname,
        .poll =         unix_dgram_poll,
        .ioctl =        unix_ioctl,
        .listen =       unix_listen,
        .shutdown =     unix_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      unix_seqpacket_sendmsg,
        .recvmsg =      unix_seqpacket_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
        .set_peek_off = unix_set_peek_off,
};
 608
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size
 * makes every allocated sock large enough to hold a struct unix_sock.
 */
static struct proto unix_proto = {
        .name                   = "UNIX",
        .owner                  = THIS_MODULE,
        .obj_size               = sizeof(struct unix_sock),
};
 614
/*
 * AF_UNIX sockets do not interact with hardware, hence they
 * don't trigger interrupts - so it's safe for them to have
 * bh-unsafe locking for their sk_receive_queue.lock. Split off
 * this special lock-class by reinitializing the spinlock key
 * (applied with lockdep_set_class() in unix_create1()):
 */
static struct lock_class_key af_unix_sk_receive_queue_lock_key;
 622
 623 static struct sock *unix_create1(struct net *net, struct socket *sock)
 624 {
 625         struct sock *sk = NULL;
 626         struct unix_sock *u;
 627
 628         atomic_long_inc(&unix_nr_socks);
 629         if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
 630                 goto out;
 631
 632         sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto);
 633         if (!sk)
 634                 goto out;
 635
 636         sock_init_data(sock, sk);
 637         lockdep_set_class(&sk->sk_receive_queue.lock,
 638                                 &af_unix_sk_receive_queue_lock_key);
 639
 640         sk->sk_write_space      = unix_write_space;
 641         sk->sk_max_ack_backlog  = net->unx.sysctl_max_dgram_qlen;
 642         sk->sk_destruct         = unix_sock_destructor;
 643         u         = unix_sk(sk);
 644         u->path.dentry = NULL;
 645         u->path.mnt = NULL;
 646         spin_lock_init(&u->lock);
 647         atomic_long_set(&u->inflight, 0);
 648         INIT_LIST_HEAD(&u->link);
 649         mutex_init(&u->readlock); /* single task reading lock */
 650         init_waitqueue_head(&u->peer_wait);
 651         unix_insert_socket(unix_sockets_unbound(sk), sk);
 652 out:
 653         if (sk == NULL)
 654                 atomic_long_dec(&unix_nr_socks);
 655         else {
 656                 local_bh_disable();
 657                 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 658                 local_bh_enable();
 659         }
 660         return sk;
 661 }
 662
 663 static int unix_create(struct net *net, struct socket *sock, int protocol,
 664                        int kern)
 665 {
 666         if (protocol && protocol != PF_UNIX)
 667                 return -EPROTONOSUPPORT;
 668
 669         sock->state = SS_UNCONNECTED;
 670
 671         switch (sock->type) {
 672         case SOCK_STREAM:
 673                 sock->ops = &unix_stream_ops;
 674                 break;
 675                 /*
 676                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
 677                  *      nothing uses it.
 678                  */
 679         case SOCK_RAW:
 680                 sock->type = SOCK_DGRAM;
 681         case SOCK_DGRAM:
 682                 sock->ops = &unix_dgram_ops;
 683                 break;
 684         case SOCK_SEQPACKET:
 685                 sock->ops = &unix_seqpacket_ops;
 686                 break;
 687         default:
 688                 return -ESOCKTNOSUPPORT;
 689         }
 690
 691         return unix_create1(net, sock) ? 0 : -ENOMEM;
 692 }
 693
 694 static int unix_release(struct socket *sock)
 695 {
 696         struct sock *sk = sock->sk;
 697
 698         if (!sk)
 699                 return 0;
 700
 701         unix_release_sock(sk, 0);
 702         sock->sk = NULL;
 703
 704         return 0;
 705 }
 706
/*
 * Give an unnamed socket an automatically chosen abstract name.
 *
 * The name is a zero byte (the address is kzalloc'ed) followed by five
 * hex digits taken from a file-wide counter.  Candidates are tried until
 * one is not already in use in this socket's namespace and type.
 *
 * Returns 0 on success or when the socket already has an address,
 * -ENOMEM when the address cannot be allocated and -ENOSPC once the retry
 * counter reaches 0xFFFFF.  Runs with the socket's readlock held.
 */
static int unix_autobind(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct net *net = sock_net(sk);
        struct unix_sock *u = unix_sk(sk);
        /* Next candidate suffix; shared by all sockets, wraps at 0xFFFFF. */
        static u32 ordernum = 1;
        struct unix_address *addr;
        int err;
        unsigned int retries = 0;

        mutex_lock(&u->readlock);

        err = 0;
        if (u->addr)
                goto out;

        err = -ENOMEM;
        addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
        if (!addr)
                goto out;

        addr->name->sun_family = AF_UNIX;
        atomic_set(&addr->refcnt, 1);

retry:
        /* Length = family + leading zero byte + the digits just printed. */
        addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
        addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));

        spin_lock(&unix_table_lock);
        ordernum = (ordernum+1)&0xFFFFF;

        if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
                                      addr->hash)) {
                spin_unlock(&unix_table_lock);
                /*
                 * __unix_find_socket_byname() may take long time if many names
                 * are already in use.
                 */
                cond_resched();
                /* Give up if all names seems to be in use. */
                if (retries++ == 0xFFFFF) {
                        err = -ENOSPC;
                        kfree(addr);
                        goto out;
                }
                goto retry;
        }
        /* Fold the type in so the stored hash is the final bucket index. */
        addr->hash ^= sk->sk_type;

        /* Move the socket from its unbound chain to the named one. */
        __unix_remove_socket(sk);
        u->addr = addr;
        __unix_insert_socket(&unix_socket_table[addr->hash], sk);
        spin_unlock(&unix_table_lock);
        err = 0;

out:    mutex_unlock(&u->readlock);
        return err;
}
 765
     /*
      * Look up the socket named by sunname.
      *
      * A name whose sun_path starts with a non-zero byte is resolved as a
      * filesystem path: inode_permission() must grant MAY_WRITE, the inode
      * must be a socket, and the sock is found through the inode.  Any
      * other name is looked up with unix_find_socket_byname().
      *
      * Returns the sock on success; the callers in this file release it
      * with sock_put().  On failure returns NULL and stores a negative
      * errno in *error.  *error is not written on success.
      */
 766 static struct sock *unix_find_other(struct net *net,
 767                                     struct sockaddr_un *sunname, int len,
 768                                     int type, unsigned int hash, int *error)
 769 {
 770         struct sock *u;
 771         struct path path;
 772         int err = 0;
 773
 774         if (sunname->sun_path[0]) {
 775                 struct inode *inode;
 776                 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
 777                 if (err)
 778                         goto fail;
 779                 inode = path.dentry->d_inode;
 780                 err = inode_permission(inode, MAY_WRITE);
 781                 if (err)
 782                         goto put_fail;
 783
 784                 err = -ECONNREFUSED;
 785                 if (!S_ISSOCK(inode->i_mode))
 786                         goto put_fail;
 787                 u = unix_find_socket_byinode(inode);
 788                 if (!u)
 789                         goto put_fail;
 790
                     /* The access time is only touched when the type matches. */
 791                 if (u->sk_type == type)
 792                         touch_atime(&path);
 793
 794                 path_put(&path);
 795
                     /*
                      * path has already been released above, so a type
                      * mismatch jumps to fail rather than put_fail.
                      */
 796                 err = -EPROTOTYPE;
 797                 if (u->sk_type != type) {
 798                         sock_put(u);
 799                         goto fail;
 800                 }
 801         } else {
 802                 err = -ECONNREFUSED;
 803                 u = unix_find_socket_byname(net, sunname, len, type, hash);
 804                 if (u) {
 805                         struct dentry *dentry;
                             /* Touch atime only if the found socket has a dentry. */
 806                         dentry = unix_sk(u)->path.dentry;
 807                         if (dentry)
 808                                 touch_atime(&unix_sk(u)->path);
 809                 } else
 810                         goto fail;
 811         }
 812         return u;
 813
 814 put_fail:
 815         path_put(&path);
 816 fail:
 817         *error = err;
 818         return NULL;
 819 }
 820
     /*
      * Create the filesystem node for sun_path with the given mode.
      *
      * Returns 0 and fills *res with new references to the mount and to
      * the created dentry (mntget()/dget()), or returns a negative errno
      * and leaves *res untouched.
      */
 821 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
 822 {
 823         struct dentry *dentry;
 824         struct path path;
 825         int err = 0;
 826         /*
 827          * Get the parent directory, calculate the hash for last
 828          * component.
 829          */
 830         dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
 831         err = PTR_ERR(dentry);
 832         if (IS_ERR(dentry))
 833                 return err;
 834
 835         /*
 836          * All right, let's create it.
 837          */
 838         err = security_path_mknod(&path, dentry, mode, 0);
 839         if (!err) {
 840                 err = vfs_mknod(path.dentry->d_inode, dentry, mode, 0);
 841                 if (!err) {
 842                         res->mnt = mntget(path.mnt);
 843                         res->dentry = dget(dentry);
 844                 }
 845         }
             /* Reached on success and on failure once kern_path_create() worked. */
 846         done_path_create(&path, dentry);
 847         return err;
 848 }
 849
     /*
      * Bind sock to the address in uaddr.
      *
      * Three cases are handled:
      *  - addr_len == sizeof(short): the name is chosen by unix_autobind();
      *  - sun_path[0] != 0: a filesystem node is created with unix_mknod()
      *    and the hash bucket is derived from the inode number;
      *  - otherwise: the name must not already be known to
      *    __unix_find_socket_byname().
      *
      * Returns 0 on success or a negative errno: -EINVAL for a family
      * other than AF_UNIX or a socket that already has an address,
      * -ENOMEM, -EADDRINUSE, or whatever unix_mkname(), unix_mknod() or
      * unix_autobind() report.
      */
 850 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 851 {
 852         struct sock *sk = sock->sk;
 853         struct net *net = sock_net(sk);
 854         struct unix_sock *u = unix_sk(sk);
 855         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
 856         char *sun_path = sunaddr->sun_path;
 857         int err;
 858         unsigned int hash;
 859         struct unix_address *addr;
 860         struct hlist_head *list;
 861
 862         err = -EINVAL;
 863         if (sunaddr->sun_family != AF_UNIX)
 864                 goto out;
 865
 866         if (addr_len == sizeof(short)) {
 867                 err = unix_autobind(sock);
 868                 goto out;
 869         }
 870
             /* A non-negative result from unix_mkname() replaces addr_len. */
 871         err = unix_mkname(sunaddr, addr_len, &hash);
 872         if (err < 0)
 873                 goto out;
 874         addr_len = err;
 875
 876         mutex_lock(&u->readlock);
 877
             /* The socket already has an address. */
 878         err = -EINVAL;
 879         if (u->addr)
 880                 goto out_up;
 881
 882         err = -ENOMEM;
 883         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
 884         if (!addr)
 885                 goto out_up;
 886
 887         memcpy(addr->name, sunaddr, addr_len);
 888         addr->len = addr_len;
 889         addr->hash = hash ^ sk->sk_type;
 890         atomic_set(&addr->refcnt, 1);
 891
 892         if (sun_path[0]) {
 893                 struct path path;
 894                 umode_t mode = S_IFSOCK |
 895                        (SOCK_INODE(sock)->i_mode & ~current_umask());
 896                 err = unix_mknod(sun_path, mode, &path);
 897                 if (err) {
                             /* An existing node is reported as "address in use". */
 898                         if (err == -EEXIST)
 899                                 err = -EADDRINUSE;
 900                         unix_release_addr(addr);
 901                         goto out_up;
 902                 }
                     /*
                      * NOTE(review): UNIX_HASH_SIZE appears to serve as a
                      * marker value for filesystem-bound names - confirm.
                      * The bucket actually used comes from the inode number.
                      */
 903                 addr->hash = UNIX_HASH_SIZE;
 904                 hash = path.dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1);
 905                 spin_lock(&unix_table_lock);
 906                 u->path = path;
 907                 list = &unix_socket_table[hash];
 908         } else {
 909                 spin_lock(&unix_table_lock);
 910                 err = -EADDRINUSE;
 911                 if (__unix_find_socket_byname(net, sunaddr, addr_len,
 912                                               sk->sk_type, hash)) {
 913                         unix_release_addr(addr);
 914                         goto out_unlock;
 915                 }
 916
 917                 list = &unix_socket_table[addr->hash];
 918         }
 919
             /* Move the socket to the bucket that matches its new address. */
 920         err = 0;
 921         __unix_remove_socket(sk);
 922         u->addr = addr;
 923         __unix_insert_socket(list, sk);
 924
 925 out_unlock:
 926         spin_unlock(&unix_table_lock);
 927 out_up:
 928         mutex_unlock(&u->readlock);
 929 out:
 930         return err;
 931 }
 932
     /*
      * Take the state lock of sk1 and, if it is a different non-NULL
      * socket, of sk2 as well.  When two locks are taken, the socket with
      * the lower address is locked first and the other one is taken with
      * unix_state_lock_nested().
      */
 933 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
 934 {
             /* Same socket twice, or no second socket: a single lock is enough. */
 935         if (unlikely(sk1 == sk2) || !sk2) {
 936                 unix_state_lock(sk1);
 937                 return;
 938         }
 939         if (sk1 < sk2) {
 940                 unix_state_lock(sk1);
 941                 unix_state_lock_nested(sk2);
 942         } else {
 943                 unix_state_lock(sk2);
 944                 unix_state_lock_nested(sk1);
 945         }
 946 }
 947
     /*
      * Release the state locks on sk1 and sk2.  Uses the same test as
      * unix_state_double_lock(): only sk1 is unlocked when sk2 is NULL or
      * identical to sk1.
      */
 948 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
 949 {
 950         if (unlikely(sk1 == sk2) || !sk2) {
 951                 unix_state_unlock(sk1);
 952                 return;
 953         }
 954         unix_state_unlock(sk1);
 955         unix_state_unlock(sk2);
 956 }
 957
     /*
      * Set, replace or clear the peer of sock.
      *
      * For any family other than AF_UNSPEC the peer is looked up with
      * unix_find_other() and has to pass unix_may_send() and
      * security_unix_may_send().  AF_UNSPEC sets the peer to NULL.  A
      * previous peer is released with sock_put(), and
      * unix_dgram_disconnected() is called when it differs from the new
      * one.
      *
      * flags is not used.  Returns 0 on success or a negative errno.
      */
 958 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
 959                               int alen, int flags)
 960 {
 961         struct sock *sk = sock->sk;
 962         struct net *net = sock_net(sk);
 963         struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
 964         struct sock *other;
 965         unsigned int hash;
 966         int err;
 967
 968         if (addr->sa_family != AF_UNSPEC) {
 969                 err = unix_mkname(sunaddr, alen, &hash);
 970                 if (err < 0)
 971                         goto out;
 972                 alen = err;
 973
                     /* With SOCK_PASSCRED set, a socket without address is autobound. */
 974                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
 975                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
 976                         goto out;
 977
 978 restart:
 979                 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
 980                 if (!other)
 981                         goto out;
 982
 983                 unix_state_double_lock(sk, other);
 984
 985                 /* Apparently VFS overslept socket death. Retry. */
 986                 if (sock_flag(other, SOCK_DEAD)) {
 987                         unix_state_double_unlock(sk, other);
 988                         sock_put(other);
 989                         goto restart;
 990                 }
 991
 992                 err = -EPERM;
 993                 if (!unix_may_send(sk, other))
 994                         goto out_unlock;
 995
 996                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
 997                 if (err)
 998                         goto out_unlock;
 999
1000         } else {
1001                 /*
1002                  *      1003.1g breaking connected state with AF_UNSPEC
1003                  */
1004                 other = NULL;
                     /* other is NULL, so this locks sk only. */
1005                 unix_state_double_lock(sk, other);
1006         }
1007
1008         /*
1009          * If it was connected, reconnect.
1010          */
1011         if (unix_peer(sk)) {
1012                 struct sock *old_peer = unix_peer(sk);
1013                 unix_peer(sk) = other;
1014                 unix_state_double_unlock(sk, other);
1015
1016                 if (other != old_peer)
1017                         unix_dgram_disconnected(sk, old_peer);
1018                 sock_put(old_peer);
1019         } else {
                     /* other is stored without sock_put(): the peer pointer keeps it. */
1020                 unix_peer(sk) = other;
1021                 unix_state_double_unlock(sk, other);
1022         }
1023         return 0;
1024
1025 out_unlock:
1026         unix_state_double_unlock(sk, other);
1027         sock_put(other);
1028 out:
1029         return err;
1030 }
1031
     /*
      * Sleep on other's peer_wait queue for at most timeo.
      *
      * The state lock of other is released here; the callers in this
      * file take it before calling.  The task only sleeps if other is not
      * SOCK_DEAD, has not shut down receiving and unix_recvq_full()
      * reports a full queue.
      *
      * Returns the value of schedule_timeout(), or timeo unchanged when
      * no sleep took place.
      */
1032 static long unix_wait_for_peer(struct sock *other, long timeo)
1033 {
1034         struct unix_sock *u = unix_sk(other);
1035         int sched;
1036         DEFINE_WAIT(wait);
1037
             /* Queue on peer_wait first, then sample the conditions. */
1038         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1039
1040         sched = !sock_flag(other, SOCK_DEAD) &&
1041                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1042                 unix_recvq_full(other);
1043
1044         unix_state_unlock(other);
1045
1046         if (sched)
1047                 timeo = schedule_timeout(timeo);
1048
1049         finish_wait(&u->peer_wait, &wait);
1050         return timeo;
1051 }
1052
     /*
      * Connect sock to the listening socket named by uaddr.
      *
      * A new sock (newsk) and a one-byte skb are allocated before any
      * lock is taken.  The listener is then looked up and its state lock
      * taken; when its receive queue is full the call either fails with
      * -EAGAIN (no timeout) or waits in unix_wait_for_peer() and starts
      * over.  With both state locks held, newsk and sk are made peers of
      * each other and the skb is queued on the listener's receive queue.
      *
      * Returns 0 on success.  Errors set in this function are -ENOMEM,
      * -ECONNREFUSED, -EAGAIN, -EISCONN, -EINVAL and the result of
      * sock_intr_errno(); errors from unix_mkname(), unix_autobind(),
      * unix_find_other() and security_unix_stream_connect() are passed
      * through.  On failure the skb and newsk are released.
      */
1053 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1054                                int addr_len, int flags)
1055 {
1056         struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1057         struct sock *sk = sock->sk;
1058         struct net *net = sock_net(sk);
1059         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1060         struct sock *newsk = NULL;
1061         struct sock *other = NULL;
1062         struct sk_buff *skb = NULL;
1063         unsigned int hash;
1064         int st;
1065         int err;
1066         long timeo;
1067
1068         err = unix_mkname(sunaddr, addr_len, &hash);
1069         if (err < 0)
1070                 goto out;
1071         addr_len = err;
1072
1073         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1074             (err = unix_autobind(sock)) != 0)
1075                 goto out;
1076
1077         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1078
1079         /* First of all allocate resources.
1080            If we will make it after state is locked,
1081            we will have to recheck all again in any case.
1082          */
1083
1084         err = -ENOMEM;
1085
1086         /* create new sock for complete connection */
1087         newsk = unix_create1(sock_net(sk), NULL);
1088         if (newsk == NULL)
1089                 goto out;
1090
1091         /* Allocate skb for sending to listening sock */
1092         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1093         if (skb == NULL)
1094                 goto out;
1095
1096 restart:
1097         /*  Find listening sock. */
1098         other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1099         if (!other)
1100                 goto out;
1101
1102         /* Latch state of peer */
1103         unix_state_lock(other);
1104
1105         /* Apparently VFS overslept socket death. Retry. */
1106         if (sock_flag(other, SOCK_DEAD)) {
1107                 unix_state_unlock(other);
1108                 sock_put(other);
1109                 goto restart;
1110         }
1111
1112         err = -ECONNREFUSED;
1113         if (other->sk_state != TCP_LISTEN)
1114                 goto out_unlock;
1115         if (other->sk_shutdown & RCV_SHUTDOWN)
1116                 goto out_unlock;
1117
1118         if (unix_recvq_full(other)) {
1119                 err = -EAGAIN;
1120                 if (!timeo)
1121                         goto out_unlock;
1122
                     /* unix_wait_for_peer() releases the state lock of other. */
1123                 timeo = unix_wait_for_peer(other, timeo);
1124
1125                 err = sock_intr_errno(timeo);
1126                 if (signal_pending(current))
1127                         goto out;
1128                 sock_put(other);
1129                 goto restart;
1130         }
1131
1132         /* Latch our state.
1133
1134            It is tricky place. We need to grab our state lock and cannot
1135            drop lock on peer. It is dangerous because deadlock is
1136            possible. Connect to self case and simultaneous
1137            attempt to connect are eliminated by checking socket
1138            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1139            check this before attempt to grab lock.
1140
1141            Well, and we have to recheck the state after socket locked.
1142          */
1143         st = sk->sk_state;
1144
1145         switch (st) {
1146         case TCP_CLOSE:
1147                 /* This is ok... continue with connect */
1148                 break;
1149         case TCP_ESTABLISHED:
1150                 /* Socket is already connected */
1151                 err = -EISCONN;
1152                 goto out_unlock;
1153         default:
1154                 err = -EINVAL;
1155                 goto out_unlock;
1156         }
1157
1158         unix_state_lock_nested(sk);
1159
             /* The state changed between the unlocked read and taking the lock. */
1160         if (sk->sk_state != st) {
1161                 unix_state_unlock(sk);
1162                 unix_state_unlock(other);
1163                 sock_put(other);
1164                 goto restart;
1165         }
1166
1167         err = security_unix_stream_connect(sk, other, newsk);
1168         if (err) {
1169                 unix_state_unlock(sk);
1170                 goto out_unlock;
1171         }
1172
1173         /* The way is open! Quickly set all the necessary fields... */
1174
1175         sock_hold(sk);
1176         unix_peer(newsk)        = sk;
1177         newsk->sk_state         = TCP_ESTABLISHED;
1178         newsk->sk_type          = sk->sk_type;
1179         init_peercred(newsk);
1180         newu = unix_sk(newsk);
1181         RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1182         otheru = unix_sk(other);
1183
1184         /* copy address information from listening to new sock */
1185         if (otheru->addr) {
1186                 atomic_inc(&otheru->addr->refcnt);
1187                 newu->addr = otheru->addr;
1188         }
1189         if (otheru->path.dentry) {
1190                 path_get(&otheru->path);
1191                 newu->path = otheru->path;
1192         }
1193
1194         /* Set credentials */
1195         copy_peercred(sk, other);
1196
1197         sock->state     = SS_CONNECTED;
1198         sk->sk_state    = TCP_ESTABLISHED;
1199         sock_hold(newsk);
1200
1201         smp_mb__after_atomic_inc();     /* sock_hold() does an atomic_inc() */
1202         unix_peer(sk)   = newsk;
1203
1204         unix_state_unlock(sk);
1205
1206         /* take ten and send info to listening sock */
1207         spin_lock(&other->sk_receive_queue.lock);
1208         __skb_queue_tail(&other->sk_receive_queue, skb);
1209         spin_unlock(&other->sk_receive_queue.lock);
1210         unix_state_unlock(other);
1211         other->sk_data_ready(other, 0);
1212         sock_put(other);
1213         return 0;
1214
1215 out_unlock:
1216         if (other)
1217                 unix_state_unlock(other);
1218
1219 out:
             /* skb, newsk and other may each still be NULL at this point. */
1220         kfree_skb(skb);
1221         if (newsk)
1222                 unix_release_sock(newsk, 0);
1223         if (other)
1224                 sock_put(other);
1225         return err;
1226 }
1227
     /*
      * Make the socks behind socka and sockb peers of each other.
      *
      * Each sock gets an extra reference and becomes the other's
      * unix_peer(); init_peercred() is called on both.  Unless the type
      * is SOCK_DGRAM, both socks are put into TCP_ESTABLISHED and both
      * sockets into SS_CONNECTED.  Always returns 0.
      */
1228 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1229 {
             /* Note: skb is a struct sock here, not a socket buffer. */
1230         struct sock *ska = socka->sk, *skb = sockb->sk;
1231
1232         /* Join our sockets back to back */
1233         sock_hold(ska);
1234         sock_hold(skb);
1235         unix_peer(ska) = skb;
1236         unix_peer(skb) = ska;
1237         init_peercred(ska);
1238         init_peercred(skb);
1239
1240         if (ska->sk_type != SOCK_DGRAM) {
1241                 ska->sk_state = TCP_ESTABLISHED;
1242                 skb->sk_state = TCP_ESTABLISHED;
1243                 socka->state  = SS_CONNECTED;
1244                 sockb->state  = SS_CONNECTED;
1245         }
1246         return 0;
1247 }
1248
     /*
      * Set SOCK_PASSCRED and SOCK_PASSSEC on new if they are set on old.
      * Flags that are clear on old are left unchanged on new.
      */
1249 static void unix_sock_inherit_flags(const struct socket *old,
1250                                     struct socket *new)
1251 {
1252         if (test_bit(SOCK_PASSCRED, &old->flags))
1253                 set_bit(SOCK_PASSCRED, &new->flags);
1254         if (test_bit(SOCK_PASSSEC, &old->flags))
1255                 set_bit(SOCK_PASSSEC, &new->flags);
1256 }
1257
     /*
      * Accept a pending connection on the listening socket sock and
      * attach it to newsock.
      *
      * One skb is taken from the listener's receive queue; the sock it
      * refers to (skb->sk) is grafted onto newsock, which also inherits
      * the flags copied by unix_sock_inherit_flags().
      *
      * Returns 0 on success, -EOPNOTSUPP for types other than SOCK_STREAM
      * and SOCK_SEQPACKET, -EINVAL if sock is not in TCP_LISTEN or no skb
      * was returned without an error, or the error reported by
      * skb_recv_datagram().
      */
1258 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1259 {
1260         struct sock *sk = sock->sk;
1261         struct sock *tsk;
1262         struct sk_buff *skb;
1263         int err;
1264
1265         err = -EOPNOTSUPP;
1266         if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1267                 goto out;
1268
1269         err = -EINVAL;
1270         if (sk->sk_state != TCP_LISTEN)
1271                 goto out;
1272
1273         /* If socket state is TCP_LISTEN it cannot change (for now...),
1274          * so that no locks are necessary.
1275          */
1276
1277         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1278         if (!skb) {
1279                 /* This means receive shutdown. */
1280                 if (err == 0)
1281                         err = -EINVAL;
1282                 goto out;
1283         }
1284
             /*
              * NOTE(review): skb->sk is presumably the newsk prepared by
              * unix_stream_connect() - confirm.
              */
1285         tsk = skb->sk;
1286         skb_free_datagram(sk, skb);
             /* unix_wait_for_peer() sleeps on this queue. */
1287         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1288
1289         /* attach accepted sock to socket */
1290         unix_state_lock(tsk);
1291         newsock->state = SS_CONNECTED;
1292         unix_sock_inherit_flags(sock, newsock);
1293         sock_graft(tsk, newsock);
1294         unix_state_unlock(tsk);
1295         return 0;
1296
1297 out:
1298         return err;
1299 }
1300
1301
     /*
      * Copy the address of sock, or of its peer when peer is non-zero,
      * into uaddr and store its length in *uaddr_len.
      *
      * A sock without address yields family AF_UNIX, an empty sun_path
      * and a length of sizeof(short).
      *
      * Returns 0 on success or -ENOTCONN when the peer was requested but
      * unix_peer_get() returned NULL.
      */
1302 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1303 {
1304         struct sock *sk = sock->sk;
1305         struct unix_sock *u;
1306         DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1307         int err = 0;
1308
1309         if (peer) {
1310                 sk = unix_peer_get(sk);
1311
1312                 err = -ENOTCONN;
1313                 if (!sk)
1314                         goto out;
1315                 err = 0;
1316         } else {
                     /* Hold sk so that both branches can end in the same sock_put(). */
1317                 sock_hold(sk);
1318         }
1319
1320         u = unix_sk(sk);
1321         unix_state_lock(sk);
1322         if (!u->addr) {
1323                 sunaddr->sun_family = AF_UNIX;
1324                 sunaddr->sun_path[0] = 0;
1325                 *uaddr_len = sizeof(short);
1326         } else {
1327                 struct unix_address *addr = u->addr;
1328
1329                 *uaddr_len = addr->len;
1330                 memcpy(sunaddr, addr->name, *uaddr_len);
1331         }
1332         unix_state_unlock(sk);
1333         sock_put(sk);
1334 out:
1335         return err;
1336 }
1337
     /*
      * Move the file list attached to skb over to scm and call
      * unix_notinflight() on every file in it, last to first.
      *
      * UNIXCB(skb).fp must be non-NULL: scm->fp->count is dereferenced
      * without a check.  unix_destruct_scm() tests this before calling.
      */
1338 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1339 {
1340         int i;
1341
1342         scm->fp = UNIXCB(skb).fp;
1343         UNIXCB(skb).fp = NULL;
1344
1345         for (i = scm->fp->count-1; i >= 0; i--)
1346                 unix_notinflight(scm->fp->fp[i]);
1347 }
1348
     /*
      * skb destructor, installed by unix_scm_to_skb().
      *
      * Builds a temporary scm_cookie from the pid and, if present, the
      * file list stored in the skb's control block, hands it to
      * scm_destroy() and then calls sock_wfree() on the skb.
      */
1349 static void unix_destruct_scm(struct sk_buff *skb)
1350 {
1351         struct scm_cookie scm;
1352         memset(&scm, 0, sizeof(scm));
1353         scm.pid  = UNIXCB(skb).pid;
1354         if (UNIXCB(skb).fp)
1355                 unix_detach_fds(&scm, skb);
1356
1357         /* Alas, it calls VFS */
1358         /* So fscking what? fput() had been SMP-safe since the last Summer */
1359         scm_destroy(&scm);
1360         sock_wfree(skb);
1361 }
1362
     /* Highest recursion_level that unix_attach_fds() accepts. */
1363 #define MAX_RECURSION_LEVEL 4
1364
     /*
      * Attach a duplicate of scm's file list to skb.
      *
      * Every file for which unix_get_socket() returns a sock is counted,
      * and the highest recursion_level among those socks is tracked.
      * unix_inflight() is called on all files of the list, but only if at
      * least one such sock was found.
      *
      * Returns the highest recursion_level seen (0 if there was none),
      * -ETOOMANYREFS if it exceeds MAX_RECURSION_LEVEL, or -ENOMEM if
      * scm_fp_dup() fails.  scm->fp must be non-NULL; unix_scm_to_skb()
      * checks this before calling.
      */
1365 static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1366 {
1367         int i;
1368         unsigned char max_level = 0;
1369         int unix_sock_count = 0;
1370
1371         for (i = scm->fp->count - 1; i >= 0; i--) {
1372                 struct sock *sk = unix_get_socket(scm->fp->fp[i]);
1373
1374                 if (sk) {
1375                         unix_sock_count++;
1376                         max_level = max(max_level,
1377                                         unix_sk(sk)->recursion_level);
1378                 }
1379         }
1380         if (unlikely(max_level > MAX_RECURSION_LEVEL))
1381                 return -ETOOMANYREFS;
1382
1383         /*
1384          * Need to duplicate file references for the sake of garbage
1385          * collection.  Otherwise a socket in the fps might become a
1386          * candidate for GC while the skb is not yet queued.
1387          */
1388         UNIXCB(skb).fp = scm_fp_dup(scm->fp);
1389         if (!UNIXCB(skb).fp)
1390                 return -ENOMEM;
1391
1392         if (unix_sock_count) {
1393                 for (i = scm->fp->count - 1; i >= 0; i--)
1394                         unix_inflight(scm->fp->fp[i]);
1395         }
1396         return max_level;
1397 }
1398
     /*
      * Copy pid, uid and gid from scm into skb's control block and, when
      * send_fds is true and scm carries files, attach them with
      * unix_attach_fds().
      *
      * Returns 0 if no files were attached, otherwise the result of
      * unix_attach_fds() (a level >= 0 or a negative errno).  The skb
      * destructor is set to unix_destruct_scm() in every case, including
      * when an error is returned.
      */
1399 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1400 {
1401         int err = 0;
1402
1403         UNIXCB(skb).pid  = get_pid(scm->pid);
1404         UNIXCB(skb).uid = scm->creds.uid;
1405         UNIXCB(skb).gid = scm->creds.gid;
1406         UNIXCB(skb).fp = NULL;
1407         if (scm->fp && send_fds)
1408                 err = unix_attach_fds(scm, skb);
1409
1410         skb->destructor = unix_destruct_scm;
1411         return err;
1412 }
1413
1414 /*
1415  * Some apps rely on write() giving SCM_CREDENTIALS
1416  * We include credentials if source or destination socket
1417  * asserted SOCK_PASSCRED.
      *
      * Credentials are also added when other has no sk_socket.  Nothing
      * is done if the skb already carries a pid.
1418  */
1419 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1420                             const struct sock *other)
1421 {
             /* A pid is already attached to this skb. */
1422         if (UNIXCB(skb).pid)
1423                 return;
1424         if (test_bit(SOCK_PASSCRED, &sock->flags) ||
1425             !other->sk_socket ||
1426             test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
                     /* Use the identity of the current task. */
1427                 UNIXCB(skb).pid  = get_pid(task_tgid(current));
1428                 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
1429         }
1430 }
1431
1432 /*
1433  *      Send AF_UNIX data.
      *
      *      Sends msg as one skb.  The receiver is taken from
      *      msg->msg_name when msg->msg_namelen is non-zero, otherwise
      *      the current peer of the socket is used.
      *
      *      Returns len on success, and also when sk_filter() rejects
      *      the skb for the receiver.  Otherwise returns a negative
      *      errno; the values set in this function are -EOPNOTSUPP
      *      (MSG_OOB), -ENOTCONN, -EMSGSIZE, -ECONNRESET, -EPERM,
      *      -ECONNREFUSED, -EPIPE, -EAGAIN and the result of
      *      sock_intr_errno().
1434  */
1435
1436 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1437                               struct msghdr *msg, size_t len)
1438 {
1439         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1440         struct sock *sk = sock->sk;
1441         struct net *net = sock_net(sk);
1442         struct unix_sock *u = unix_sk(sk);
1443         struct sockaddr_un *sunaddr = msg->msg_name;
1444         struct sock *other = NULL;
1445         int namelen = 0; /* fake GCC */
1446         int err;
1447         unsigned int hash;
1448         struct sk_buff *skb;
1449         long timeo;
1450         struct scm_cookie tmp_scm;
1451         int max_level;
1452         int data_len = 0;
1453
1454         if (NULL == siocb->scm)
1455                 siocb->scm = &tmp_scm;
1456         wait_for_unix_gc();
1457         err = scm_send(sock, msg, siocb->scm, false);
1458         if (err < 0)
1459                 return err;
1460
1461         err = -EOPNOTSUPP;
1462         if (msg->msg_flags&MSG_OOB)
1463                 goto out;
1464
1465         if (msg->msg_namelen) {
1466                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1467                 if (err < 0)
1468                         goto out;
1469                 namelen = err;
1470         } else {
                     /* No address given: the socket must have a peer. */
1471                 sunaddr = NULL;
1472                 err = -ENOTCONN;
1473                 other = unix_peer_get(sk);
1474                 if (!other)
1475                         goto out;
1476         }
1477
1478         if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1479             && (err = unix_autobind(sock)) != 0)
1480                 goto out;
1481
1482         err = -EMSGSIZE;
1483         if (len > sk->sk_sndbuf - 32)
1484                 goto out;
1485
             /* Anything beyond SKB_MAX_ALLOC is requested as paged data. */
1486         if (len > SKB_MAX_ALLOC)
1487                 data_len = min_t(size_t,
1488                                  len - SKB_MAX_ALLOC,
1489                                  MAX_SKB_FRAGS * PAGE_SIZE);
1490
1491         skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1492                                    msg->msg_flags & MSG_DONTWAIT, &err,
1493                                    PAGE_ALLOC_COSTLY_ORDER);
1494         if (skb == NULL)
1495                 goto out;
1496
1497         err = unix_scm_to_skb(siocb->scm, skb, true);
1498         if (err < 0)
1499                 goto out_free;
             /* One more than the level reported by unix_scm_to_skb(). */
1500         max_level = err + 1;
1501         unix_get_secdata(siocb->scm, skb);
1502
1503         skb_put(skb, len - data_len);
1504         skb->data_len = data_len;
1505         skb->len = len;
1506         err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov, 0, len);
1507         if (err)
1508                 goto out_free;
1509
1510         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1511
1512 restart:
1513         if (!other) {
1514                 err = -ECONNRESET;
1515                 if (sunaddr == NULL)
1516                         goto out_free;
1517
1518                 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1519                                         hash, &err);
1520                 if (other == NULL)
1521                         goto out_free;
1522         }
1523
1524         if (sk_filter(other, skb) < 0) {
1525                 /* Toss the packet but do not return any error to the sender */
1526                 err = len;
1527                 goto out_free;
1528         }
1529
1530         unix_state_lock(other);
1531         err = -EPERM;
1532         if (!unix_may_send(sk, other))
1533                 goto out_unlock;
1534
1535         if (sock_flag(other, SOCK_DEAD)) {
1536                 /*
1537                  *      Check with 1003.1g - what should
1538                  *      datagram error
1539                  */
1540                 unix_state_unlock(other);
1541                 sock_put(other);
1542
1543                 err = 0;
1544                 unix_state_lock(sk);
1545                 if (unix_peer(sk) == other) {
                             /* The dead sock was our peer: drop it and fail. */
1546                         unix_peer(sk) = NULL;
1547                         unix_state_unlock(sk);
1548
1549                         unix_dgram_disconnected(sk, other);
1550                         sock_put(other);
1551                         err = -ECONNREFUSED;
1552                 } else {
1553                         unix_state_unlock(sk);
1554                 }
1555
                     /* If it was not our peer, look the address up again. */
1556                 other = NULL;
1557                 if (err)
1558                         goto out_free;
1559                 goto restart;
1560         }
1561
1562         err = -EPIPE;
1563         if (other->sk_shutdown & RCV_SHUTDOWN)
1564                 goto out_unlock;
1565
1566         if (sk->sk_type != SOCK_SEQPACKET) {
1567                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1568                 if (err)
1569                         goto out_unlock;
1570         }
1571
1572         if (unix_peer(other) != sk && unix_recvq_full(other)) {
1573                 if (!timeo) {
1574                         err = -EAGAIN;
1575                         goto out_unlock;
1576                 }
1577
                     /* unix_wait_for_peer() releases the state lock of other. */
1578                 timeo = unix_wait_for_peer(other, timeo);
1579
1580                 err = sock_intr_errno(timeo);
1581                 if (signal_pending(current))
1582                         goto out_free;
1583
1584                 goto restart;
1585         }
1586
1587         if (sock_flag(other, SOCK_RCVTSTAMP))
1588                 __net_timestamp(skb);
1589         maybe_add_creds(skb, sock, other);
1590         skb_queue_tail(&other->sk_receive_queue, skb);
1591         if (max_level > unix_sk(other)->recursion_level)
1592                 unix_sk(other)->recursion_level = max_level;
1593         unix_state_unlock(other);
1594         other->sk_data_ready(other, len);
1595         sock_put(other);
1596         scm_destroy(siocb->scm);
1597         return len;
1598
1599 out_unlock:
1600         unix_state_unlock(other);
1601 out_free:
1602         kfree_skb(skb);
1603 out:
1604         if (other)
1605                 sock_put(other);
1606         scm_destroy(siocb->scm);
1607         return err;
1608 }
1609
1610 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1611  * bytes, and a minimum of a full page.
1612  */
1613 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
1614
     /*
      * Send msg to the peer of a connected socket.
      *
      * The data is split into skbs; each one is limited to half of
      * sk_sndbuf minus 64 bytes and to SKB_MAX_HEAD(0) plus
      * UNIX_SKB_FRAGS_SZ.  File descriptors carried in the control
      * message are attached to the first skb only.
      *
      * Returns the number of bytes queued if that is non-zero, otherwise
      * a negative errno.  The values set in this function are
      * -EOPNOTSUPP, -EISCONN, -ENOTCONN and -EPIPE.  SIGPIPE is sent on
      * the -EPIPE path when nothing was queued and MSG_NOSIGNAL is not
      * set.
      */
1615 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1616                                struct msghdr *msg, size_t len)
1617 {
1618         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1619         struct sock *sk = sock->sk;
1620         struct sock *other = NULL;
1621         int err, size;
1622         struct sk_buff *skb;
1623         int sent = 0;
1624         struct scm_cookie tmp_scm;
1625         bool fds_sent = false;
1626         int max_level;
1627         int data_len;
1628
1629         if (NULL == siocb->scm)
1630                 siocb->scm = &tmp_scm;
1631         wait_for_unix_gc();
1632         err = scm_send(sock, msg, siocb->scm, false);
1633         if (err < 0)
1634                 return err;
1635
1636         err = -EOPNOTSUPP;
1637         if (msg->msg_flags&MSG_OOB)
1638                 goto out_err;
1639
             /* A destination address is never accepted here. */
1640         if (msg->msg_namelen) {
1641                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1642                 goto out_err;
1643         } else {
1644                 err = -ENOTCONN;
                     /*
                      * unix_peer() is read directly: no reference on other
                      * is taken or dropped in this function.
                      */
1645                 other = unix_peer(sk);
1646                 if (!other)
1647                         goto out_err;
1648         }
1649
1650         if (sk->sk_shutdown & SEND_SHUTDOWN)
1651                 goto pipe_err;
1652
1653         while (sent < len) {
1654                 size = len - sent;
1655
1656                 /* Keep two messages in the pipe so it schedules better */
1657                 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1658
1659                 /* allow fallback to order-0 allocations */
1660                 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
1661
                     /* Whatever exceeds SKB_MAX_HEAD(0) is requested as paged data. */
1662                 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1663
1664                 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1665                                            msg->msg_flags & MSG_DONTWAIT, &err,
1666                                            get_order(UNIX_SKB_FRAGS_SZ));
1667                 if (!skb)
1668                         goto out_err;
1669
1670                 /* Only send the fds in the first buffer */
1671                 err = unix_scm_to_skb(siocb->scm, skb, !fds_sent);
1672                 if (err < 0) {
1673                         kfree_skb(skb);
1674                         goto out_err;
1675                 }
1676                 max_level = err + 1;
1677                 fds_sent = true;
1678
1679                 skb_put(skb, size - data_len);
1680                 skb->data_len = data_len;
1681                 skb->len = size;
1682                 err = skb_copy_datagram_from_iovec(skb, 0, msg->msg_iov,
1683                                                    sent, size);
1684                 if (err) {
1685                         kfree_skb(skb);
1686                         goto out_err;
1687                 }
1688
1689                 unix_state_lock(other);
1690
1691                 if (sock_flag(other, SOCK_DEAD) ||
1692                     (other->sk_shutdown & RCV_SHUTDOWN))
1693                         goto pipe_err_free;
1694
1695                 maybe_add_creds(skb, sock, other);
1696                 skb_queue_tail(&other->sk_receive_queue, skb);
1697                 if (max_level > unix_sk(other)->recursion_level)
1698                         unix_sk(other)->recursion_level = max_level;
1699                 unix_state_unlock(other);
1700                 other->sk_data_ready(other, size);
1701                 sent += size;
1702         }
1703
1704         scm_destroy(siocb->scm);
1705         siocb->scm = NULL;
1706
1707         return sent;
1708
1709 pipe_err_free:
1710         unix_state_unlock(other);
1711         kfree_skb(skb);
1712 pipe_err:
1713         if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1714                 send_sig(SIGPIPE, current, 0);
1715         err = -EPIPE;
1716 out_err:
1717         scm_destroy(siocb->scm);
1718         siocb->scm = NULL;
             /* GNU "?:" - yields sent when it is non-zero, err otherwise. */
1719         return sent ? : err;
1720 }
1721
     /*
      * Send wrapper around unix_dgram_sendmsg().
      *
      * Returns a pending socket error from sock_error() if there is one
      * and -ENOTCONN unless the sock is in TCP_ESTABLISHED.  Any
      * msg_namelen passed by the caller is reset to 0 before the call.
      */
1722 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1723                                   struct msghdr *msg, size_t len)
1724 {
1725         int err;
1726         struct sock *sk = sock->sk;
1727
1728         err = sock_error(sk);
1729         if (err)
1730                 return err;
1731
1732         if (sk->sk_state != TCP_ESTABLISHED)
1733                 return -ENOTCONN;
1734
             /* With msg_namelen 0, unix_dgram_sendmsg() sends to the peer. */
1735         if (msg->msg_namelen)
1736                 msg->msg_namelen = 0;
1737
1738         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1739 }
1740
1741 static int unix_seqpacket_recvmsg(struct kiocb *iocb, struct socket *sock,
1742                               struct msghdr *msg, size_t size,
1743                               int flags)
1744 {
1745         struct sock *sk = sock->sk;
1746
1747         if (sk->sk_state != TCP_ESTABLISHED)
1748                 return -ENOTCONN;
1749
1750         return unix_dgram_recvmsg(iocb, sock, msg, size, flags);
1751 }
1752
1753 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1754 {
1755         struct unix_sock *u = unix_sk(sk);
1756
1757         msg->msg_namelen = 0;
1758         if (u->addr) {
1759                 msg->msg_namelen = u->addr->len;
1760                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1761         }
1762 }
1763
/*
 * Receive one datagram from sk->sk_receive_queue.
 *
 * The whole operation runs under u->readlock.  One skb is fetched with
 * __skb_recv_datagram(), starting at the current peek offset, and up to
 * @size bytes of it are copied into msg->msg_iov.  MSG_TRUNC is set in
 * msg->msg_flags when the caller's buffer is smaller than what remains of
 * the datagram.  The sender's credentials, security data and (unless
 * peeking) passed file descriptors are moved into the scm cookie and
 * delivered through scm_recv().
 *
 * Returns the number of bytes copied, or the remaining datagram length when
 * MSG_TRUNC was passed in @flags.  On failure:
 *   -EOPNOTSUPP             MSG_OOB was requested
 *   sock_intr_errno() value waiting for u->readlock was interrupted
 *   err from __skb_recv_datagram() when no skb is available (turned into 0
 *       for a SOCK_SEQPACKET socket with RCV_SHUTDOWN set and err == -EAGAIN)
 *   err from skb_copy_datagram_iovec() when the copy fails
 */
1764 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1765                               struct msghdr *msg, size_t size,
1766                               int flags)
1767 {
1768         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1769         struct scm_cookie tmp_scm;
1770         struct sock *sk = sock->sk;
1771         struct unix_sock *u = unix_sk(sk);
1772         int noblock = flags & MSG_DONTWAIT;
1773         struct sk_buff *skb;
1774         int err;
1775         int peeked, skip;
1776
1777         err = -EOPNOTSUPP;
1778         if (flags&MSG_OOB)
1779                 goto out;
1780
1781         msg->msg_namelen = 0;
1782
1783         err = mutex_lock_interruptible(&u->readlock);
1784         if (err) {
1785                 err = sock_intr_errno(sock_rcvtimeo(sk, noblock));
1786                 goto out;
1787         }
1788
1789         skip = sk_peek_offset(sk, flags);
1790
1791         skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
1792         if (!skb) {
1793                 unix_state_lock(sk);
1794                 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
1795                 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
1796                     (sk->sk_shutdown & RCV_SHUTDOWN))
1797                         err = 0;
1798                 unix_state_unlock(sk);
1799                 goto out_unlock;
1800         }
1801
             /* A datagram was taken: wake tasks sleeping on our peer_wait queue. */
1802         wake_up_interruptible_sync_poll(&u->peer_wait,
1803                                         POLLOUT | POLLWRNORM | POLLWRBAND);
1804
1805         if (msg->msg_name)
1806                 unix_copy_addr(msg, skb->sk);
1807
             /* Clamp to what is left of the datagram; flag truncation otherwise. */
1808         if (size > skb->len - skip)
1809                 size = skb->len - skip;
1810         else if (size < skb->len - skip)
1811                 msg->msg_flags |= MSG_TRUNC;
1812
1813         err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
1814         if (err)
1815                 goto out_free;
1816
1817         if (sock_flag(sk, SOCK_RCVTSTAMP))
1818                 __sock_recv_timestamp(msg, sk, skb);
1819
             /* Fall back to an on-stack cookie when the caller supplied none. */
1820         if (!siocb->scm) {
1821                 siocb->scm = &tmp_scm;
1822                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1823         }
1824         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
1825         unix_set_secdata(siocb->scm, skb);
1826
1827         if (!(flags & MSG_PEEK)) {
1828                 if (UNIXCB(skb).fp)
1829                         unix_detach_fds(siocb->scm, skb);
1830
1831                 sk_peek_offset_bwd(sk, skb->len);
1832         } else {
1833                 /* It is questionable: on PEEK we could:
1834                    - do not return fds - good, but too simple 8)
1835                    - return fds, and do not return them on read (old strategy,
1836                      apparently wrong)
1837                    - clone fds (I chose it for now, it is the most universal
1838                      solution)
1839
1840                    POSIX 1003.1g does not actually define this clearly
1841                    at all. POSIX 1003.1g doesn't define a lot of things
1842                    clearly however!
1843
1844                 */
1845
1846                 sk_peek_offset_fwd(sk, size);
1847
1848                 if (UNIXCB(skb).fp)
1849                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1850         }
             /* With MSG_TRUNC in flags report the real remaining length instead. */
1851         err = (flags & MSG_TRUNC) ? skb->len - skip : size;
1852
1853         scm_recv(sock, msg, siocb->scm, flags);
1854
1855 out_free:
1856         skb_free_datagram(sk, skb);
1857 out_unlock:
1858         mutex_unlock(&u->readlock);
1859 out:
1860         return err;
1861 }
1862
1863 /*
1864  *      Sleep until more data has arrived. But check for races..
      *
      *      @last is the queue tail the caller observed.  The loop, entered
      *      with the unix state lock held, stops as soon as the tail of
      *      sk->sk_receive_queue differs from @last, an error is pending,
      *      RCV_SHUTDOWN is set, a signal is pending or the timeout has
      *      run out.  The state lock is dropped only around the actual
      *      sleep, during which SOCK_ASYNC_WAITDATA is set on the socket.
      *
      *      Returns the remaining timeout.
1865  */
1866 static long unix_stream_data_wait(struct sock *sk, long timeo,
1867                                   struct sk_buff *last)
1868 {
1869         DEFINE_WAIT(wait);
1870
1871         unix_state_lock(sk);
1872
1873         for (;;) {
                     /* Queue ourselves before testing, so a wakeup in between is not lost. */
1874                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
1875
1876                 if (skb_peek_tail(&sk->sk_receive_queue) != last ||
1877                     sk->sk_err ||
1878                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1879                     signal_pending(current) ||
1880                     !timeo)
1881                         break;
1882
1883                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1884                 unix_state_unlock(sk);
1885                 timeo = freezable_schedule_timeout(timeo);
1886                 unix_state_lock(sk);
1887                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1888         }
1889
1890         finish_wait(sk_sleep(sk), &wait);
1891         unix_state_unlock(sk);
1892         return timeo;
1893 }
1894
1895 static unsigned int unix_skb_len(const struct sk_buff *skb)
1896 {
1897         return skb->len - UNIXCB(skb).consumed;
1898 }
1899
1900 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1901                                struct msghdr *msg, size_t size,
1902                                int flags)
1903 {
1904         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1905         struct scm_cookie tmp_scm;
1906         struct sock *sk = sock->sk;
1907         struct unix_sock *u = unix_sk(sk);
1908         struct sockaddr_un *sunaddr = msg->msg_name;
1909         int copied = 0;
1910         int check_creds = 0;
1911         int target;
1912         int err = 0;
1913         long timeo;
1914         int skip;
1915
1916         err = -EINVAL;
1917         if (sk->sk_state != TCP_ESTABLISHED)
1918                 goto out;
1919
1920         err = -EOPNOTSUPP;
1921         if (flags&MSG_OOB)
1922                 goto out;
1923
1924         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1925         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1926
1927         msg->msg_namelen = 0;
1928
1929         /* Lock the socket to prevent queue disordering
1930          * while sleeps in memcpy_tomsg
1931          */
1932
1933         if (!siocb->scm) {
1934                 siocb->scm = &tmp_scm;
1935                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1936         }
1937
1938         err = mutex_lock_interruptible(&u->readlock);
1939         if (err) {
1940                 err = sock_intr_errno(timeo);
1941                 goto out;
1942         }
1943
1944         do {
1945                 int chunk;
1946                 struct sk_buff *skb, *last;
1947
1948                 unix_state_lock(sk);
1949                 last = skb = skb_peek(&sk->sk_receive_queue);
1950 again:
1951                 if (skb == NULL) {
1952                         unix_sk(sk)->recursion_level = 0;
1953                         if (copied >= target)
1954                                 goto unlock;
1955
1956                         /*
1957                          *      POSIX 1003.1g mandates this order.
1958                          */
1959
1960                         err = sock_error(sk);
1961                         if (err)
1962                                 goto unlock;
1963                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1964                                 goto unlock;
1965
1966                         unix_state_unlock(sk);
1967                         err = -EAGAIN;
1968                         if (!timeo)
1969                                 break;
1970                         mutex_unlock(&u->readlock);
1971
1972                         timeo = unix_stream_data_wait(sk, timeo, last);
1973
1974                         if (signal_pending(current)
1975                             ||  mutex_lock_interruptible(&u->readlock)) {
1976                                 err = sock_intr_errno(timeo);
1977                                 goto out;
1978                         }
1979
1980                         continue;
1981  unlock:
1982                         unix_state_unlock(sk);
1983                         break;
1984                 }
1985
1986                 skip = sk_peek_offset(sk, flags);
1987                 while (skip >= unix_skb_len(skb)) {
1988                         skip -= unix_skb_len(skb);
1989                         last = skb;
1990                         skb = skb_peek_next(skb, &sk->sk_receive_queue);
1991                         if (!skb)
1992                                 goto again;
1993                 }
1994
1995                 unix_state_unlock(sk);
1996
1997                 if (check_creds) {
1998                         /* Never glue messages from different writers */
1999                         if ((UNIXCB(skb).pid  != siocb->scm->pid) ||
2000                             !uid_eq(UNIXCB(skb).uid, siocb->scm->creds.uid) ||
2001                             !gid_eq(UNIXCB(skb).gid, siocb->scm->creds.gid))
2002                                 break;
2003                 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2004                         /* Copy credentials */
2005                         scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2006                         check_creds = 1;
2007                 }
2008
2009                 /* Copy address just once */
2010                 if (sunaddr) {
2011                         unix_copy_addr(msg, skb->sk);
2012                         sunaddr = NULL;
2013                 }
2014
2015                 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2016                 if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
2017                                             msg->msg_iov, chunk)) {
2018                         if (copied == 0)
2019                                 copied = -EFAULT;
2020                         break;
2021                 }
2022                 copied += chunk;
2023                 size -= chunk;
2024
2025                 /* Mark read part of skb as used */
2026                 if (!(flags & MSG_PEEK)) {
2027                         UNIXCB(skb).consumed += chunk;
2028
2029                         sk_peek_offset_bwd(sk, chunk);
2030
2031                         if (UNIXCB(skb).fp)
2032                                 unix_detach_fds(siocb->scm, skb);
2033
2034                         if (unix_skb_len(skb))
2035                                 break;
2036
2037                         skb_unlink(skb, &sk->sk_receive_queue);
2038                         consume_skb(skb);
2039
2040                         if (siocb->scm->fp)
2041                                 break;
2042                 } else {
2043                         /* It is questionable, see note in unix_dgram_recvmsg.
2044                          */
2045                         if (UNIXCB(skb).fp)
2046                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
2047
2048                         sk_peek_offset_fwd(sk, chunk);
2049
2050                         break;
2051                 }
2052         } while (size);
2053
2054         mutex_unlock(&u->readlock);
2055         scm_recv(sock, msg, siocb->scm, flags);
2056 out:
2057         return copied ? : err;
2058 }
2059
2060 static int unix_shutdown(struct socket *sock, int mode)
2061 {
2062         struct sock *sk = sock->sk;
2063         struct sock *other;
2064
2065         if (mode < SHUT_RD || mode > SHUT_RDWR)
2066                 return -EINVAL;
2067         /* This maps:
2068          * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
2069          * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
2070          * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2071          */
2072         ++mode;
2073
2074         unix_state_lock(sk);
2075         sk->sk_shutdown |= mode;
2076         other = unix_peer(sk);
2077         if (other)
2078                 sock_hold(other);
2079         unix_state_unlock(sk);
2080         sk->sk_state_change(sk);
2081
2082         if (other &&
2083                 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2084
2085                 int peer_mode = 0;
2086
2087                 if (mode&RCV_SHUTDOWN)
2088                         peer_mode |= SEND_SHUTDOWN;
2089                 if (mode&SEND_SHUTDOWN)
2090                         peer_mode |= RCV_SHUTDOWN;
2091                 unix_state_lock(other);
2092                 other->sk_shutdown |= peer_mode;
2093                 unix_state_unlock(other);
2094                 other->sk_state_change(other);
2095                 if (peer_mode == SHUTDOWN_MASK)
2096                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2097                 else if (peer_mode & RCV_SHUTDOWN)
2098                         sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
2099         }
2100         if (other)
2101                 sock_put(other);
2102
2103         return 0;
2104 }
2105
2106 long unix_inq_len(struct sock *sk)
2107 {
2108         struct sk_buff *skb;
2109         long amount = 0;
2110
2111         if (sk->sk_state == TCP_LISTEN)
2112                 return -EINVAL;
2113
2114         spin_lock(&sk->sk_receive_queue.lock);
2115         if (sk->sk_type == SOCK_STREAM ||
2116             sk->sk_type == SOCK_SEQPACKET) {
2117                 skb_queue_walk(&sk->sk_receive_queue, skb)
2118                         amount += unix_skb_len(skb);
2119         } else {
2120                 skb = skb_peek(&sk->sk_receive_queue);
2121                 if (skb)
2122                         amount = skb->len;
2123         }
2124         spin_unlock(&sk->sk_receive_queue.lock);
2125
2126         return amount;
2127 }
2128 EXPORT_SYMBOL_GPL(unix_inq_len);
2129
/* Report the value of sk_wmem_alloc_get() for @sk as the send-queue size. */
long unix_outq_len(struct sock *sk)
{
        long queued = sk_wmem_alloc_get(sk);

        return queued;
}
EXPORT_SYMBOL_GPL(unix_outq_len);
2135
2136 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2137 {
2138         struct sock *sk = sock->sk;
2139         long amount = 0;
2140         int err;
2141
2142         switch (cmd) {
2143         case SIOCOUTQ:
2144                 amount = unix_outq_len(sk);
2145                 err = put_user(amount, (int __user *)arg);
2146                 break;
2147         case SIOCINQ:
2148                 amount = unix_inq_len(sk);
2149                 if (amount < 0)
2150                         err = amount;
2151                 else
2152                         err = put_user(amount, (int __user *)arg);
2153                 break;
2154         default:
2155                 err = -ENOIOCTLCMD;
2156                 break;
2157         }
2158         return err;
2159 }
2160
2161 static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2162 {
2163         struct sock *sk = sock->sk;
2164         unsigned int mask;
2165
2166         sock_poll_wait(file, sk_sleep(sk), wait);
2167         mask = 0;
2168
2169         /* exceptional events? */
2170         if (sk->sk_err)
2171                 mask |= POLLERR;
2172         if (sk->sk_shutdown == SHUTDOWN_MASK)
2173                 mask |= POLLHUP;
2174         if (sk->sk_shutdown & RCV_SHUTDOWN)
2175                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2176
2177         /* readable? */
2178         if (!skb_queue_empty(&sk->sk_receive_queue))
2179                 mask |= POLLIN | POLLRDNORM;
2180
2181         /* Connection-based need to check for termination and startup */
2182         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2183             sk->sk_state == TCP_CLOSE)
2184                 mask |= POLLHUP;
2185
2186         /*
2187          * we set writable also when the other side has shut down the
2188          * connection. This prevents stuck sockets.
2189          */
2190         if (unix_writable(sk))
2191                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2192
2193         return mask;
2194 }
2195
/*
 * poll() variant that also takes the peer's receive queue into account.
 *
 * Readable/exceptional events are computed from @sock itself.  Writability
 * is evaluated only when the caller asked for a write event: the socket
 * must pass unix_writable() and, if it has a peer that is not connected
 * back to it, that peer's receive queue must not be full.  When the socket
 * is not writable, SOCK_ASYNC_NOSPACE is set on it.
 *
 * Returns the mask of events that are currently true.
 */
2196 static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
2197                                     poll_table *wait)
2198 {
2199         struct sock *sk = sock->sk, *other;
2200         unsigned int mask, writable;
2201
2202         sock_poll_wait(file, sk_sleep(sk), wait);
2203         mask = 0;
2204
2205         /* exceptional events? */
2206         if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
2207                 mask |= POLLERR |
2208                         (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? POLLPRI : 0);
2209
2210         if (sk->sk_shutdown & RCV_SHUTDOWN)
2211                 mask |= POLLRDHUP | POLLIN | POLLRDNORM;
2212         if (sk->sk_shutdown == SHUTDOWN_MASK)
2213                 mask |= POLLHUP;
2214
2215         /* readable? */
2216         if (!skb_queue_empty(&sk->sk_receive_queue))
2217                 mask |= POLLIN | POLLRDNORM;
2218
2219         /* Connection-based need to check for termination and startup */
2220         if (sk->sk_type == SOCK_SEQPACKET) {
2221                 if (sk->sk_state == TCP_CLOSE)
2222                         mask |= POLLHUP;
2223                 /* connection hasn't started yet? */
2224                 if (sk->sk_state == TCP_SYN_SENT)
2225                         return mask;
2226         }
2227
2228         /* No write status requested, avoid expensive OUT tests. */
2229         if (!(poll_requested_events(wait) & (POLLWRBAND|POLLWRNORM|POLLOUT)))
2230                 return mask;
2231
2232         writable = unix_writable(sk);
2233         other = unix_peer_get(sk);
2234         if (other) {
                     /*
                      * The peer is not connected back to us, so its receive
                      * queue limits what we can send: also wait on the peer's
                      * peer_wait queue.
                      * NOTE(review): the poll entry is registered on the
                      * peer's wait queue while only the temporary reference
                      * from unix_peer_get() is held in this function -
                      * confirm the lifetime of that queue is guaranteed
                      * elsewhere.
                      */
2235                 if (unix_peer(other) != sk) {
2236                         sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
2237                         if (unix_recvq_full(other))
2238                                 writable = 0;
2239                 }
2240                 sock_put(other);
2241         }
2242
2243         if (writable)
2244                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
2245         else
2246                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
2247
2248         return mask;
2249 }
2250
2251 #ifdef CONFIG_PROC_FS
2252
/*
 * A seq_file position packs two values into one long: the index into
 * unix_socket_table in the bits from BUCKET_SPACE upwards, and an offset
 * in the low BUCKET_SPACE bits.  unix_from_bucket() counts that offset
 * from 1 among the sockets of the bucket that belong to the reader's
 * network namespace.
 */
2253 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2254
2255 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2256 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2257 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
2258
2259 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2260 {
2261         unsigned long offset = get_offset(*pos);
2262         unsigned long bucket = get_bucket(*pos);
2263         struct sock *sk;
2264         unsigned long count = 0;
2265
2266         for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2267                 if (sock_net(sk) != seq_file_net(seq))
2268                         continue;
2269                 if (++count == offset)
2270                         break;
2271         }
2272
2273         return sk;
2274 }
2275
/*
 * Find the next socket to show for the network namespace of @seq.
 *
 * When @sk is a real socket (greater than SEQ_START_TOKEN) the rest of its
 * hash chain is scanned first.  If the chain ends, control jumps into the
 * bucket loop at next_bucket.  Otherwise (@sk is NULL or SEQ_START_TOKEN)
 * the socket addressed by *@pos is looked up with unix_from_bucket().
 * Every time a bucket is exhausted, *@pos is moved to offset 1 of the
 * following bucket.
 *
 * Returns the socket found, or NULL once the bucket index has moved past
 * the end of unix_socket_table.
 */
2276 static struct sock *unix_next_socket(struct seq_file *seq,
2277                                      struct sock *sk,
2278                                      loff_t *pos)
2279 {
2280         unsigned long bucket;
2281
2282         while (sk > (struct sock *)SEQ_START_TOKEN) {
2283                 sk = sk_next(sk);
2284                 if (!sk)
2285                         goto next_bucket;
2286                 if (sock_net(sk) == seq_file_net(seq))
2287                         return sk;
2288         }
2289
2290         do {
2291                 sk = unix_from_bucket(seq, pos);
2292                 if (sk)
2293                         return sk;
2294
2295 next_bucket:
2296                 bucket = get_bucket(*pos) + 1;
2297                 *pos = set_bucket_offset(bucket, 1);
2298         } while (bucket < ARRAY_SIZE(unix_socket_table));
2299
2300         return NULL;
2301 }
2302
2303 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2304         __acquires(unix_table_lock)
2305 {
2306         spin_lock(&unix_table_lock);
2307
2308         if (!*pos)
2309                 return SEQ_START_TOKEN;
2310
2311         if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2312                 return NULL;
2313
2314         return unix_next_socket(seq, NULL, pos);
2315 }
2316
2317 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2318 {
2319         ++*pos;
2320         return unix_next_socket(seq, v, pos);
2321 }
2322
2323 static void unix_seq_stop(struct seq_file *seq, void *v)
2324         __releases(unix_table_lock)
2325 {
2326         spin_unlock(&unix_table_lock);
2327 }
2328
2329 static int unix_seq_show(struct seq_file *seq, void *v)
2330 {
2331
2332         if (v == SEQ_START_TOKEN)
2333                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
2334                          "Inode Path\n");
2335         else {
2336                 struct sock *s = v;
2337                 struct unix_sock *u = unix_sk(s);
2338                 unix_state_lock(s);
2339
2340                 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2341                         s,
2342                         atomic_read(&s->sk_refcnt),
2343                         0,
2344                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2345                         s->sk_type,
2346                         s->sk_socket ?
2347                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2348                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2349                         sock_i_ino(s));
2350
2351                 if (u->addr) {
2352                         int i, len;
2353                         seq_putc(seq, ' ');
2354
2355                         i = 0;
2356                         len = u->addr->len - sizeof(short);
2357                         if (!UNIX_ABSTRACT(s))
2358                                 len--;
2359                         else {
2360                                 seq_putc(seq, '@');
2361                                 i++;
2362                         }
2363                         for ( ; i < len; i++)
2364                                 seq_putc(seq, u->addr->name->sun_path[i]);
2365                 }
2366                 unix_state_unlock(s);
2367                 seq_putc(seq, '\n');
2368         }
2369
2370         return 0;
2371 }
2372
2373 static const struct seq_operations unix_seq_ops = {
2374         .start  = unix_seq_start,
2375         .next   = unix_seq_next,
2376         .stop   = unix_seq_stop,
2377         .show   = unix_seq_show,
2378 };
2379
2380 static int unix_seq_open(struct inode *inode, struct file *file)
2381 {
2382         return seq_open_net(inode, file, &unix_seq_ops,
2383                             sizeof(struct seq_net_private));
2384 }
2385
2386 static const struct file_operations unix_seq_fops = {
2387         .owner          = THIS_MODULE,
2388         .open           = unix_seq_open,
2389         .read           = seq_read,
2390         .llseek         = seq_lseek,
2391         .release        = seq_release_net,
2392 };
2393
2394 #endif
2395
2396 static const struct net_proto_family unix_family_ops = {
2397         .family = PF_UNIX,
2398         .create = unix_create,
2399         .owner  = THIS_MODULE,
2400 };
2401
2402
2403 static int __net_init unix_net_init(struct net *net)
2404 {
2405         int error = -ENOMEM;
2406
2407         net->unx.sysctl_max_dgram_qlen = 10;
2408         if (unix_sysctl_register(net))
2409                 goto out;
2410
2411 #ifdef CONFIG_PROC_FS
2412         if (!proc_create("unix", 0, net->proc_net, &unix_seq_fops)) {
2413                 unix_sysctl_unregister(net);
2414                 goto out;
2415         }
2416 #endif
2417         error = 0;
2418 out:
2419         return error;
2420 }
2421
2422 static void __net_exit unix_net_exit(struct net *net)
2423 {
2424         unix_sysctl_unregister(net);
2425         remove_proc_entry("unix", net->proc_net);
2426 }
2427
2428 static struct pernet_operations unix_net_ops = {
2429         .init = unix_net_init,
2430         .exit = unix_net_exit,
2431 };
2432
2433 static int __init af_unix_init(void)
2434 {
2435         int rc = -1;
2436
2437         BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2438
2439         rc = proto_register(&unix_proto, 1);
2440         if (rc != 0) {
2441                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2442                        __func__);
2443                 goto out;
2444         }
2445
2446         sock_register(&unix_family_ops);
2447         register_pernet_subsys(&unix_net_ops);
2448 out:
2449         return rc;
2450 }
2451
2452 static void __exit af_unix_exit(void)
2453 {
2454         sock_unregister(PF_UNIX);
2455         proto_unregister(&unix_proto);
2456         unregister_pernet_subsys(&unix_net_ops);
2457 }
2458
2459 /* Earlier than device_initcall() so that other drivers invoking
2460    request_module() don't end up in a loop when modprobe tries
2461    to use a UNIX socket. But later than subsys_initcall() because
2462    we depend on stuff initialised there */
2463 fs_initcall(af_unix_init);
2464 module_exit(af_unix_exit);
2465
/* Module metadata: licence tag and the PF_UNIX protocol-family alias. */
2466 MODULE_LICENSE("GPL");
2467 MODULE_ALIAS_NETPROTO(PF_UNIX);