1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * NET4: Implementation of BSD Unix domain sockets.
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by the above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid a huge number
34 * of socks hashed (this is for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
48 * Known differences from reference BSD that was tested:
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * that start with a 0 byte, so that this name space does not intersect
 * with that of filesystem path names.
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/termios.h>
93 #include <linux/sockios.h>
94 #include <linux/net.h>
97 #include <linux/slab.h>
98 #include <linux/uaccess.h>
99 #include <linux/skbuff.h>
100 #include <linux/netdevice.h>
101 #include <net/net_namespace.h>
102 #include <net/sock.h>
103 #include <net/tcp_states.h>
104 #include <net/af_unix.h>
105 #include <linux/proc_fs.h>
106 #include <linux/seq_file.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/rtnetlink.h>
111 #include <linux/mount.h>
112 #include <net/checksum.h>
113 #include <linux/security.h>
114 #include <linux/freezer.h>
115 #include <linux/file.h>
/*
 * File-wide AF_UNIX state:
 *  - unix_socket_table: 2 * UNIX_HASH_SIZE hash chains of sockets;
 *    unix_sockets_unbound() only ever returns chains from the upper half.
 *  - unix_table_lock: spinlock taken around the chain insert, remove and
 *    lookup operations visible in this file.
 *  - unix_nr_socks: count of existing sockets, raised in unix_create1() and
 *    lowered in unix_sock_destructor().
 * The table and the lock are exported (GPL-only) for use outside this file.
 */
119 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
120 EXPORT_SYMBOL_GPL(unix_socket_table);
121 DEFINE_SPINLOCK(unix_table_lock);
122 EXPORT_SYMBOL_GPL(unix_table_lock);
123 static atomic_long_t unix_nr_socks;
/*
 * Pick the hash chain for a socket that has no address yet (unix_create1()
 * passes the new sock pointer).  The pointer value is used as the hash
 * input; the final index is reduced modulo UNIX_HASH_SIZE and offset by
 * UNIX_HASH_SIZE, so the returned chain is always in the upper half of
 * unix_socket_table.
 */
126 static struct hlist_head *unix_sockets_unbound(void *addr)
128 unsigned long hash = (unsigned long)addr;
132 hash %= UNIX_HASH_SIZE;
133 return &unix_socket_table[UNIX_HASH_SIZE + hash];
/*
 * True when the socket's address hash is below UNIX_HASH_SIZE.  unix_bind()
 * stores exactly UNIX_HASH_SIZE in addr->hash on its filesystem-path branch,
 * so such sockets evaluate to false here.  The macro dereferences
 * unix_sk(sk)->addr unconditionally: the socket must already have an address.
 */
136 #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
/*
 * Helpers moving the security id between a scm_cookie and the per-skb
 * control block (UNIXCB).  In the CONFIG_SECURITY_NETWORK variants:
 *   unix_get_secdata() copies scm->secid into the skb,
 *   unix_set_secdata() copies the skb's secid back into scm,
 *   unix_secdata_eq()  reports whether the two ids are equal.
 * NOTE(review): the second set of definitions is presumably the fallback for
 * kernels without CONFIG_SECURITY_NETWORK; neither the preprocessor branch
 * separating the two sets nor the fallback bodies are visible here -- confirm.
 */
138 #ifdef CONFIG_SECURITY_NETWORK
139 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
141 UNIXCB(skb).secid = scm->secid;
144 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
146 scm->secid = UNIXCB(skb).secid;
149 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
151 return (scm->secid == UNIXCB(skb).secid);
154 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
157 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
160 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
164 #endif /* CONFIG_SECURITY_NETWORK */
167 * SMP locking strategy:
168 * hash table is protected with spinlock unix_table_lock
169 * each socket state is protected by separate spin lock.
/*
 * Turn a checksum into a hash-table index: the checksum is folded with
 * csum_fold() and the result is masked with UNIX_HASH_SIZE - 1, so the
 * return value never exceeds UNIX_HASH_SIZE - 1.
 */
172 static inline unsigned int unix_hash_fold(__wsum n)
174 unsigned int hash = (__force unsigned int)csum_fold(n);
177 return hash&(UNIX_HASH_SIZE-1);
/*
 * Accessor for the peer pointer of an AF_UNIX socket.  It expands to an
 * lvalue: this file both reads it and assigns through it
 * (e.g. "unix_peer(sk) = NULL").
 */
180 #define unix_peer(sk) (unix_sk(sk)->peer)
/* Return nonzero when osk's peer pointer is sk. */
182 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
184 return unix_peer(osk) == sk;
/*
 * sk may send to osk when osk has no peer at all, or when osk's peer is sk
 * itself.  Returns nonzero in those two cases.
 */
187 static inline int unix_may_send(struct sock *sk, struct sock *osk)
189 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
/*
 * Return nonzero when the number of skbs on sk's receive queue exceeds
 * sk_max_ack_backlog.  The comparison is strict, so a queue holding exactly
 * sk_max_ack_backlog entries is not yet reported as full.
 */
192 static inline int unix_recvq_full(struct sock const *sk)
194 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
/*
 * Exported (GPL-only) helper returning a struct sock pointer for socket s;
 * unix_getname() uses it to switch from a socket to its peer.
 * NOTE(review): only the unix_state_unlock(s) line of the body is visible
 * here.  Presumably the peer is read under the state lock and returned with
 * a reference held -- confirm against the full source.
 */
197 struct sock *unix_peer_get(struct sock *s)
205 unix_state_unlock(s);
208 EXPORT_SYMBOL_GPL(unix_peer_get);
/*
 * Drop one reference on addr.  The statement executed when the count reaches
 * zero is not visible here (presumably it frees the address -- confirm).
 */
210 static inline void unix_release_addr(struct unix_address *addr)
212 if (refcount_dec_and_test(&addr->refcnt))
217 * Check unix socket name:
218 * - must not be zero length.
219 * - if it starts with a non-zero byte, it is treated as a NUL terminated path (FS object)
220 * - if it starts with a zero byte, it is an abstract name.
/*
 * Validate a caller-supplied sockaddr_un and normalise its length.
 *
 * @sunaddr: address to check; may be written to (see below).
 * @len:     length supplied by the caller.
 * @hashp:   receives a table hash computed from the address bytes.
 *
 * Visible checks: len must be larger than sizeof(short) and no larger than
 * sizeof(*sunaddr); sunaddr must be non-NULL with family AF_UNIX.  When
 * sun_path[0] is non-zero the buffer is NUL-terminated at offset len and len
 * is recomputed from strlen() of the path.  *hashp is set from a checksum
 * over the first len bytes of the address.
 *
 * NOTE(review): the return statements and error codes are not visible here;
 * callers store the result in err.  Whether the hash line is reached for
 * path names also depends on lines not shown -- confirm.
 */
223 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
227 if (len <= sizeof(short) || len > sizeof(*sunaddr))
229 if (!sunaddr || sunaddr->sun_family != AF_UNIX)
231 if (sunaddr->sun_path[0]) {
233 * This may look like an off by one error but it is a bit more
234 * subtle. 108 is the longest valid AF_UNIX path for a binding.
235 * sun_path[108] doesn't as such exist. However in kernel space
236 * we are guaranteed that it is a valid memory location in our
237 * kernel address buffer.
239 ((char *)sunaddr)[len] = 0;
240 len = strlen(sunaddr->sun_path)+1+sizeof(short);
244 *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
/*
 * Unlink sk from whatever hash chain it is on.  Takes no lock itself; the
 * callers in this file hold unix_table_lock around it.
 */
248 static void __unix_remove_socket(struct sock *sk)
250 sk_del_node_init(sk);
/*
 * Link sk onto the given hash chain.  Warns if sk is still hashed elsewhere.
 * Takes no lock itself; the callers in this file hold unix_table_lock.
 */
253 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
255 WARN_ON(!sk_unhashed(sk));
256 sk_add_node(sk, list);
/* Locked wrapper: unlink sk from its hash chain under unix_table_lock. */
259 static inline void unix_remove_socket(struct sock *sk)
261 spin_lock(&unix_table_lock);
262 __unix_remove_socket(sk);
263 spin_unlock(&unix_table_lock);
/* Locked wrapper: link sk onto the given chain under unix_table_lock. */
266 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
268 spin_lock(&unix_table_lock);
269 __unix_insert_socket(list, sk);
270 spin_unlock(&unix_table_lock);
/*
 * Look up a socket by name on chain unix_socket_table[hash ^ type].
 *
 * Each candidate is checked against the requested network namespace, and
 * matches when its address has the same length and identical bytes
 * (memcmp over len bytes).  Takes no lock itself; callers in this file hold
 * unix_table_lock.
 *
 * NOTE(review): the statements taken on a namespace mismatch and on a match,
 * and the return statements, are not visible here.  Callers treat a non-NULL
 * result as "name in use / found".
 */
273 static struct sock *__unix_find_socket_byname(struct net *net,
274 struct sockaddr_un *sunname,
275 int len, int type, unsigned int hash)
279 sk_for_each(s, &unix_socket_table[hash ^ type]) {
280 struct unix_sock *u = unix_sk(s);
282 if (!net_eq(sock_net(s), net))
285 if (u->addr->len == len &&
286 !memcmp(u->addr->name, sunname, len))
/*
 * Locked wrapper around __unix_find_socket_byname(): performs the lookup
 * with unix_table_lock held.
 * NOTE(review): the lines between the lookup and the unlock are not visible;
 * presumably a reference is taken on a found socket before the lock is
 * dropped -- confirm.
 */
292 static inline struct sock *unix_find_socket_byname(struct net *net,
293 struct sockaddr_un *sunname,
299 spin_lock(&unix_table_lock);
300 s = __unix_find_socket_byname(net, sunname, len, type, hash);
303 spin_unlock(&unix_table_lock);
/*
 * Look up a filesystem-bound socket by its inode.  Under unix_table_lock,
 * walks the chain selected by i->i_ino & (UNIX_HASH_SIZE - 1) -- the same
 * index unix_bind() uses for path-based sockets -- and matches a socket
 * whose path dentry is backed by inode i.
 * NOTE(review): the statements run on a match and the return value are not
 * visible here.
 */
307 static struct sock *unix_find_socket_byinode(struct inode *i)
311 spin_lock(&unix_table_lock);
313 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
314 struct dentry *dentry = unix_sk(s)->path.dentry;
316 if (dentry && d_backing_inode(dentry) == i) {
323 spin_unlock(&unix_table_lock);
327 /* Support code for asymmetrically connected dgram sockets
329 * If a datagram socket is connected to a socket not itself connected
330 * to the first socket (eg, /dev/log), clients may only enqueue more
331 * messages if the present receive queue of the server socket is not
332 * "too large". This means there's a second writeability condition
333 * poll and sendmsg need to test. The dgram recv code will do a wake
334 * up on the peer_wait wait queue of a socket upon reception of a
335 * datagram which needs to be propagated to sleeping would-be writers
336 * since these might not have sent anything so far. This can't be
337 * accomplished via poll_wait because the lifetime of the server
338 * socket might be less than that of its clients if these break their
339 * association with it or if the server socket is closed while clients
340 * are still connected to it and there's no way to inform "a polling
341 * implementation" that it should let go of a certain wait queue
343 * In order to propagate a wake up, a wait_queue_entry_t of the client
344 * socket is enqueued on the peer_wait queue of the server socket
345 * whose wake function does a wake_up on the ordinary client socket
346 * wait queue. This connection is established whenever a write (or
347 * poll for write) hits the flow control condition and is broken when the
348 * association to the server socket is dissolved or after a wake up
 * has been relayed.
/*
 * Wait-queue callback for the peer_wake entry of a unix_sock (installed in
 * unix_create1()).  When the peer's peer_wait queue is woken, this removes
 * the entry from that queue, clears peer_wake.private, and then wakes the
 * sleepers on the socket's own wait queue with the poll bits derived from
 * key.  The registration is therefore one-shot.
 * NOTE(review): the return value is on a line not visible here.
 */
352 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
356 wait_queue_head_t *u_sleep;
358 u = container_of(q, struct unix_sock, peer_wake);
360 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
362 u->peer_wake.private = NULL;
364 /* relaying can only happen while the wq still exists */
365 u_sleep = sk_sleep(&u->sk);
367 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
/*
 * Register sk's peer_wake entry on other's peer_wait queue, unless it is
 * already registered somewhere (peer_wake.private non-NULL).  The check and
 * the insertion happen under other's peer_wait.lock, and peer_wake.private
 * records which socket the entry is queued on.
 * NOTE(review): the return value is not visible here;
 * unix_dgram_peer_wake_me() stores it in a variable named "connected".
 */
372 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
374 struct unix_sock *u, *u_other;
378 u_other = unix_sk(other);
380 spin_lock(&u_other->peer_wait.lock);
382 if (!u->peer_wake.private) {
383 u->peer_wake.private = other;
384 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
389 spin_unlock(&u_other->peer_wait.lock);
/*
 * Undo unix_dgram_peer_wake_connect(): under other's peer_wait.lock, if
 * sk's peer_wake entry is currently registered on other, remove it from
 * other's peer_wait queue and clear peer_wake.private.  Does nothing when
 * the entry is not registered on other.
 */
393 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
396 struct unix_sock *u, *u_other;
399 u_other = unix_sk(other);
400 spin_lock(&u_other->peer_wait.lock);
402 if (u->peer_wake.private == other) {
403 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
404 u->peer_wake.private = NULL;
407 spin_unlock(&u_other->peer_wait.lock);
/*
 * Drop sk's wake-relay registration on other, then wake the sleepers on
 * sk's own wait queue so they re-evaluate their condition.
 * NOTE(review): the poll mask passed to the wake-up is on a line not
 * visible here.
 */
410 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
413 unix_dgram_peer_wake_disconnect(sk, other);
414 wake_up_interruptible_poll(sk_sleep(sk),
421 * - unix_peer(sk) == other
422 * - association is stable
/*
 * Arrange for sk to be woken when other's receive queue drains.  First
 * registers sk on other's peer_wait queue, then re-checks whether other's
 * queue is still full and other is not SOCK_DEAD.  The registration is
 * dropped again by the unix_dgram_peer_wake_disconnect() call.
 * NOTE(review): the statements that choose between keeping and dropping the
 * registration, and the return values, are not visible here.
 */
424 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
428 connected = unix_dgram_peer_wake_connect(sk, other);
430 /* If other is SOCK_DEAD, we want to make sure we signal
431 * POLLOUT, such that a subsequent write() can get a
432 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
433 * to other and its full, we will hang waiting for POLLOUT.
435 if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
439 unix_dgram_peer_wake_disconnect(sk, other);
/*
 * A socket counts as writable when it is not listening and four times its
 * allocated write memory (sk_wmem_alloc << 2) does not exceed sk_sndbuf,
 * i.e. at most a quarter of the send buffer is in use.
 */
444 static int unix_writable(const struct sock *sk)
446 return sk->sk_state != TCP_LISTEN &&
447 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
/*
 * sk_write_space callback (installed in unix_create1()).  When the socket is
 * writable again, wakes any sleeper on its wait queue with
 * EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND and sends the SOCK_WAKE_SPACE async
 * notification.
 * NOTE(review): sk_wq is read with rcu_dereference(); the matching RCU
 * read-side lock/unlock lines are not visible here.
 */
450 static void unix_write_space(struct sock *sk)
452 struct socket_wq *wq;
455 if (unix_writable(sk)) {
456 wq = rcu_dereference(sk->sk_wq);
457 if (skwq_has_sleeper(wq))
458 wake_up_interruptible_sync_poll(&wq->wait,
459 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
460 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
465 /* When dgram socket disconnects (or changes its peer), we clear its receive
466 * queue of packets arrived from previous peer. First, it allows to do
467 * flow control based only on wmem_alloc; second, sk connected to peer
468 * may receive messages only from that peer. */
/*
 * @sk:    datagram socket that dropped or changed its peer.
 * @other: the previous peer.
 *
 * Two visible steps: (1) if sk has queued packets, purge them and wake
 * everyone waiting on sk's peer_wait queue; (2) if other is still alive and
 * still has sk as its peer, set ECONNRESET on other and invoke its error
 * report callback.
 */
469 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
471 if (!skb_queue_empty(&sk->sk_receive_queue)) {
472 skb_queue_purge(&sk->sk_receive_queue);
473 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
475 /* If one link of bidirectional dgram pipe is disconnected,
476 * we signal error. Messages are lost. Do not make this,
477 * when peer was not connected to us.
479 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
480 other->sk_err = ECONNRESET;
481 other->sk_error_report(other);
/*
 * sk_destruct callback (installed in unix_create1()), run when the sock is
 * being freed.  Purges any remaining receive-queue skbs, warns if write
 * memory is still accounted, if the sock is still hashed, or if it is still
 * attached to a struct socket, and logs a message when the sock is not
 * marked SOCK_DEAD.  It then drops the address reference, decrements
 * unix_nr_socks and the per-protocol in-use counter, and optionally prints a
 * debug line under UNIX_REFCNT_DEBUG.
 * NOTE(review): what happens after the "alive socket" message, and whether
 * the address release is guarded by a NULL check, are on lines not visible.
 */
486 static void unix_sock_destructor(struct sock *sk)
488 struct unix_sock *u = unix_sk(sk);
490 skb_queue_purge(&sk->sk_receive_queue);
492 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
493 WARN_ON(!sk_unhashed(sk));
494 WARN_ON(sk->sk_socket);
495 if (!sock_flag(sk, SOCK_DEAD)) {
496 pr_info("Attempt to release alive unix socket: %p\n", sk);
501 unix_release_addr(u->addr);
503 atomic_long_dec(&unix_nr_socks);
505 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
507 #ifdef UNIX_REFCNT_DEBUG
508 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
509 atomic_long_read(&unix_nr_socks));
/*
 * Tear down an AF_UNIX sock.
 *
 * @sk:      sock being released.
 * @embrion: non-zero when sk is a not-yet-accepted connection found on a
 *           listener's queue (this function passes 1 when it recurses for
 *           those; unix_release() and unix_stream_connect() pass 0).
 *
 * Visible sequence:
 *  1. unhash sk;
 *  2. mark it fully shut down, clear its path dentry, remember the old
 *     state and move it to TCP_CLOSE;
 *  3. wake all waiters on its peer_wait queue;
 *  4. if it has a peer: for stream/seqpacket sockets shut the peer down
 *     (with ECONNRESET when sk still had unread data or is an embryo) and
 *     notify it; then drop the wake-relay registration and the peer
 *     reference, and clear the peer pointer;
 *  5. drain the receive queue; if sk was listening, each queued skb's
 *     socket is released recursively with embrion = 1;
 *  6. run unix_gc() when unix_tot_inflight is non-zero.
 * NOTE(review): locking around step 2, the skb free in step 5 and the final
 * reference drop on sk are on lines not visible here.
 */
513 static void unix_release_sock(struct sock *sk, int embrion)
515 struct unix_sock *u = unix_sk(sk);
521 unix_remove_socket(sk);
526 sk->sk_shutdown = SHUTDOWN_MASK;
528 u->path.dentry = NULL;
530 state = sk->sk_state;
531 sk->sk_state = TCP_CLOSE;
532 unix_state_unlock(sk);
534 wake_up_interruptible_all(&u->peer_wait);
536 skpair = unix_peer(sk);
538 if (skpair != NULL) {
539 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
540 unix_state_lock(skpair);
542 skpair->sk_shutdown = SHUTDOWN_MASK;
543 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
544 skpair->sk_err = ECONNRESET;
545 unix_state_unlock(skpair);
546 skpair->sk_state_change(skpair);
547 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
550 unix_dgram_peer_wake_disconnect(sk, skpair);
551 sock_put(skpair); /* It may now die */
552 unix_peer(sk) = NULL;
555 /* Try to flush out this socket. Throw out buffers at least */
557 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
558 if (state == TCP_LISTEN)
559 unix_release_sock(skb->sk, 1);
560 /* passed fds are erased in the kfree_skb hook */
561 UNIXCB(skb).consumed = skb->len;
570 /* ---- Socket is dead now and most probably destroyed ---- */
573 * Fixme: BSD difference: In BSD all sockets connected to us get
574 * ECONNRESET and we die on the spot. In Linux we behave
575 * like files and pipes do and wait for the last
578 * Can't we simply set sock->err?
580 * What the above comment does talk about? --ANK(980817)
583 if (unix_tot_inflight)
584 unix_gc(); /* Garbage collect fds */
/*
 * Record the calling task as sk's peer credentials: drops the references on
 * any previously stored pid and cred, then stores a referenced pointer to
 * the current task's thread-group pid and to the current credentials.
 */
587 static void init_peercred(struct sock *sk)
589 put_pid(sk->sk_peer_pid);
590 if (sk->sk_peer_cred)
591 put_cred(sk->sk_peer_cred);
592 sk->sk_peer_pid = get_pid(task_tgid(current));
593 sk->sk_peer_cred = get_current_cred();
/*
 * Copy the peer credentials stored on peersk into sk: drops sk's previous
 * pid and cred references, then takes new references on peersk's pid and
 * cred and stores them on sk.
 */
596 static void copy_peercred(struct sock *sk, struct sock *peersk)
598 put_pid(sk->sk_peer_pid);
599 if (sk->sk_peer_cred)
600 put_cred(sk->sk_peer_cred);
601 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
602 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
/*
 * listen() handler for stream and seqpacket sockets.
 *
 * Bails out for other socket types, for unbound sockets, and when the sock
 * is in a state other than TCP_CLOSE or TCP_LISTEN.  Otherwise it records
 * the new backlog in sk_max_ack_backlog and moves the sock to TCP_LISTEN.
 * When the backlog grows, all waiters on peer_wait are woken first
 * (unix_wait_for_peer() sleeps on that queue while the backlog is full).
 * NOTE(review): the error codes, the state lock acquisition and the
 * credential setup announced by the inline comment are on lines not
 * visible here.
 */
605 static int unix_listen(struct socket *sock, int backlog)
608 struct sock *sk = sock->sk;
609 struct unix_sock *u = unix_sk(sk);
610 struct pid *old_pid = NULL;
613 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
614 goto out; /* Only stream/seqpacket sockets accept */
617 goto out; /* No listens on an unbound socket */
619 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
621 if (backlog > sk->sk_max_ack_backlog)
622 wake_up_interruptible_all(&u->peer_wait);
623 sk->sk_max_ack_backlog = backlog;
624 sk->sk_state = TCP_LISTEN;
625 /* set credentials so connect can copy them */
630 unix_state_unlock(sk);
/*
 * Forward declarations of the socket-operation handlers.  They are needed
 * because the proto_ops tables that follow reference these functions before
 * their definitions appear.
 */
636 static int unix_release(struct socket *);
637 static int unix_bind(struct socket *, struct sockaddr *, int);
638 static int unix_stream_connect(struct socket *, struct sockaddr *,
639 int addr_len, int flags);
640 static int unix_socketpair(struct socket *, struct socket *);
641 static int unix_accept(struct socket *, struct socket *, int, bool);
642 static int unix_getname(struct socket *, struct sockaddr *, int);
643 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
644 static __poll_t unix_dgram_poll(struct file *, struct socket *,
646 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
648 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
650 static int unix_shutdown(struct socket *, int);
651 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
652 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
653 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
654 size_t size, int flags);
655 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
656 struct pipe_inode_info *, size_t size,
658 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
659 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
660 static int unix_dgram_connect(struct socket *, struct sockaddr *,
662 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
663 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
/*
 * set_peek_off handler shared by all three proto_ops tables.  Stores val in
 * sk->sk_peek_off while holding the socket's iolock; the lock is taken
 * interruptibly, so the update can be abandoned when a signal arrives.
 * NOTE(review): the return values are on lines not visible here.
 */
666 static int unix_set_peek_off(struct sock *sk, int val)
668 struct unix_sock *u = unix_sk(sk);
670 if (mutex_lock_interruptible(&u->iolock))
673 sk->sk_peek_off = val;
674 mutex_unlock(&u->iolock);
/*
 * Socket operations for SOCK_STREAM sockets.  Of the three tables in this
 * file, it is the only one wiring up sendpage and splice_read.
 * setsockopt/getsockopt and mmap use the generic "not supported" stubs.
 * NOTE(review): some initialisers of this table are on lines not visible
 * here.
 */
680 static const struct proto_ops unix_stream_ops = {
682 .owner = THIS_MODULE,
683 .release = unix_release,
685 .connect = unix_stream_connect,
686 .socketpair = unix_socketpair,
687 .accept = unix_accept,
688 .getname = unix_getname,
692 .compat_ioctl = unix_compat_ioctl,
694 .listen = unix_listen,
695 .shutdown = unix_shutdown,
696 .setsockopt = sock_no_setsockopt,
697 .getsockopt = sock_no_getsockopt,
698 .sendmsg = unix_stream_sendmsg,
699 .recvmsg = unix_stream_recvmsg,
700 .mmap = sock_no_mmap,
701 .sendpage = unix_stream_sendpage,
702 .splice_read = unix_stream_splice_read,
703 .set_peek_off = unix_set_peek_off,
/*
 * Socket operations for SOCK_DGRAM sockets.  Datagram sockets cannot accept
 * or listen (generic stubs), connect through unix_dgram_connect(), and poll
 * through unix_dgram_poll().  sendpage, mmap and the sockopt calls are the
 * generic "not supported" stubs.
 * NOTE(review): some initialisers of this table are on lines not visible
 * here.
 */
706 static const struct proto_ops unix_dgram_ops = {
708 .owner = THIS_MODULE,
709 .release = unix_release,
711 .connect = unix_dgram_connect,
712 .socketpair = unix_socketpair,
713 .accept = sock_no_accept,
714 .getname = unix_getname,
715 .poll = unix_dgram_poll,
718 .compat_ioctl = unix_compat_ioctl,
720 .listen = sock_no_listen,
721 .shutdown = unix_shutdown,
722 .setsockopt = sock_no_setsockopt,
723 .getsockopt = sock_no_getsockopt,
724 .sendmsg = unix_dgram_sendmsg,
725 .recvmsg = unix_dgram_recvmsg,
726 .mmap = sock_no_mmap,
727 .sendpage = sock_no_sendpage,
728 .set_peek_off = unix_set_peek_off,
/*
 * Socket operations for SOCK_SEQPACKET sockets.  Connection setup reuses the
 * stream handlers (connect, accept, listen), polling reuses the datagram
 * poll handler, and data transfer uses the dedicated seqpacket
 * sendmsg/recvmsg handlers.
 * NOTE(review): some initialisers of this table are on lines not visible
 * here.
 */
731 static const struct proto_ops unix_seqpacket_ops = {
733 .owner = THIS_MODULE,
734 .release = unix_release,
736 .connect = unix_stream_connect,
737 .socketpair = unix_socketpair,
738 .accept = unix_accept,
739 .getname = unix_getname,
740 .poll = unix_dgram_poll,
743 .compat_ioctl = unix_compat_ioctl,
745 .listen = unix_listen,
746 .shutdown = unix_shutdown,
747 .setsockopt = sock_no_setsockopt,
748 .getsockopt = sock_no_getsockopt,
749 .sendmsg = unix_seqpacket_sendmsg,
750 .recvmsg = unix_seqpacket_recvmsg,
751 .mmap = sock_no_mmap,
752 .sendpage = sock_no_sendpage,
753 .set_peek_off = unix_set_peek_off,
/*
 * Protocol descriptor handed to sk_alloc() in unix_create1(); obj_size makes
 * every allocated sock large enough to hold a struct unix_sock.
 */
756 static struct proto unix_proto = {
758 .owner = THIS_MODULE,
759 .obj_size = sizeof(struct unix_sock),
/*
 * Allocate and initialise a new AF_UNIX sock.
 *
 * @net:  network namespace for the sock.
 * @sock: struct socket to attach to; unix_stream_connect() passes NULL for
 *        the not-yet-accepted server-side sock.
 * @kern: forwarded to sk_alloc().
 *
 * The global socket count is incremented up front and compared against
 * 2 * get_max_files().  A freshly allocated sock gets its callbacks
 * (write_space, destructor), its backlog from the per-namespace
 * sysctl_max_dgram_qlen, its locks and wait queues initialised, and is
 * hashed on an "unbound" chain.  The visible atomic_long_dec() undoes the
 * count increment, and the in-use counter is bumped for the new sock.
 * Returns the sock; unix_create() maps a NULL result to -ENOMEM.
 * NOTE(review): the branch structure between the limit check, the
 * allocation and the two counter updates is on lines not visible here.
 */
762 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
764 struct sock *sk = NULL;
767 atomic_long_inc(&unix_nr_socks);
768 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
771 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto, kern);
775 sock_init_data(sock, sk);
777 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
778 sk->sk_write_space = unix_write_space;
779 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
780 sk->sk_destruct = unix_sock_destructor;
782 u->path.dentry = NULL;
784 spin_lock_init(&u->lock);
785 atomic_long_set(&u->inflight, 0);
786 INIT_LIST_HEAD(&u->link);
787 mutex_init(&u->iolock); /* single task reading lock */
788 mutex_init(&u->bindlock); /* single task binding lock */
789 init_waitqueue_head(&u->peer_wait);
790 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
791 unix_insert_socket(unix_sockets_unbound(sk), sk);
794 atomic_long_dec(&unix_nr_socks);
797 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
/*
 * socket() handler for the AF_UNIX family.
 *
 * Rejects any non-zero protocol other than PF_UNIX with -EPROTONOSUPPORT,
 * marks the socket unconnected, and selects the proto_ops table from
 * sock->type (stream, datagram or seqpacket).  One branch rewrites
 * sock->type to SOCK_DGRAM; the adjacent comment refers to BSD's
 * AF_UNIX/SOCK_RAW.  Unsupported types yield -ESOCKTNOSUPPORT.  Finally the
 * sock itself is created with unix_create1(); failure there is reported as
 * -ENOMEM.
 * NOTE(review): the case labels of the switch are on lines not visible here.
 */
803 static int unix_create(struct net *net, struct socket *sock, int protocol,
806 if (protocol && protocol != PF_UNIX)
807 return -EPROTONOSUPPORT;
809 sock->state = SS_UNCONNECTED;
811 switch (sock->type) {
813 sock->ops = &unix_stream_ops;
816 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
820 sock->type = SOCK_DGRAM;
823 sock->ops = &unix_dgram_ops;
826 sock->ops = &unix_seqpacket_ops;
829 return -ESOCKTNOSUPPORT;
832 return unix_create1(net, sock, kern) ? 0 : -ENOMEM;
/*
 * release handler shared by all three proto_ops tables: tears the sock down
 * via unix_release_sock() with embrion = 0.
 * NOTE(review): any NULL check on sk, the clearing of sock->sk and the
 * return value are on lines not visible here.
 */
835 static int unix_release(struct socket *sock)
837 struct sock *sk = sock->sk;
842 unix_release_sock(sk, 0);
/*
 * Bind the socket to an automatically chosen abstract name.
 *
 * Runs under the socket's bindlock (taken interruptibly).  A zeroed address
 * is allocated with room for sizeof(short) + 16 name bytes.  The candidate
 * name is written at sun_path + 1 as five hex digits of a static counter,
 * so sun_path[0] stays 0 (abstract namespace).  The counter wraps within 20
 * bits.  Each candidate is looked up under unix_table_lock; if it is taken
 * the lock is dropped and another candidate is tried, giving up after
 * 0xFFFFF retries.  On success the hash is combined with the socket type,
 * and the sock is moved from its old chain to the new one with the address
 * published via smp_store_release().
 * NOTE(review): error codes, the early exit for already-bound sockets and
 * the retry jump are on lines not visible here.
 */
848 static int unix_autobind(struct socket *sock)
850 struct sock *sk = sock->sk;
851 struct net *net = sock_net(sk);
852 struct unix_sock *u = unix_sk(sk);
853 static u32 ordernum = 1;
854 struct unix_address *addr;
856 unsigned int retries = 0;
858 err = mutex_lock_interruptible(&u->bindlock);
867 addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
871 addr->name->sun_family = AF_UNIX;
872 refcount_set(&addr->refcnt, 1);
875 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
876 addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
878 spin_lock(&unix_table_lock);
879 ordernum = (ordernum+1)&0xFFFFF;
881 if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type,
883 spin_unlock(&unix_table_lock);
885 * __unix_find_socket_byname() may take long time if many names
886 * are already in use.
889 /* Give up if all names seems to be in use. */
890 if (retries++ == 0xFFFFF) {
897 addr->hash ^= sk->sk_type;
899 __unix_remove_socket(sk);
900 smp_store_release(&u->addr, addr);
901 __unix_insert_socket(&unix_socket_table[addr->hash], sk);
902 spin_unlock(&unix_table_lock);
905 out: mutex_unlock(&u->bindlock);
/*
 * Resolve a destination address to a sock.
 *
 * @net, @sunname, @len, @type, @hash: namespace, address and lookup key.
 * @error: out-parameter for the failure reason.
 *
 * Path names (sun_path[0] != 0): the path is resolved following symlinks,
 * write permission on the inode is required, the inode must be a socket,
 * and the sock is found by inode; its type is then compared against @type.
 * Abstract names: the sock is found by name and, if it has a path dentry,
 * that path's access time is touched.
 * NOTE(review): error codes, reference handling and the return statements
 * are on lines not visible here.
 */
909 static struct sock *unix_find_other(struct net *net,
910 struct sockaddr_un *sunname, int len,
911 int type, unsigned int hash, int *error)
917 if (sunname->sun_path[0]) {
919 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
922 inode = d_backing_inode(path.dentry);
923 err = inode_permission(inode, MAY_WRITE);
928 if (!S_ISSOCK(inode->i_mode))
930 u = unix_find_socket_byinode(inode);
934 if (u->sk_type == type)
940 if (u->sk_type != type) {
946 u = unix_find_socket_byname(net, sunname, len, type, hash);
948 struct dentry *dentry;
949 dentry = unix_sk(u)->path.dentry;
951 touch_atime(&unix_sk(u)->path);
/*
 * Create the filesystem node for a path-bound socket.
 *
 * @sun_path: path to create, resolved relative to AT_FDCWD.
 * @mode:     file mode for the new node.
 * @res:      on success receives referenced mnt/dentry pointers for the
 *            new node.
 *
 * The parent is looked up with kern_path_create(), the LSM hook
 * security_path_mknod() is consulted, and the node is created with
 * vfs_mknod().  done_path_create() releases what kern_path_create()
 * acquired.
 * NOTE(review): the error checks between the steps and the return statement
 * are on lines not visible here.
 */
964 static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
966 struct dentry *dentry;
970 * Get the parent directory, calculate the hash for last
973 dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
974 err = PTR_ERR(dentry);
979 * All right, let's create it.
981 err = security_path_mknod(&path, dentry, mode, 0);
983 err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
985 res->mnt = mntget(path.mnt);
986 res->dentry = dget(dentry);
989 done_path_create(&path, dentry);
/*
 * bind() handler.
 *
 * The address must at least cover sun_family and be AF_UNIX.  A length of
 * exactly sizeof(short) (family only) requests autobind.  Otherwise the
 * name is validated by unix_mkname().  For the filesystem case a socket
 * node is created first, with mode S_IFSOCK | (socket inode mode & ~umask).
 *
 * Under the socket's bindlock a unix_address is allocated and filled in.
 *  - Filesystem branch: addr->hash is set to UNIX_HASH_SIZE and the chain
 *    is chosen from the backing inode number.
 *  - Abstract branch: the name is checked for conflicts under
 *    unix_table_lock (the new address is released on conflict) and the
 *    chain is unix_socket_table[addr->hash].
 * The sock is then re-hashed onto the chosen chain, with the address
 * published via smp_store_release().
 * NOTE(review): error codes, the branch conditions and the cleanup labels
 * are on lines not visible here.
 */
993 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
995 struct sock *sk = sock->sk;
996 struct net *net = sock_net(sk);
997 struct unix_sock *u = unix_sk(sk);
998 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
999 char *sun_path = sunaddr->sun_path;
1002 struct unix_address *addr;
1003 struct hlist_head *list;
1004 struct path path = { };
1007 if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1008 sunaddr->sun_family != AF_UNIX)
1011 if (addr_len == sizeof(short)) {
1012 err = unix_autobind(sock);
1016 err = unix_mkname(sunaddr, addr_len, &hash);
1022 umode_t mode = S_IFSOCK |
1023 (SOCK_INODE(sock)->i_mode & ~current_umask());
1024 err = unix_mknod(sun_path, mode, &path);
1032 err = mutex_lock_interruptible(&u->bindlock);
1041 addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1045 memcpy(addr->name, sunaddr, addr_len);
1046 addr->len = addr_len;
1047 addr->hash = hash ^ sk->sk_type;
1048 refcount_set(&addr->refcnt, 1);
1051 addr->hash = UNIX_HASH_SIZE;
1052 hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1053 spin_lock(&unix_table_lock);
1055 list = &unix_socket_table[hash];
1057 spin_lock(&unix_table_lock);
1059 if (__unix_find_socket_byname(net, sunaddr, addr_len,
1060 sk->sk_type, hash)) {
1061 unix_release_addr(addr);
1065 list = &unix_socket_table[addr->hash];
1069 __unix_remove_socket(sk);
1070 smp_store_release(&u->addr, addr);
1071 __unix_insert_socket(list, sk);
1074 spin_unlock(&unix_table_lock);
1076 mutex_unlock(&u->bindlock);
/*
 * Take the state locks of two socks.  When both arguments are the same sock
 * or sk2 is NULL, only sk1 is locked.  Otherwise both are locked, in one of
 * two orders (sk1 then sk2, or sk2 then sk1), the second lock being taken
 * with the "nested" variant.
 * NOTE(review): the condition choosing between the two orders is on a line
 * not visible here; presumably it is a fixed ordering that avoids ABBA
 * deadlocks -- confirm.
 */
1084 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1086 if (unlikely(sk1 == sk2) || !sk2) {
1087 unix_state_lock(sk1);
1091 unix_state_lock(sk1);
1092 unix_state_lock_nested(sk2);
1094 unix_state_lock(sk2);
1095 unix_state_lock_nested(sk1);
/*
 * Counterpart of unix_state_double_lock(): unlocks only sk1 when both
 * arguments are the same sock or sk2 is NULL, otherwise unlocks both.
 */
1099 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1101 if (unlikely(sk1 == sk2) || !sk2) {
1102 unix_state_unlock(sk1);
1105 unix_state_unlock(sk1);
1106 unix_state_unlock(sk2);
/*
 * connect() handler for datagram sockets.
 *
 * The address must at least cover sa_family.  For a family other than
 * AF_UNSPEC: the name is validated, the socket is autobound first when
 * SOCK_PASSCRED is set and it has no address yet, the destination is looked
 * up, and both state locks are taken.  A destination found to be SOCK_DEAD
 * is unlocked again (see the "Retry" comment).  The destination must allow
 * sending from us (unix_may_send()) and pass the LSM check.  AF_UNSPEC is
 * the "break the association" case per the 1003.1g comment.
 *
 * Peer switch: when sk already had a peer, the new peer is installed, the
 * wake-relay registration on the old peer is dropped (with a wake-up), the
 * locks are released, and if the peer actually changed the old association
 * is cleaned up via unix_dgram_disconnected().  Otherwise the new peer is
 * simply installed.
 * NOTE(review): error codes, reference counting on the peers and the jump
 * labels are on lines not visible here.
 */
1109 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1110 int alen, int flags)
1112 struct sock *sk = sock->sk;
1113 struct net *net = sock_net(sk);
1114 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1120 if (alen < offsetofend(struct sockaddr, sa_family))
1123 if (addr->sa_family != AF_UNSPEC) {
1124 err = unix_mkname(sunaddr, alen, &hash);
1129 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1130 !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1134 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1138 unix_state_double_lock(sk, other);
1140 /* Apparently VFS overslept socket death. Retry. */
1141 if (sock_flag(other, SOCK_DEAD)) {
1142 unix_state_double_unlock(sk, other);
1148 if (!unix_may_send(sk, other))
1151 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1157 * 1003.1g breaking connected state with AF_UNSPEC
1160 unix_state_double_lock(sk, other);
1164 * If it was connected, reconnect.
1166 if (unix_peer(sk)) {
1167 struct sock *old_peer = unix_peer(sk);
1168 unix_peer(sk) = other;
1169 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1171 unix_state_double_unlock(sk, other);
1173 if (other != old_peer)
1174 unix_dgram_disconnected(sk, old_peer);
1177 unix_peer(sk) = other;
1178 unix_state_double_unlock(sk, other);
1183 unix_state_double_unlock(sk, other);
/*
 * Sleep (interruptibly, as an exclusive waiter) on other's peer_wait queue
 * until its receive queue has room.
 *
 * Called with other's state lock held; the lock is released here before
 * sleeping.  The decision to sleep is taken before unlocking: other must
 * not be SOCK_DEAD, must not have RCV_SHUTDOWN set, and its receive queue
 * must be full.  The remaining timeout from schedule_timeout() is kept in
 * timeo; unix_stream_connect() assigns this function's result back to its
 * own timeout.
 * NOTE(review): the "if (sched)" guard and the return statement are on
 * lines not visible here.
 */
1189 static long unix_wait_for_peer(struct sock *other, long timeo)
1191 struct unix_sock *u = unix_sk(other);
1195 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1197 sched = !sock_flag(other, SOCK_DEAD) &&
1198 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1199 unix_recvq_full(other);
1201 unix_state_unlock(other);
1204 timeo = schedule_timeout(timeo);
1206 finish_wait(&u->peer_wait, &wait);
/*
 * connect() handler for stream and seqpacket sockets.
 *
 * Outline of the visible steps:
 *  1. validate the name; autobind first when SOCK_PASSCRED is set and the
 *     socket has no address; compute the send timeout (zero-wait when
 *     O_NONBLOCK is set);
 *  2. pre-allocate the server-side sock (newsk) and a one-byte skb charged
 *     to it, before any state lock is taken;
 *  3. find the listener and lock its state; a SOCK_DEAD listener is
 *     unlocked again (see the "Retry" comment); the listener must be in
 *     TCP_LISTEN without RCV_SHUTDOWN, else -ECONNREFUSED;
 *  4. if the listener's queue is full, wait via unix_wait_for_peer(), which
 *     drops the listener's lock;
 *  5. take our own state lock nested inside the listener's and re-check our
 *     state; run the LSM hook;
 *  6. wire newsk up as the established peer of sk: peer pointers,
 *     credentials, wait queue, and the listener's address/path, published
 *     with smp_store_release();
 *  7. mark sk connected, queue the skb on the listener's receive queue and
 *     signal data_ready so accept() can pick the connection up.
 * On failure the pre-allocated newsk is released with unix_release_sock().
 * NOTE(review): most error codes, the jump labels, the switch on our own
 * state and the reference-count operations are on lines not visible here.
 */
1210 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1211 int addr_len, int flags)
1213 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1214 struct sock *sk = sock->sk;
1215 struct net *net = sock_net(sk);
1216 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1217 struct sock *newsk = NULL;
1218 struct sock *other = NULL;
1219 struct sk_buff *skb = NULL;
1225 err = unix_mkname(sunaddr, addr_len, &hash);
1230 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1231 (err = unix_autobind(sock)) != 0)
1234 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1236 /* First of all allocate resources.
1237 If we will make it after state is locked,
1238 we will have to recheck all again in any case.
1243 /* create new sock for complete connection */
1244 newsk = unix_create1(sock_net(sk), NULL, 0);
1248 /* Allocate skb for sending to listening sock */
1249 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1254 /* Find listening sock. */
1255 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1259 /* Latch state of peer */
1260 unix_state_lock(other);
1262 /* Apparently VFS overslept socket death. Retry. */
1263 if (sock_flag(other, SOCK_DEAD)) {
1264 unix_state_unlock(other);
1269 err = -ECONNREFUSED;
1270 if (other->sk_state != TCP_LISTEN)
1272 if (other->sk_shutdown & RCV_SHUTDOWN)
1275 if (unix_recvq_full(other)) {
1280 timeo = unix_wait_for_peer(other, timeo);
1282 err = sock_intr_errno(timeo);
1283 if (signal_pending(current))
1291 It is tricky place. We need to grab our state lock and cannot
1292 drop lock on peer. It is dangerous because deadlock is
1293 possible. Connect to self case and simultaneous
1294 attempt to connect are eliminated by checking socket
1295 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1296 check this before attempt to grab lock.
1298 Well, and we have to recheck the state after socket locked.
1304 /* This is ok... continue with connect */
1306 case TCP_ESTABLISHED:
1307 /* Socket is already connected */
1315 unix_state_lock_nested(sk);
1317 if (sk->sk_state != st) {
1318 unix_state_unlock(sk);
1319 unix_state_unlock(other);
1324 err = security_unix_stream_connect(sk, other, newsk);
1326 unix_state_unlock(sk);
1330 /* The way is open! Fastly set all the necessary fields... */
1333 unix_peer(newsk) = sk;
1334 newsk->sk_state = TCP_ESTABLISHED;
1335 newsk->sk_type = sk->sk_type;
1336 init_peercred(newsk);
1337 newu = unix_sk(newsk);
1338 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1339 otheru = unix_sk(other);
1341 /* copy address information from listening to new sock
1343 * The contents of *(otheru->addr) and otheru->path
1344 * are seen fully set up here, since we have found
1345 * otheru in hash under unix_table_lock. Insertion
1346 * into the hash chain we'd found it in had been done
1347 * in an earlier critical area protected by unix_table_lock,
1348 * the same one where we'd set *(otheru->addr) contents,
1349 * as well as otheru->path and otheru->addr itself.
1351 * Using smp_store_release() here to set newu->addr
1352 * is enough to make those stores, as well as stores
1353 * to newu->path visible to anyone who gets newu->addr
1354 * by smp_load_acquire(). IOW, the same warranties
1355 * as for unix_sock instances bound in unix_bind() or
1356 * in unix_autobind().
1358 if (otheru->path.dentry) {
1359 path_get(&otheru->path);
1360 newu->path = otheru->path;
1362 refcount_inc(&otheru->addr->refcnt);
1363 smp_store_release(&newu->addr, otheru->addr);
1365 /* Set credentials */
1366 copy_peercred(sk, other);
1368 sock->state = SS_CONNECTED;
1369 sk->sk_state = TCP_ESTABLISHED;
1372 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1373 unix_peer(sk) = newsk;
1375 unix_state_unlock(sk);
1377 /* take ten and send info to listening sock */
1378 spin_lock(&other->sk_receive_queue.lock);
1379 __skb_queue_tail(&other->sk_receive_queue, skb);
1380 spin_unlock(&other->sk_receive_queue.lock);
1381 unix_state_unlock(other);
1382 other->sk_data_ready(other);
1388 unix_state_unlock(other);
1393 unix_release_sock(newsk, 0);
/*
 * socketpair() handler: makes the two socks each other's peer.  For any
 * type other than SOCK_DGRAM both socks are moved to TCP_ESTABLISHED and
 * both struct sockets to SS_CONNECTED.
 * Note that the local variable "skb" here is a struct sock (the "b" side),
 * not an sk_buff.
 * NOTE(review): lines between the visible statements (and the return value)
 * are not shown here.
 */
1399 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1401 struct sock *ska = socka->sk, *skb = sockb->sk;
1403 /* Join our sockets back to back */
1406 unix_peer(ska) = skb;
1407 unix_peer(skb) = ska;
1411 if (ska->sk_type != SOCK_DGRAM) {
1412 ska->sk_state = TCP_ESTABLISHED;
1413 skb->sk_state = TCP_ESTABLISHED;
1414 socka->state = SS_CONNECTED;
1415 sockb->state = SS_CONNECTED;
// unix_sock_inherit_flags - propagate credential-passing flags.
// Sets SOCK_PASSCRED and SOCK_PASSSEC on the new socket when the
// corresponding bit is set on the old one. Bits are only ever set here,
// never cleared.
1420 static void unix_sock_inherit_flags(const struct socket *old,
1423 if (test_bit(SOCK_PASSCRED, &old->flags))
1424 set_bit(SOCK_PASSCRED, &new->flags);
1425 if (test_bit(SOCK_PASSSEC, &old->flags))
1426 set_bit(SOCK_PASSSEC, &new->flags);
// unix_accept - accept a queued connection on a listening AF_UNIX socket.
// The socket type is tested against SOCK_STREAM / SOCK_SEQPACKET and the
// sock state against TCP_LISTEN. One skb is then taken from the listener
// with skb_recv_datagram() (non-blocking when O_NONBLOCK is set in flags),
// freed again, and sleepers on the listener peer_wait queue are woken.
// Finally the accepted sock (tsk) is grafted onto newsock while holding
// its state lock.
// NOTE(review): the assignment of tsk is not visible in this listing;
// presumably it comes from the dequeued skb - confirm.
1429 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1432 struct sock *sk = sock->sk;
1434 struct sk_buff *skb;
1438 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1442 if (sk->sk_state != TCP_LISTEN)
1445 /* If socket state is TCP_LISTEN it cannot change (for now...),
1446 * so that no locks are necessary.
1449 skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1451 /* This means receive shutdown. */
1458 skb_free_datagram(sk, skb);
1459 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1461 /* attach accepted sock to socket */
1462 unix_state_lock(tsk);
1463 newsock->state = SS_CONNECTED;
1464 unix_sock_inherit_flags(sock, newsock);
1465 sock_graft(tsk, newsock);
1466 unix_state_unlock(tsk);
// unix_getname - fill uaddr with the address of this socket or its peer.
// The peer sock is obtained through unix_peer_get(); the bound address is
// read with smp_load_acquire(). Two results are visible: a bare AF_UNIX
// sockaddr with an empty path and length sizeof(short), or a copy of
// addr->name of addr->len bytes.
// NOTE(review): the conditions selecting the peer and the unbound case are
// not visible here; presumably they test the peer argument and a NULL addr.
1474 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1476 struct sock *sk = sock->sk;
1477 struct unix_address *addr;
1478 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1482 sk = unix_peer_get(sk);
1492 addr = smp_load_acquire(&unix_sk(sk)->addr);
1494 sunaddr->sun_family = AF_UNIX;
1495 sunaddr->sun_path[0] = 0;
1496 err = sizeof(short);
1499 memcpy(sunaddr, addr->name, addr->len);
// unix_scm_to_skb - copy control-message state from scm into the skb.
// Stores pid (through get_pid()), uid and gid in UNIXCB(skb), clears the
// fp pointer, records security data with unix_get_secdata(), and attaches
// the passed file descriptors only when scm->fp is set and send_fds is
// true. The skb destructor is set to unix_destruct_scm.
1506 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1510 UNIXCB(skb).pid = get_pid(scm->pid);
1511 UNIXCB(skb).uid = scm->creds.uid;
1512 UNIXCB(skb).gid = scm->creds.gid;
1513 UNIXCB(skb).fp = NULL;
1514 unix_get_secdata(scm, skb);
1515 if (scm->fp && send_fds)
1516 err = unix_attach_fds(scm, skb);
1518 skb->destructor = unix_destruct_scm;
// unix_passcred_enabled - should credentials accompany a message?
// True when SOCK_PASSCRED is set on the sending socket, when the
// destination sock has no struct socket attached (sk_socket is NULL), or
// when SOCK_PASSCRED is set on the destination socket.
1522 static bool unix_passcred_enabled(const struct socket *sock,
1523 const struct sock *other)
1525 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1526 !other->sk_socket ||
1527 test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
// maybe_add_creds - stamp the skb with the credentials of the current task.
// Tests whether UNIXCB(skb).pid is already set (the statement guarded by
// that test is not visible in this listing). When
// unix_passcred_enabled() holds, the pid is set to the thread group id of
// current and uid/gid are filled from current_uid_gid().
1531 * Some apps rely on write() giving SCM_CREDENTIALS
1532 * We include credentials if source or destination socket
1533 * asserted SOCK_PASSCRED.
1535 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1536 const struct sock *other)
1538 if (UNIXCB(skb).pid)
1540 if (unix_passcred_enabled(sock, other)) {
1541 UNIXCB(skb).pid = get_pid(task_tgid(current));
1542 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
// maybe_init_creds - initialise an scm cookie for a send without cmsgs.
// Runs scm_send() on a local msghdr whose msg_controllen is 0, then, when
// unix_passcred_enabled() holds, fills scm with the thread group id and
// uid/gid of the current task.
1546 static int maybe_init_creds(struct scm_cookie *scm,
1547 struct socket *socket,
1548 const struct sock *other)
1551 struct msghdr msg = { .msg_controllen = 0 };
1553 err = scm_send(socket, &msg, scm, false);
1557 if (unix_passcred_enabled(socket, other)) {
1558 scm->pid = get_pid(task_tgid(current));
1559 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
// unix_skb_scm_eq - compare the credentials stored in an skb with scm.
// Returns true only when pid pointer, uid, gid and security data
// (unix_secdata_eq()) all match.
1564 static bool unix_skb_scm_eq(struct sk_buff *skb,
1565 struct scm_cookie *scm)
1567 const struct unix_skb_parms *u = &UNIXCB(skb);
1569 return u->pid == scm->pid &&
1570 uid_eq(u->uid, scm->creds.uid) &&
1571 gid_eq(u->gid, scm->creds.gid) &&
1572 unix_secdata_eq(scm, skb);
// unix_dgram_sendmsg - send one datagram on an AF_UNIX socket.
// Visible steps, in order:
//  - parse control data with scm_send() and test for MSG_OOB;
//  - with a destination name, validate it via unix_mkname(); the connected
//    peer is fetched with unix_peer_get();
//  - autobind the sender when SOCK_PASSCRED is set and it has no address;
//  - test len against sk_sndbuf - 32, then allocate an skb, placing the
//    part beyond SKB_MAX_ALLOC in page fragments;
//  - attach scm data (fds included) and copy the payload from msg_iter;
//  - look up the receiver by name with unix_find_other(), run sk_filter(),
//    and under the receiver state lock check permission, SOCK_DEAD,
//    RCV_SHUTDOWN and receive queue fullness;
//  - queue the skb on the receiver and call its sk_data_ready().
// NOTE(review): labels, gotos and return statements are not visible in
// this listing, so the exact error paths cannot be described from here.
1576 * Send AF_UNIX data.
1579 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1582 struct sock *sk = sock->sk;
1583 struct net *net = sock_net(sk);
1584 struct unix_sock *u = unix_sk(sk);
1585 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1586 struct sock *other = NULL;
1587 int namelen = 0; /* fake GCC */
1590 struct sk_buff *skb;
1592 struct scm_cookie scm;
1597 err = scm_send(sock, msg, &scm, false);
1602 if (msg->msg_flags&MSG_OOB)
1605 if (msg->msg_namelen) {
1606 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1613 other = unix_peer_get(sk);
1618 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1619 && (err = unix_autobind(sock)) != 0)
1623 if (len > sk->sk_sndbuf - 32)
// Payload beyond SKB_MAX_ALLOC goes into page fragments, capped at
// MAX_SKB_FRAGS pages and rounded up to a whole page.
1626 if (len > SKB_MAX_ALLOC) {
1627 data_len = min_t(size_t,
1628 len - SKB_MAX_ALLOC,
1629 MAX_SKB_FRAGS * PAGE_SIZE);
1630 data_len = PAGE_ALIGN(data_len);
1632 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1635 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1636 msg->msg_flags & MSG_DONTWAIT, &err,
1637 PAGE_ALLOC_COSTLY_ORDER);
1641 err = unix_scm_to_skb(&scm, skb, true);
1645 skb_put(skb, len - data_len);
1646 skb->data_len = data_len;
1648 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1652 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1657 if (sunaddr == NULL)
1660 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1666 if (sk_filter(other, skb) < 0) {
1667 /* Toss the packet but do not return any error to the sender */
1673 unix_state_lock(other);
1676 if (!unix_may_send(sk, other))
// Receiver is dead: if it is still recorded as our peer, drop the link
// and report ECONNREFUSED.
1679 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1681 * Check with 1003.1g - what should
1684 unix_state_unlock(other);
1688 unix_state_lock(sk);
1691 if (unix_peer(sk) == other) {
1692 unix_peer(sk) = NULL;
1693 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1695 unix_state_unlock(sk);
1697 unix_dgram_disconnected(sk, other);
1699 err = -ECONNREFUSED;
1701 unix_state_unlock(sk);
1711 if (other->sk_shutdown & RCV_SHUTDOWN)
1714 if (sk->sk_type != SOCK_SEQPACKET) {
1715 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1720 /* other == sk && unix_peer(other) != sk if
1721 * - unix_peer(sk) == NULL, destination address bound to sk
1722 * - unix_peer(sk) == sk by time of get but disconnected before lock
1725 unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
1727 timeo = unix_wait_for_peer(other, timeo);
1729 err = sock_intr_errno(timeo);
1730 if (signal_pending(current))
1737 unix_state_unlock(other);
1738 unix_state_double_lock(sk, other);
1741 if (unix_peer(sk) != other ||
1742 unix_dgram_peer_wake_me(sk, other)) {
1750 goto restart_locked;
1754 if (unlikely(sk_locked))
1755 unix_state_unlock(sk);
1757 if (sock_flag(other, SOCK_RCVTSTAMP))
1758 __net_timestamp(skb);
1759 maybe_add_creds(skb, sock, other);
1760 skb_queue_tail(&other->sk_receive_queue, skb);
1761 unix_state_unlock(other);
1762 other->sk_data_ready(other);
1769 unix_state_unlock(sk);
1770 unix_state_unlock(other);
// UNIX_SKB_FRAGS_SZ expands to PAGE_SIZE << get_order(32768).
1780 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1781 * bytes, and a minimum of a full page.
1783 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
// unix_stream_sendmsg - send data on a connected AF_UNIX stream socket.
// After scm_send() and an MSG_OOB test, a destination name yields EISCONN
// on an established socket and EOPNOTSUPP otherwise; the receiver is
// taken from unix_peer(sk). The payload is sent in a loop
// (while sent < len): each chunk is capped at half of sk_sndbuf minus 64
// and at SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ, copied into a new skb, and
// queued on the peer under its state lock unless the peer is dead or has
// RCV_SHUTDOWN set. File descriptors travel with the first skb only.
// SIGPIPE is raised on a path where nothing was sent and MSG_NOSIGNAL is
// clear. The result is the byte count sent, or err when that is zero.
// NOTE(review): labels and gotos are not visible in this listing.
1785 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1788 struct sock *sk = sock->sk;
1789 struct sock *other = NULL;
1791 struct sk_buff *skb;
1793 struct scm_cookie scm;
1794 bool fds_sent = false;
1798 err = scm_send(sock, msg, &scm, false);
1803 if (msg->msg_flags&MSG_OOB)
1806 if (msg->msg_namelen) {
1807 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1811 other = unix_peer(sk);
1816 if (sk->sk_shutdown & SEND_SHUTDOWN)
1819 while (sent < len) {
1822 /* Keep two messages in the pipe so it schedules better */
1823 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
1825 /* allow fallback to order-0 allocations */
1826 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
// Whatever does not fit in the linear head goes into page fragments.
1828 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
1830 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
1832 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
1833 msg->msg_flags & MSG_DONTWAIT, &err,
1834 get_order(UNIX_SKB_FRAGS_SZ));
1838 /* Only send the fds in the first buffer */
1839 err = unix_scm_to_skb(&scm, skb, !fds_sent);
1846 skb_put(skb, size - data_len);
1847 skb->data_len = data_len;
1849 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
1855 unix_state_lock(other);
1857 if (sock_flag(other, SOCK_DEAD) ||
1858 (other->sk_shutdown & RCV_SHUTDOWN))
1861 maybe_add_creds(skb, sock, other);
1862 skb_queue_tail(&other->sk_receive_queue, skb);
1863 unix_state_unlock(other);
1864 other->sk_data_ready(other);
1873 unix_state_unlock(other);
1876 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
1877 send_sig(SIGPIPE, current, 0);
1881 return sent ? : err;
// unix_stream_sendpage - append a page to the peer receive queue.
// Requires a peer and TCP_ESTABLISHED state. Under the peer iolock and
// state lock the tail skb of the peer receive queue is examined: the page
// is appended to it with skb_append_pagefrags() when the tail exists and
// its credentials equal scm; otherwise a separately allocated empty skb
// (newskb) is involved and is later queued. On a successful append
// data_len, truesize and sk_wmem_alloc are each increased by size.
// send_sigpipe is set when the sender has SEND_SHUTDOWN or the peer is
// dead / has RCV_SHUTDOWN; SIGPIPE is then raised unless MSG_NOSIGNAL.
// NOTE(review): labels, gotos and several statements are not visible in
// this listing; the retry flow around newskb cannot be confirmed here.
1884 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
1885 int offset, size_t size, int flags)
1888 bool send_sigpipe = false;
1889 bool init_scm = true;
1890 struct scm_cookie scm;
1891 struct sock *other, *sk = socket->sk;
1892 struct sk_buff *skb, *newskb = NULL, *tail = NULL;
1894 if (flags & MSG_OOB)
1897 other = unix_peer(sk);
1898 if (!other || sk->sk_state != TCP_ESTABLISHED)
1903 unix_state_unlock(other);
1904 mutex_unlock(&unix_sk(other)->iolock);
1905 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
1911 /* we must acquire iolock as we modify already present
1912 * skbs in the sk_receive_queue and mess with skb->len
1914 err = mutex_lock_interruptible(&unix_sk(other)->iolock);
1916 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
1920 if (sk->sk_shutdown & SEND_SHUTDOWN) {
1922 send_sigpipe = true;
1926 unix_state_lock(other);
1928 if (sock_flag(other, SOCK_DEAD) ||
1929 other->sk_shutdown & RCV_SHUTDOWN) {
1931 send_sigpipe = true;
1932 goto err_state_unlock;
1936 err = maybe_init_creds(&scm, socket, other);
1938 goto err_state_unlock;
1942 skb = skb_peek_tail(&other->sk_receive_queue);
1943 if (tail && tail == skb) {
1945 } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
1952 } else if (newskb) {
1953 /* this is fast path, we don't necessarily need to
1954 * call to kfree_skb even though with newskb == NULL
1955 * this - does no harm
1957 consume_skb(newskb);
1961 if (skb_append_pagefrags(skb, page, offset, size)) {
1967 skb->data_len += size;
1968 skb->truesize += size;
1969 refcount_add(size, &sk->sk_wmem_alloc);
1972 err = unix_scm_to_skb(&scm, skb, false);
1974 goto err_state_unlock;
1975 spin_lock(&other->sk_receive_queue.lock);
1976 __skb_queue_tail(&other->sk_receive_queue, newskb);
1977 spin_unlock(&other->sk_receive_queue.lock);
1980 unix_state_unlock(other);
1981 mutex_unlock(&unix_sk(other)->iolock);
1983 other->sk_data_ready(other);
1988 unix_state_unlock(other);
1990 mutex_unlock(&unix_sk(other)->iolock);
1993 if (send_sigpipe && !(flags & MSG_NOSIGNAL))
1994 send_sig(SIGPIPE, current, 0);
// unix_seqpacket_sendmsg - SOCK_SEQPACKET send entry point.
// Fetches any pending socket error with sock_error(), tests for
// TCP_ESTABLISHED, clears a caller supplied destination name length and
// then delegates to unix_dgram_sendmsg().
2000 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2004 struct sock *sk = sock->sk;
2006 err = sock_error(sk);
2010 if (sk->sk_state != TCP_ESTABLISHED)
// Any destination address is ignored: the name length is zeroed.
2013 if (msg->msg_namelen)
2014 msg->msg_namelen = 0;
2016 return unix_dgram_sendmsg(sock, msg, len);
// unix_seqpacket_recvmsg - SOCK_SEQPACKET receive entry point.
// Tests for TCP_ESTABLISHED and otherwise delegates to
// unix_dgram_recvmsg() with unchanged arguments.
2019 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2020 size_t size, int flags)
2022 struct sock *sk = sock->sk;
2024 if (sk->sk_state != TCP_ESTABLISHED)
2027 return unix_dgram_recvmsg(sock, msg, size, flags);
// unix_copy_addr - copy the bound address of sk into msg->msg_name.
// The address is read with smp_load_acquire(); msg_namelen is set to
// addr->len and that many bytes are copied.
// NOTE(review): a guard for a NULL addr is not visible in this listing.
2030 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2032 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2035 msg->msg_namelen = addr->len;
2036 memcpy(msg->msg_name, addr->name, addr->len);
// unix_dgram_recvmsg - receive one datagram from an AF_UNIX socket.
// Under u->iolock a datagram is fetched with __skb_try_recv_datagram()
// using the current peek offset; between attempts the iolock is dropped
// and __skb_wait_for_more_packets() is used to wait. With no skb, a
// SEQPACKET socket that got EAGAIN and has RCV_SHUTDOWN set is treated
// as EOF (see the in-line comment). With an skb: sleepers on peer_wait
// are woken, the sender address is copied, the payload is copied from
// offset skip (MSG_TRUNC is set when the buffer is smaller than the
// remaining data), a receive timestamp is added when SOCK_RCVTSTAMP is
// set, and credentials and security data are placed in scm. Without
// MSG_PEEK the fds are detached and the peek offset moved back by
// skb->len; the sk_peek_offset_fwd() and scm_fp_dup() calls appear to
// belong to the PEEK case. err becomes skb->len - skip when MSG_TRUNC
// is in flags, else size.
// NOTE(review): loop structure, labels and returns are not visible here.
2040 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
2041 size_t size, int flags)
2043 struct scm_cookie scm;
2044 struct sock *sk = sock->sk;
2045 struct unix_sock *u = unix_sk(sk);
2046 struct sk_buff *skb, *last;
2055 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2058 mutex_lock(&u->iolock);
2060 skip = sk_peek_offset(sk, flags);
2061 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2062 NULL, &skip, &err, &last);
2066 mutex_unlock(&u->iolock);
2071 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2072 &err, &timeo, last));
2074 if (!skb) { /* implies iolock unlocked */
2075 unix_state_lock(sk);
2076 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2077 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2078 (sk->sk_shutdown & RCV_SHUTDOWN))
2080 unix_state_unlock(sk);
2084 if (wq_has_sleeper(&u->peer_wait))
2085 wake_up_interruptible_sync_poll(&u->peer_wait,
2086 EPOLLOUT | EPOLLWRNORM |
2090 unix_copy_addr(msg, skb->sk);
// Clamp the copy to the data remaining after skip; flag truncation when
// the caller buffer is the smaller of the two.
2092 if (size > skb->len - skip)
2093 size = skb->len - skip;
2094 else if (size < skb->len - skip)
2095 msg->msg_flags |= MSG_TRUNC;
2097 err = skb_copy_datagram_msg(skb, skip, msg, size);
2101 if (sock_flag(sk, SOCK_RCVTSTAMP))
2102 __sock_recv_timestamp(msg, sk, skb);
2104 memset(&scm, 0, sizeof(scm));
2106 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2107 unix_set_secdata(&scm, skb);
2109 if (!(flags & MSG_PEEK)) {
2111 unix_detach_fds(&scm, skb);
2113 sk_peek_offset_bwd(sk, skb->len);
2115 /* It is questionable: on PEEK we could:
2116 - do not return fds - good, but too simple 8)
2117 - return fds, and do not return them on read (old strategy,
2119 - clone fds (I chose it for now, it is the most universal
2122 POSIX 1003.1g does not actually define this clearly
2123 at all. POSIX 1003.1g doesn't define a lot of things
2128 sk_peek_offset_fwd(sk, size);
2131 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2133 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2135 scm_recv(sock, msg, &scm, flags);
2138 skb_free_datagram(sk, skb);
2139 mutex_unlock(&u->iolock);
// unix_stream_data_wait - sleep until the stream receive queue changes.
// With the state lock held the task is put on sk_sleep(sk) in
// TASK_INTERRUPTIBLE state and the queue tail is sampled. The visible
// wake conditions are: the tail length differs from last_len,
// RCV_SHUTDOWN is set, or a signal is pending. Otherwise
// SOCKWQ_ASYNC_WAITDATA is set, the lock is dropped, and the task sleeps
// through freezable_schedule_timeout() or schedule_timeout(); the flag
// is cleared again after waking. SOCK_DEAD is tested after each wakeup.
// NOTE(review): the loop construct, the condition choosing between the
// two schedule calls, and the return statement are not visible here.
2145 * Sleep until more data has arrived. But check for races..
2147 static long unix_stream_data_wait(struct sock *sk, long timeo,
2148 struct sk_buff *last, unsigned int last_len,
2151 struct sk_buff *tail;
2154 unix_state_lock(sk);
2157 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2159 tail = skb_peek_tail(&sk->sk_receive_queue);
2161 (tail && tail->len != last_len) ||
2163 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2164 signal_pending(current) ||
2168 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2169 unix_state_unlock(sk);
2171 timeo = freezable_schedule_timeout(timeo);
2173 timeo = schedule_timeout(timeo);
2174 unix_state_lock(sk);
2176 if (sock_flag(sk, SOCK_DEAD))
2179 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2182 finish_wait(sk_sleep(sk), &wait);
2183 unix_state_unlock(sk);
// unix_skb_len - bytes of an skb not yet consumed by a stream reader,
// computed as skb->len minus UNIXCB(skb).consumed.
2187 static unsigned int unix_skb_len(const struct sk_buff *skb)
2189 return skb->len - UNIXCB(skb).consumed;
// struct unix_stream_read_state - parameters for unix_stream_read_generic().
// recv_actor is invoked there as recv_actor(skb, skip, chunk, state) and
// its result is used as the chunk actually handled. socket is the socket
// being read; pipe and splice_flags are used by the splice actor.
// NOTE(review): further members (msg, size, flags are referenced by the
// users of this struct) are not visible in this listing.
2192 struct unix_stream_read_state {
2193 int (*recv_actor)(struct sk_buff *, int, int,
2194 struct unix_stream_read_state *);
2195 struct socket *socket;
2197 struct pipe_inode_info *pipe;
2200 unsigned int splice_flags;
// unix_stream_read_generic - common receive loop for stream reads.
// Used by both the recvmsg and the splice paths; the data sink is the
// recv_actor callback in state. Visible behaviour:
//  - tests for TCP_ESTABLISHED and for MSG_OOB up front;
//  - computes the low-water target and the receive timeout;
//  - holds u->iolock around queue processing and drops it while waiting
//    in unix_stream_data_wait();
//  - starts at the peek offset (clamped to >= 0) and walks past skbs
//    that the offset fully covers;
//  - compares skb credentials with scm via unix_skb_scm_eq() so data from
//    different writers is not merged, or records them when
//    SOCK_PASSCRED is set;
//  - copies the sender address when the caller supplied msg_name;
//  - hands min(unconsumed - skip, size) bytes to recv_actor;
//  - without MSG_PEEK advances UNIXCB(skb).consumed, moves the peek
//    offset back, detaches fds and unlinks the skb; the scm_fp_dup() and
//    sk_peek_offset_fwd() calls appear to belong to the PEEK case.
// Returns the number of bytes copied, or err when nothing was copied.
// NOTE(review): loop constructs, labels, gotos and many declarations are
// not visible in this listing; statement grouping above is approximate.
2203 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2206 struct scm_cookie scm;
2207 struct socket *sock = state->socket;
2208 struct sock *sk = sock->sk;
2209 struct unix_sock *u = unix_sk(sk);
2211 int flags = state->flags;
2212 int noblock = flags & MSG_DONTWAIT;
2213 bool check_creds = false;
2218 size_t size = state->size;
2219 unsigned int last_len;
2221 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2226 if (unlikely(flags & MSG_OOB)) {
2231 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2232 timeo = sock_rcvtimeo(sk, noblock);
2234 memset(&scm, 0, sizeof(scm));
2236 /* Lock the socket to prevent queue disordering
2237 * while sleeps in memcpy_tomsg
2239 mutex_lock(&u->iolock);
2241 skip = max(sk_peek_offset(sk, flags), 0);
2246 struct sk_buff *skb, *last;
2249 unix_state_lock(sk);
2250 if (sock_flag(sk, SOCK_DEAD)) {
2254 last = skb = skb_peek(&sk->sk_receive_queue);
2255 last_len = last ? last->len : 0;
2258 if (copied >= target)
2262 * POSIX 1003.1g mandates this order.
2265 err = sock_error(sk);
2268 if (sk->sk_shutdown & RCV_SHUTDOWN)
2271 unix_state_unlock(sk);
2277 mutex_unlock(&u->iolock);
2279 timeo = unix_stream_data_wait(sk, timeo, last,
2280 last_len, freezable);
2282 if (signal_pending(current)) {
2283 err = sock_intr_errno(timeo);
2288 mutex_lock(&u->iolock);
2291 unix_state_unlock(sk);
2295 while (skip >= unix_skb_len(skb)) {
2296 skip -= unix_skb_len(skb);
2298 last_len = skb->len;
2299 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2304 unix_state_unlock(sk);
2307 /* Never glue messages from different writers */
2308 if (!unix_skb_scm_eq(skb, &scm))
2310 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2311 /* Copy credentials */
2312 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2313 unix_set_secdata(&scm, skb);
2317 /* Copy address just once */
2318 if (state->msg && state->msg->msg_name) {
2319 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2320 state->msg->msg_name);
2321 unix_copy_addr(state->msg, skb->sk);
2325 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2327 chunk = state->recv_actor(skb, skip, chunk, state);
2328 drop_skb = !unix_skb_len(skb);
2329 /* skb is only safe to use if !drop_skb */
2340 /* the skb was touched by a concurrent reader;
2341 * we should not expect anything from this skb
2342 * anymore and assume it invalid - we can be
2343 * sure it was dropped from the socket queue
2345 * let's report a short read
2351 /* Mark read part of skb as used */
2352 if (!(flags & MSG_PEEK)) {
2353 UNIXCB(skb).consumed += chunk;
2355 sk_peek_offset_bwd(sk, chunk);
2358 unix_detach_fds(&scm, skb);
2360 if (unix_skb_len(skb))
2363 skb_unlink(skb, &sk->sk_receive_queue);
2369 /* It is questionable, see note in unix_dgram_recvmsg.
2372 scm.fp = scm_fp_dup(UNIXCB(skb).fp);
2374 sk_peek_offset_fwd(sk, chunk);
2381 last_len = skb->len;
2382 unix_state_lock(sk);
2383 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2386 unix_state_unlock(sk);
2391 mutex_unlock(&u->iolock);
2393 scm_recv(sock, state->msg, &scm, flags);
2397 return copied ? : err;
// unix_stream_read_actor - recv_actor used for recvmsg().
// Copies from the skb starting at the already consumed offset plus skip
// using skb_copy_datagram_msg(). Returns the copy result when it is
// non-zero, otherwise chunk.
2400 static int unix_stream_read_actor(struct sk_buff *skb,
2401 int skip, int chunk,
2402 struct unix_stream_read_state *state)
2406 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2408 return ret ?: chunk;
// unix_stream_recvmsg - recvmsg() entry point for stream sockets.
// Builds a read state with unix_stream_read_actor as the data sink and
// runs unix_stream_read_generic() with its second argument set to true.
2411 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2412 size_t size, int flags)
2414 struct unix_stream_read_state state = {
2415 .recv_actor = unix_stream_read_actor,
2422 return unix_stream_read_generic(&state, true);
// unix_stream_splice_actor - recv_actor used for splice reads.
// Passes chunk bytes of the skb, starting at the already consumed offset
// plus skip, to skb_splice_bits() with the pipe and splice flags taken
// from state, and returns that call's result.
2425 static int unix_stream_splice_actor(struct sk_buff *skb,
2426 int skip, int chunk,
2427 struct unix_stream_read_state *state)
2429 return skb_splice_bits(skb, state->socket->sk,
2430 UNIXCB(skb).consumed + skip,
2431 state->pipe, chunk, state->splice_flags);
// unix_stream_splice_read - splice data from a stream socket into a pipe.
// Builds a read state with unix_stream_splice_actor as the data sink and
// tests for a non-zero *ppos. The read is made non-blocking
// (MSG_DONTWAIT) when the file has O_NONBLOCK or the caller passed
// SPLICE_F_NONBLOCK, then unix_stream_read_generic() is run with its
// second argument set to false.
2434 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2435 struct pipe_inode_info *pipe,
2436 size_t size, unsigned int flags)
2438 struct unix_stream_read_state state = {
2439 .recv_actor = unix_stream_splice_actor,
2443 .splice_flags = flags,
2446 if (unlikely(*ppos))
2449 if (sock->file->f_flags & O_NONBLOCK ||
2450 flags & SPLICE_F_NONBLOCK)
2451 state.flags = MSG_DONTWAIT;
2453 return unix_stream_read_generic(&state, false);
// unix_shutdown - shut down one or both directions of a socket.
// mode is range checked against SHUT_RD..SHUT_RDWR, ORed into
// sk->sk_shutdown under the state lock, and sk_state_change() is called.
// For SOCK_STREAM and SOCK_SEQPACKET the peer gets the mirrored bits
// (our RCV_SHUTDOWN becomes its SEND_SHUTDOWN and vice versa), its
// sk_state_change() is called, and async waiters are notified with
// POLL_HUP for a full shutdown or POLL_IN when RCV_SHUTDOWN is included.
// NOTE(review): the statement that converts SHUT_* into the bit values
// listed in the mapping comment is not visible in this listing.
2456 static int unix_shutdown(struct socket *sock, int mode)
2458 struct sock *sk = sock->sk;
2461 if (mode < SHUT_RD || mode > SHUT_RDWR)
2464 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2465 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2466 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2470 unix_state_lock(sk);
2471 sk->sk_shutdown |= mode;
2472 other = unix_peer(sk);
2475 unix_state_unlock(sk);
2476 sk->sk_state_change(sk);
2479 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
// Mirror the directions for the peer: our receive side is its send side.
2483 if (mode&RCV_SHUTDOWN)
2484 peer_mode |= SEND_SHUTDOWN;
2485 if (mode&SEND_SHUTDOWN)
2486 peer_mode |= RCV_SHUTDOWN;
2487 unix_state_lock(other);
2488 other->sk_shutdown |= peer_mode;
2489 unix_state_unlock(other);
2490 other->sk_state_change(other);
2491 if (peer_mode == SHUTDOWN_MASK)
2492 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2493 else if (peer_mode & RCV_SHUTDOWN)
2494 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
// unix_inq_len - amount of data queued for reading on sk (exported, GPL).
// A TCP_LISTEN socket is tested for first. Under the receive queue
// spinlock, SOCK_STREAM and SOCK_SEQPACKET sockets sum the unconsumed
// length of every queued skb; other types only peek at the first skb.
// NOTE(review): the value used for listening sockets and for the
// first-skb case is not visible in this listing.
2502 long unix_inq_len(struct sock *sk)
2504 struct sk_buff *skb;
2507 if (sk->sk_state == TCP_LISTEN)
2510 spin_lock(&sk->sk_receive_queue.lock);
2511 if (sk->sk_type == SOCK_STREAM ||
2512 sk->sk_type == SOCK_SEQPACKET) {
2513 skb_queue_walk(&sk->sk_receive_queue, skb)
2514 amount += unix_skb_len(skb);
2516 skb = skb_peek(&sk->sk_receive_queue);
2520 spin_unlock(&sk->sk_receive_queue.lock);
2524 EXPORT_SYMBOL_GPL(unix_inq_len);
// unix_outq_len - returns sk_wmem_alloc_get(sk) (exported, GPL).
2526 long unix_outq_len(struct sock *sk)
2528 return sk_wmem_alloc_get(sk);
2530 EXPORT_SYMBOL_GPL(unix_outq_len);
// unix_open_file - open the filesystem object a socket is bound to.
// Tests CAP_NET_ADMIN in the user namespace of the socket network
// namespace and tests that an address is set (acquire load). The socket
// path is copied, a close-on-exec descriptor is reserved, and the path is
// opened with O_PATH using the credentials of the current task.
// NOTE(review): error returns and the final fd installation are not
// visible in this listing.
2532 static int unix_open_file(struct sock *sk)
2538 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2541 if (!smp_load_acquire(&unix_sk(sk)->addr))
2544 path = unix_sk(sk)->path;
2550 fd = get_unused_fd_flags(O_CLOEXEC);
2554 f = dentry_open(&path, O_PATH, current_cred());
// unix_ioctl - ioctl handler for AF_UNIX sockets.
// Three actions are visible: report unix_outq_len() to user space as an
// int, report unix_inq_len() to user space as an int, and call
// unix_open_file().
// NOTE(review): the switch statement and its case labels are not visible
// in this listing, so the command numbers cannot be named from here.
2568 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2570 struct sock *sk = sock->sk;
2576 amount = unix_outq_len(sk);
2577 err = put_user(amount, (int __user *)arg);
2580 amount = unix_inq_len(sk);
2584 err = put_user(amount, (int __user *)arg);
2587 err = unix_open_file(sk);
2596 #ifdef CONFIG_COMPAT
// unix_compat_ioctl - compat ioctl entry point.
// Converts arg with compat_ptr() and forwards to unix_ioctl().
2597 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2599 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
// unix_poll - poll handler computing the event mask for sk.
// After registering on the wait queue with sock_poll_wait() the visible
// tests are: full shutdown (SHUTDOWN_MASK); RCV_SHUTDOWN, which adds
// EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; a non-empty receive queue, which
// adds EPOLLIN | EPOLLRDNORM; TCP_CLOSE state on STREAM / SEQPACKET
// sockets; and unix_writable(), which adds the three write bits.
// NOTE(review): the bits added for the error, full-shutdown and
// TCP_CLOSE cases are not visible in this listing.
2603 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
2605 struct sock *sk = sock->sk;
2608 sock_poll_wait(file, sock, wait);
2611 /* exceptional events? */
2614 if (sk->sk_shutdown == SHUTDOWN_MASK)
2616 if (sk->sk_shutdown & RCV_SHUTDOWN)
2617 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2620 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2621 mask |= EPOLLIN | EPOLLRDNORM;
2623 /* Connection-based need to check for termination and startup */
2624 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
2625 sk->sk_state == TCP_CLOSE)
2629 * we set writable also when the other side has shut down the
2630 * connection. This prevents stuck sockets.
2632 if (unix_writable(sk))
2633 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
// unix_dgram_poll - poll handler for datagram and seqpacket sockets.
// Read side: tests sk_err and the error queue (EPOLLPRI is included when
// SOCK_SELECT_ERR_QUEUE is set), RCV_SHUTDOWN, full shutdown, and a
// non-empty receive queue; SEQPACKET sockets are also tested for
// TCP_CLOSE and TCP_SYN_SENT.
// Write side: evaluated only when the caller asked for a write event.
// Starts from unix_writable(sk); under the state lock, a peer that is not
// connected back to us and whose receive queue is full is tested with
// unix_dgram_peer_wake_me(). Either the three write bits are added or
// SOCKWQ_ASYNC_NOSPACE is set on sk.
// NOTE(review): several assignments and the return are not visible in
// this listing, including how the peer test changes writable.
2638 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
2641 struct sock *sk = sock->sk, *other;
2642 unsigned int writable;
2645 sock_poll_wait(file, sock, wait);
2648 /* exceptional events? */
2649 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
2651 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
2653 if (sk->sk_shutdown & RCV_SHUTDOWN)
2654 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
2655 if (sk->sk_shutdown == SHUTDOWN_MASK)
2659 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
2660 mask |= EPOLLIN | EPOLLRDNORM;
2662 /* Connection-based need to check for termination and startup */
2663 if (sk->sk_type == SOCK_SEQPACKET) {
2664 if (sk->sk_state == TCP_CLOSE)
2666 /* connection hasn't started yet? */
2667 if (sk->sk_state == TCP_SYN_SENT)
2671 /* No write status requested, avoid expensive OUT tests. */
2672 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
2675 writable = unix_writable(sk);
2677 unix_state_lock(sk);
2679 other = unix_peer(sk);
2680 if (other && unix_peer(other) != sk &&
2681 unix_recvq_full(other) &&
2682 unix_dgram_peer_wake_me(sk, other))
2685 unix_state_unlock(sk);
2689 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
2691 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2696 #ifdef CONFIG_PROC_FS
// Helpers that pack a hash bucket index and an in-bucket offset into one
// seq_file position value. The low BUCKET_SPACE bits hold the offset and
// the bits above them hold the bucket number.
2698 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
2700 #define get_bucket(x) ((x) >> BUCKET_SPACE)
2701 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
2702 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
// unix_from_bucket - locate a socket by position inside one hash bucket.
// Decodes bucket and offset from *pos, walks that bucket of
// unix_socket_table, compares each sock network namespace with that of
// the seq_file, and counts entries until the count equals offset.
// NOTE(review): the statements taken on a namespace mismatch and on a
// match, and the return value, are not visible in this listing.
2704 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
2706 unsigned long offset = get_offset(*pos);
2707 unsigned long bucket = get_bucket(*pos);
2709 unsigned long count = 0;
2711 for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
2712 if (sock_net(sk) != seq_file_net(seq))
2714 if (++count == offset)
// unix_next_socket - advance the /proc iteration to the next socket.
// While sk is a real sock pointer (greater than SEQ_START_TOKEN) a loop
// runs that tests the network namespace against the seq_file. After that
// unix_from_bucket() is used, and *pos is moved to offset 1 of the
// following bucket, repeating while bucket indices remain inside
// unix_socket_table.
// NOTE(review): loop bodies and returns are only partly visible here.
2721 static struct sock *unix_next_socket(struct seq_file *seq,
2725 unsigned long bucket;
2727 while (sk > (struct sock *)SEQ_START_TOKEN) {
2731 if (sock_net(sk) == seq_file_net(seq))
2736 sk = unix_from_bucket(seq, pos);
2741 bucket = get_bucket(*pos) + 1;
2742 *pos = set_bucket_offset(bucket, 1);
2743 } while (bucket < ARRAY_SIZE(unix_socket_table));
// unix_seq_start - seq_file start callback.
// Takes unix_table_lock (annotated __acquires). Visible results are
// SEQ_START_TOKEN, a test for a bucket index beyond unix_socket_table,
// and otherwise the socket found by unix_next_socket().
// NOTE(review): the condition for returning SEQ_START_TOKEN is not
// visible in this listing; presumably it is a zero position.
2748 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
2749 __acquires(unix_table_lock)
2751 spin_lock(&unix_table_lock);
2754 return SEQ_START_TOKEN;
2756 if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
2759 return unix_next_socket(seq, NULL, pos);
// unix_seq_next - seq_file next callback; returns the result of
// unix_next_socket() for the current element v.
2762 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2765 return unix_next_socket(seq, v, pos);
// unix_seq_stop - seq_file stop callback; releases unix_table_lock.
2768 static void unix_seq_stop(struct seq_file *seq, void *v)
2769 __releases(unix_table_lock)
2771 spin_unlock(&unix_table_lock);
// unix_seq_show - seq_file show callback emitting one line per socket.
// For SEQ_START_TOKEN a column header is written. For a socket the line
// carries, among other fields, the reference count, __SO_ACCEPTCON when
// the sock is in TCP_LISTEN, and a socket state derived from sk_state.
// When an address is set its path bytes are appended one character at a
// time, followed by a newline.
// NOTE(review): several printf arguments, the handling that depends on
// UNIX_ABSTRACT(), and the replacement used for zero path bytes are not
// visible in this listing.
2774 static int unix_seq_show(struct seq_file *seq, void *v)
2777 if (v == SEQ_START_TOKEN)
2778 seq_puts(seq, "Num RefCount Protocol Flags Type St "
2782 struct unix_sock *u = unix_sk(s);
2785 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
2787 refcount_read(&s->sk_refcnt),
2789 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
2792 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
2793 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
2796 if (u->addr) { // under unix_table_lock here
// The stored length includes the address family field; subtract it to
// get the number of path bytes.
2801 len = u->addr->len - sizeof(short);
2802 if (!UNIX_ABSTRACT(s))
2808 for ( ; i < len; i++)
2809 seq_putc(seq, u->addr->name->sun_path[i] ?:
2812 unix_state_unlock(s);
2813 seq_putc(seq, '\n');
// seq_file operations table wiring the four unix_seq_* callbacks.
2819 static const struct seq_operations unix_seq_ops = {
2820 .start = unix_seq_start,
2821 .next = unix_seq_next,
2822 .stop = unix_seq_stop,
2823 .show = unix_seq_show,
// Protocol family descriptor handed to sock_register(); socket creation
// is routed to unix_create and the owner is this module.
// NOTE(review): the .family member is not visible in this listing.
2827 static const struct net_proto_family unix_family_ops = {
2829 .create = unix_create,
2830 .owner = THIS_MODULE,
// unix_net_init - per network namespace setup.
// Sets the default sysctl_max_dgram_qlen to 10, registers the sysctl
// table, and with CONFIG_PROC_FS creates the "unix" entry under the
// namespace proc_net directory. If the proc entry cannot be created the
// sysctl registration is undone. error starts as -ENOMEM.
2834 static int __net_init unix_net_init(struct net *net)
2836 int error = -ENOMEM;
2838 net->unx.sysctl_max_dgram_qlen = 10;
2839 if (unix_sysctl_register(net))
2842 #ifdef CONFIG_PROC_FS
2843 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
2844 sizeof(struct seq_net_private))) {
2845 unix_sysctl_unregister(net);
// unix_net_exit - per network namespace teardown: unregisters the sysctl
// table and removes the "unix" proc entry.
2854 static void __net_exit unix_net_exit(struct net *net)
2856 unix_sysctl_unregister(net);
2857 remove_proc_entry("unix", net->proc_net);
// Per network namespace operations: init and exit hooks defined above.
2860 static struct pernet_operations unix_net_ops = {
2861 .init = unix_net_init,
2862 .exit = unix_net_exit,
// af_unix_init - module / boot time initialisation.
// Asserts at build time that struct unix_skb_parms fits in the skb cb
// area, registers unix_proto, then registers the socket family and the
// per network namespace operations. A critical message is logged on the
// proto_register() failure path.
// NOTE(review): in the visible lines the results of sock_register() and
// register_pernet_subsys() are not checked.
2865 static int __init af_unix_init(void)
2869 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > FIELD_SIZEOF(struct sk_buff, cb));
2871 rc = proto_register(&unix_proto, 1);
2873 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
2877 sock_register(&unix_family_ops);
2878 register_pernet_subsys(&unix_net_ops);
2885 sock_unregister(PF_UNIX);
2886 proto_unregister(&unix_proto);
2887 unregister_pernet_subsys(&unix_net_ops);
// Module boilerplate: af_unix_init runs at fs_initcall level,
// af_unix_exit is the module exit hook, the licence is GPL and the
// module is aliased to the PF_UNIX protocol family.
2890 /* Earlier than device_initcall() so that other drivers invoking
2891 request_module() don't end up in a loop when modprobe tries
2892 to use a UNIX socket. But later than subsys_initcall() because
2893 we depend on stuff initialised there */
2894 fs_initcall(af_unix_init);
2895 module_exit(af_unix_exit);
2897 MODULE_LICENSE("GPL");
2898 MODULE_ALIAS_NETPROTO(PF_UNIX);