1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * NET4: Implementation of BSD Unix domain sockets.
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko EiBfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it will avoid huge amount
34 * of socks hashed (this for unix_gc()
35 * performances reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skb queueable in the
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
48 * Known differences from reference BSD that was tested:
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, and give the blksize as high water mark
54 * and a fake inode identifier (nor the BSD first socket fstat twice bug).
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix apparently has connect forgetting to block properly.
62 * (need to check this with the POSIX spec in detail)
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/termios.h>
93 #include <linux/sockios.h>
94 #include <linux/net.h>
97 #include <linux/slab.h>
98 #include <linux/uaccess.h>
99 #include <linux/skbuff.h>
100 #include <linux/netdevice.h>
101 #include <net/net_namespace.h>
102 #include <net/sock.h>
103 #include <net/tcp_states.h>
104 #include <net/af_unix.h>
105 #include <linux/proc_fs.h>
106 #include <linux/seq_file.h>
108 #include <linux/init.h>
109 #include <linux/poll.h>
110 #include <linux/rtnetlink.h>
111 #include <linux/mount.h>
112 #include <net/checksum.h>
113 #include <linux/security.h>
114 #include <linux/freezer.h>
115 #include <linux/file.h>
116 #include <linux/btf_ids.h>
/* Global hash table of every AF_UNIX socket: the first UNIX_HASH_SIZE
 * chains hold bound sockets, the second half holds unbound ones (see
 * unix_sockets_unbound() below). Writers hold unix_table_lock.
 * unix_nr_socks counts live unix sockets system-wide.
 */
120 struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
121 EXPORT_SYMBOL_GPL(unix_socket_table);
122 DEFINE_SPINLOCK(unix_table_lock);
123 EXPORT_SYMBOL_GPL(unix_table_lock);
124 static atomic_long_t unix_nr_socks;
/* Map an arbitrary kernel pointer onto a chain in the "unbound" half of
 * unix_socket_table (slots UNIX_HASH_SIZE .. 2*UNIX_HASH_SIZE-1).
 * NOTE(review): braces and intermediate hash-mixing lines are missing
 * from this extract of the file — do not edit without the full source.
 */
127 static struct hlist_head *unix_sockets_unbound(void *addr)
129 unsigned long hash = (unsigned long)addr;
133 hash %= UNIX_HASH_SIZE;
134 return &unix_socket_table[UNIX_HASH_SIZE + hash];
/* A socket is "abstract" (not filesystem based) iff its address hashed
 * into the first (bound) half of the table. */
137 #define UNIX_ABSTRACT(sk) (unix_sk(sk)->addr->hash < UNIX_HASH_SIZE)
139 #ifdef CONFIG_SECURITY_NETWORK
/* Copy the sender's LSM security ID from the scm cookie into the skb
 * control block so it travels with the message. */
140 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
142 UNIXCB(skb).secid = scm->secid;
/* Recover the security ID from a received skb into the scm cookie. */
145 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
147 scm->secid = UNIXCB(skb).secid;
/* True when the skb carries the same security ID as the scm cookie. */
150 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
152 return (scm->secid == UNIXCB(skb).secid);
/* !CONFIG_SECURITY_NETWORK: no-op stubs (their empty bodies and the
 * unconditionally-true unix_secdata_eq return are elided in this extract). */
155 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
158 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
161 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
165 #endif /* CONFIG_SECURITY_NETWORK */
168 * SMP locking strategy:
169 * hash table is protected with spinlock unix_table_lock
170 * each socket state is protected by separate spin lock.
/* Fold a 32-bit partial checksum down to an index in [0, UNIX_HASH_SIZE).
 * NOTE(review): the intermediate fold/xor steps are missing from this
 * extract of the file. */
173 static inline unsigned int unix_hash_fold(__wsum n)
175 unsigned int hash = (__force unsigned int)csum_fold(n);
178 return hash&(UNIX_HASH_SIZE-1);
/* Peer pointer of a connected unix socket. */
181 #define unix_peer(sk) (unix_sk(sk)->peer)
/* True if osk regards sk as its peer. */
183 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
185 return unix_peer(osk) == sk;
/* sk may send to osk when osk is unconnected or connected back to sk. */
188 static inline int unix_may_send(struct sock *sk, struct sock *osk)
190 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
/* Receive-queue backlog check; relies on the caller's locking for a
 * stable read of queue length and sk_max_ack_backlog. */
193 static inline int unix_recvq_full(const struct sock *sk)
195 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
/* Lockless variant for paths that cannot take the state lock: pairs
 * skb_queue_len_lockless() with READ_ONCE() to tolerate concurrent
 * updates of the backlog limit. */
198 static inline int unix_recvq_full_lockless(const struct sock *sk)
200 return skb_queue_len_lockless(&sk->sk_receive_queue) >
201 READ_ONCE(sk->sk_max_ack_backlog);
/* Return s's peer with a reference held (NULL if unconnected).
 * NOTE(review): the state-lock acquisition and sock_hold()/return body
 * lines are missing from this extract. */
204 struct sock *unix_peer_get(struct sock *s)
212 unix_state_unlock(s);
215 EXPORT_SYMBOL_GPL(unix_peer_get);
/* Drop one reference on a unix_address, freeing it on the final put
 * (the kfree() line is elided in this extract). */
217 static inline void unix_release_addr(struct unix_address *addr)
219 if (refcount_dec_and_test(&addr->refcnt))
224 * Check unix socket name:
225 * - should be not zero length.
226 * - if started by not zero, should be NULL terminated (FS object)
227 * - if started by zero, it is abstract name.
/* Validate and canonicalize a sockaddr_un of length len. Filesystem
 * names (sun_path[0] != 0) are NUL-terminated in place and len is
 * recomputed from the string; abstract names (leading 0) keep the
 * caller-supplied length. On success *hashp receives the folded
 * checksum of the name. NOTE(review): the error-return lines for the
 * two validation checks are elided in this extract. */
230 static int unix_mkname(struct sockaddr_un *sunaddr, int len, unsigned int *hashp)
234 if (len <= sizeof(short) || len > sizeof(*sunaddr))
236 if (!sunaddr || sunaddr->sun_family != AF_UNIX)
238 if (sunaddr->sun_path[0]) {
240 * This may look like an off by one error but it is a bit more
241 * subtle. 108 is the longest valid AF_UNIX path for a binding.
242 * sun_path[108] doesn't as such exist. However in kernel space
243 * we are guaranteed that it is a valid memory location in our
244 * kernel address buffer.
246 ((char *)sunaddr)[len] = 0;
247 len = strlen(sunaddr->sun_path)+1+sizeof(short);
251 *hashp = unix_hash_fold(csum_partial(sunaddr, len, 0));
/* Unhash sk from whatever chain it is on; caller holds unix_table_lock. */
255 static void __unix_remove_socket(struct sock *sk)
257 sk_del_node_init(sk);
/* Add an unhashed sk to the given chain; caller holds unix_table_lock. */
260 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
262 WARN_ON(!sk_unhashed(sk));
263 sk_add_node(sk, list);
/* Publish a new address and rehash sk under unix_table_lock. The
 * smp_store_release() pairs with smp_load_acquire() readers of u->addr
 * so they see a fully initialized unix_address. */
266 static void __unix_set_addr(struct sock *sk, struct unix_address *addr,
269 __unix_remove_socket(sk);
270 smp_store_release(&unix_sk(sk)->addr, addr);
271 __unix_insert_socket(&unix_socket_table[hash], sk);
/* Locked wrappers around the two helpers above. */
274 static inline void unix_remove_socket(struct sock *sk)
276 spin_lock(&unix_table_lock);
277 __unix_remove_socket(sk);
278 spin_unlock(&unix_table_lock);
281 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
283 spin_lock(&unix_table_lock);
284 __unix_insert_socket(list, sk);
285 spin_unlock(&unix_table_lock);
/* Find a bound socket by (net, name, len) on the given hash chain;
 * caller holds unix_table_lock. NOTE(review): the found/not-found
 * return statements are elided in this extract. */
288 static struct sock *__unix_find_socket_byname(struct net *net,
289 struct sockaddr_un *sunname,
290 int len, unsigned int hash)
294 sk_for_each(s, &unix_socket_table[hash]) {
295 struct unix_sock *u = unix_sk(s);
297 if (!net_eq(sock_net(s), net))
300 if (u->addr->len == len &&
301 !memcmp(u->addr->name, sunname, len))
/* Locked lookup by name; on success a reference is taken on the result
 * (the sock_hold() line is elided in this extract). */
307 static inline struct sock *unix_find_socket_byname(struct net *net,
308 struct sockaddr_un *sunname,
309 int len, unsigned int hash)
313 spin_lock(&unix_table_lock);
314 s = __unix_find_socket_byname(net, sunname, len, hash);
317 spin_unlock(&unix_table_lock);
/* Find a filesystem-bound socket whose dentry backs inode i; chain is
 * selected by i_ino. NOTE(review): iteration header, sock_hold() and
 * return lines are elided in this extract. */
321 static struct sock *unix_find_socket_byinode(struct inode *i)
325 spin_lock(&unix_table_lock);
327 &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
328 struct dentry *dentry = unix_sk(s)->path.dentry;
330 if (dentry && d_backing_inode(dentry) == i) {
337 spin_unlock(&unix_table_lock);
341 /* Support code for asymmetrically connected dgram sockets
343 * If a datagram socket is connected to a socket not itself connected
344 * to the first socket (eg, /dev/log), clients may only enqueue more
345 * messages if the present receive queue of the server socket is not
346 * "too large". This means there's a second writeability condition
347 * poll and sendmsg need to test. The dgram recv code will do a wake
348 * up on the peer_wait wait queue of a socket upon reception of a
349 * datagram which needs to be propagated to sleeping would-be writers
350 * since these might not have sent anything so far. This can't be
351 * accomplished via poll_wait because the lifetime of the server
352 * socket might be less than that of its clients if these break their
353 * association with it or if the server socket is closed while clients
354 * are still connected to it and there's no way to inform "a polling
355 * implementation" that it should let go of a certain wait queue
357 * In order to propagate a wake up, a wait_queue_entry_t of the client
358 * socket is enqueued on the peer_wait queue of the server socket
359 * whose wake function does a wake_up on the ordinary client socket
360 * wait queue. This connection is established whenever a write (or
361 * poll for write) hit the flow control condition and broken when the
362 * association to the server socket is dissolved or after a wake up
/* Wait-queue callback installed on a server socket's peer_wait queue:
 * detaches itself and relays the wakeup to the client socket's own wait
 * queue (see the "asymmetrically connected dgram" comment above). */
366 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
370 wait_queue_head_t *u_sleep;
372 u = container_of(q, struct unix_sock, peer_wake);
374 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
376 u->peer_wake.private = NULL;
378 /* relaying can only happen while the wq still exists */
379 u_sleep = sk_sleep(&u->sk);
381 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
/* Enqueue sk's relay entry on other's peer_wait queue, at most once;
 * peer_wake.private records which peer we are attached to. */
386 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
388 struct unix_sock *u, *u_other;
392 u_other = unix_sk(other);
394 spin_lock(&u_other->peer_wait.lock);
396 if (!u->peer_wake.private) {
397 u->peer_wake.private = other;
398 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
403 spin_unlock(&u_other->peer_wait.lock);
/* Remove sk's relay entry from other's peer_wait queue if present. */
407 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
410 struct unix_sock *u, *u_other;
413 u_other = unix_sk(other);
414 spin_lock(&u_other->peer_wait.lock);
416 if (u->peer_wake.private == other) {
417 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
418 u->peer_wake.private = NULL;
421 spin_unlock(&u_other->peer_wait.lock);
/* Disconnect the relay, then wake any writer sleeping on sk itself. */
424 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
427 unix_dgram_peer_wake_disconnect(sk, other);
428 wake_up_interruptible_poll(sk_sleep(sk),
/* Preconditions (per the partially-elided comment): */
435 * - unix_peer(sk) == other
436 * - association is stable
/* Register sk for a wakeup from other; returns nonzero if the caller
 * should sleep. NOTE(review): several return paths are elided here. */
438 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
442 connected = unix_dgram_peer_wake_connect(sk, other);
444 /* If other is SOCK_DEAD, we want to make sure we signal
445 * POLLOUT, such that a subsequent write() can get a
446 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
447 * to other and its full, we will hang waiting for POLLOUT.
449 if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD))
453 unix_dgram_peer_wake_disconnect(sk, other);
/* Writable when not listening and outstanding wmem is at most a quarter
 * of sndbuf (the << 2 compares 4*wmem_alloc against sk_sndbuf). */
458 static int unix_writable(const struct sock *sk)
460 return sk->sk_state != TCP_LISTEN &&
461 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
/* sk_write_space callback: wake poll()ers and send SIGIO once the
 * socket becomes writable. NOTE(review): the surrounding
 * rcu_read_lock/unlock lines are elided in this extract. */
464 static void unix_write_space(struct sock *sk)
466 struct socket_wq *wq;
469 if (unix_writable(sk)) {
470 wq = rcu_dereference(sk->sk_wq);
471 if (skwq_has_sleeper(wq))
472 wake_up_interruptible_sync_poll(&wq->wait,
473 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
474 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
479 /* When dgram socket disconnects (or changes its peer), we clear its receive
480 * queue of packets arrived from previous peer. First, it allows to do
481 * flow control based only on wmem_alloc; second, sk connected to peer
482 * may receive messages only from that peer. */
483 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
485 if (!skb_queue_empty(&sk->sk_receive_queue)) {
486 skb_queue_purge(&sk->sk_receive_queue);
/* Purging freed wmem on the old peer; wake its would-be writers. */
487 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
489 /* If one link of bidirectional dgram pipe is disconnected,
490 * we signal error. Messages are lost. Do not make this,
491 * when peer was not connected to us.
493 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
494 other->sk_err = ECONNRESET;
495 sk_error_report(other);
498 other->sk_state = TCP_CLOSE;
/* sk_destruct callback: final teardown when the last sock reference is
 * dropped. Purges queued skbs, frees any pending OOB skb, sanity-checks
 * that the socket is fully quiesced, releases the bound address and
 * updates the global socket accounting. */
501 static void unix_sock_destructor(struct sock *sk)
503 struct unix_sock *u = unix_sk(sk);
505 skb_queue_purge(&sk->sk_receive_queue);
507 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
509 kfree_skb(u->oob_skb);
/* A destructed socket must have no wmem, be unhashed and detached. */
513 WARN_ON(refcount_read(&sk->sk_wmem_alloc));
514 WARN_ON(!sk_unhashed(sk));
515 WARN_ON(sk->sk_socket);
516 if (!sock_flag(sk, SOCK_DEAD)) {
517 pr_info("Attempt to release alive unix socket: %p\n", sk);
522 unix_release_addr(u->addr);
524 atomic_long_dec(&unix_nr_socks);
526 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
528 #ifdef UNIX_REFCNT_DEBUG
529 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
530 atomic_long_read(&unix_nr_socks));
/* Core of close(): unhash sk, mark it shut down and closed, sever the
 * peer link, notify/poke the peer, flush the receive queue and finally
 * kick the fd garbage collector. embrion is nonzero for a never-accepted
 * (embryonic) child being torn down with its listener. NOTE(review):
 * lock/unlock pairs, sock_orphan/sock_put and several other lines are
 * elided in this extract. */
534 static void unix_release_sock(struct sock *sk, int embrion)
536 struct unix_sock *u = unix_sk(sk);
542 unix_remove_socket(sk);
547 sk->sk_shutdown = SHUTDOWN_MASK;
549 u->path.dentry = NULL;
551 state = sk->sk_state;
552 sk->sk_state = TCP_CLOSE;
554 skpair = unix_peer(sk);
555 unix_peer(sk) = NULL;
557 unix_state_unlock(sk);
559 wake_up_interruptible_all(&u->peer_wait);
561 if (skpair != NULL) {
562 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
563 unix_state_lock(skpair);
/* Stream/seqpacket peer sees HUP; unread data or an embryonic
 * teardown additionally surfaces as ECONNRESET. */
565 skpair->sk_shutdown = SHUTDOWN_MASK;
566 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
567 skpair->sk_err = ECONNRESET;
568 unix_state_unlock(skpair);
569 skpair->sk_state_change(skpair);
570 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
573 unix_dgram_peer_wake_disconnect(sk, skpair);
574 sock_put(skpair); /* It may now die */
577 /* Try to flush out this socket. Throw out buffers at least */
579 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
/* A listener's queue holds embryonic child sockets, not data. */
580 if (state == TCP_LISTEN)
581 unix_release_sock(skb->sk, 1);
582 /* passed fds are erased in the kfree_skb hook */
583 UNIXCB(skb).consumed = skb->len;
592 /* ---- Socket is dead now and most probably destroyed ---- */
595 * Fixme: BSD difference: In BSD all sockets connected to us get
596 * ECONNRESET and we die on the spot. In Linux we behave
597 * like files and pipes do and wait for the last
600 * Can't we simply set sock->err?
602 * What the above comment does talk about? --ANK(980817)
605 if (unix_tot_inflight)
606 unix_gc(); /* Garbage collect fds */
/* Record the current task's tgid and credentials as sk's peer creds
 * (for SO_PEERCRED), dropping any previously stored references. */
609 static void init_peercred(struct sock *sk)
611 put_pid(sk->sk_peer_pid);
612 if (sk->sk_peer_cred)
613 put_cred(sk->sk_peer_cred);
614 sk->sk_peer_pid = get_pid(task_tgid(current));
615 sk->sk_peer_cred = get_current_cred();
/* Copy peer credentials from peersk to sk (e.g. listener to the
 * accepted child), taking new references and dropping the old ones. */
618 static void copy_peercred(struct sock *sk, struct sock *peersk)
620 put_pid(sk->sk_peer_pid);
621 if (sk->sk_peer_cred)
622 put_cred(sk->sk_peer_cred);
623 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
624 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
/* listen(2) for stream/seqpacket sockets: requires a bound socket in
 * CLOSE or LISTEN state; raising the backlog wakes blocked connectors.
 * NOTE(review): error assignments, the state-lock acquisition and the
 * init_peercred() call are elided in this extract. */
627 static int unix_listen(struct socket *sock, int backlog)
630 struct sock *sk = sock->sk;
631 struct unix_sock *u = unix_sk(sk);
634 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
635 goto out; /* Only stream/seqpacket sockets accept */
638 goto out; /* No listens on an unbound socket */
640 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
/* A larger backlog may unblock connectors waiting in unix_wait_for_peer. */
642 if (backlog > sk->sk_max_ack_backlog)
643 wake_up_interruptible_all(&u->peer_wait);
644 sk->sk_max_ack_backlog = backlog;
645 sk->sk_state = TCP_LISTEN;
646 /* set credentials so connect can copy them */
651 unix_state_unlock(sk);
656 static int unix_release(struct socket *);
657 static int unix_bind(struct socket *, struct sockaddr *, int);
658 static int unix_stream_connect(struct socket *, struct sockaddr *,
659 int addr_len, int flags);
660 static int unix_socketpair(struct socket *, struct socket *);
661 static int unix_accept(struct socket *, struct socket *, int, bool);
662 static int unix_getname(struct socket *, struct sockaddr *, int);
663 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
664 static __poll_t unix_dgram_poll(struct file *, struct socket *,
666 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
668 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
670 static int unix_shutdown(struct socket *, int);
671 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
672 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
673 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
674 size_t size, int flags);
675 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
676 struct pipe_inode_info *, size_t size,
678 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
679 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
680 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
681 sk_read_actor_t recv_actor);
682 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
683 sk_read_actor_t recv_actor);
684 static int unix_dgram_connect(struct socket *, struct sockaddr *,
686 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
687 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
/* SO_PEEK_OFF setter: takes u->iolock so the peek offset cannot change
 * under a concurrent recv; interruptible lock so the syscall can be
 * aborted by a signal (error return line elided in this extract). */
690 static int unix_set_peek_off(struct sock *sk, int val)
692 struct unix_sock *u = unix_sk(sk);
694 if (mutex_lock_interruptible(&u->iolock))
697 sk->sk_peek_off = val;
698 mutex_unlock(&u->iolock);
703 #ifdef CONFIG_PROC_FS
/* /proc/<pid>/fdinfo hook: report how many SCM_RIGHTS fds are queued
 * on this socket (scm_stat.nr_fds). */
704 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
706 struct sock *sk = sock->sk;
710 u = unix_sk(sock->sk);
711 seq_printf(m, "scm_fds: %u\n",
712 atomic_read(&u->scm_stat.nr_fds));
/* Without procfs there is nothing to show. */
716 #define unix_show_fdinfo NULL
/* proto_ops for SOCK_STREAM AF_UNIX sockets. */
719 static const struct proto_ops unix_stream_ops = {
721 .owner = THIS_MODULE,
722 .release = unix_release,
724 .connect = unix_stream_connect,
725 .socketpair = unix_socketpair,
726 .accept = unix_accept,
727 .getname = unix_getname,
731 .compat_ioctl = unix_compat_ioctl,
733 .listen = unix_listen,
734 .shutdown = unix_shutdown,
735 .sendmsg = unix_stream_sendmsg,
736 .recvmsg = unix_stream_recvmsg,
737 .read_sock = unix_stream_read_sock,
738 .mmap = sock_no_mmap,
739 .sendpage = unix_stream_sendpage,
740 .splice_read = unix_stream_splice_read,
741 .set_peek_off = unix_set_peek_off,
742 .show_fdinfo = unix_show_fdinfo,
/* proto_ops for SOCK_DGRAM: no accept/listen, dgram send/recv paths. */
745 static const struct proto_ops unix_dgram_ops = {
747 .owner = THIS_MODULE,
748 .release = unix_release,
750 .connect = unix_dgram_connect,
751 .socketpair = unix_socketpair,
752 .accept = sock_no_accept,
753 .getname = unix_getname,
754 .poll = unix_dgram_poll,
757 .compat_ioctl = unix_compat_ioctl,
759 .listen = sock_no_listen,
760 .shutdown = unix_shutdown,
761 .sendmsg = unix_dgram_sendmsg,
762 .read_sock = unix_read_sock,
763 .recvmsg = unix_dgram_recvmsg,
764 .mmap = sock_no_mmap,
765 .sendpage = sock_no_sendpage,
766 .set_peek_off = unix_set_peek_off,
767 .show_fdinfo = unix_show_fdinfo,
/* proto_ops for SOCK_SEQPACKET: connection-oriented like stream, but
 * message-boundary-preserving send/recv and dgram-style poll. */
770 static const struct proto_ops unix_seqpacket_ops = {
772 .owner = THIS_MODULE,
773 .release = unix_release,
775 .connect = unix_stream_connect,
776 .socketpair = unix_socketpair,
777 .accept = unix_accept,
778 .getname = unix_getname,
779 .poll = unix_dgram_poll,
782 .compat_ioctl = unix_compat_ioctl,
784 .listen = unix_listen,
785 .shutdown = unix_shutdown,
786 .sendmsg = unix_seqpacket_sendmsg,
787 .recvmsg = unix_seqpacket_recvmsg,
788 .mmap = sock_no_mmap,
789 .sendpage = sock_no_sendpage,
790 .set_peek_off = unix_set_peek_off,
791 .show_fdinfo = unix_show_fdinfo,
/* Deliberately empty sk_prot->close: sockmap requires a non-NULL hook. */
794 static void unix_close(struct sock *sk, long timeout)
796 /* Nothing to do here, unix socket does not need a ->close().
797 * This is merely for sockmap.
/* Deliberately empty sk_prot->unhash, likewise only for sockmap. */
801 static void unix_unhash(struct sock *sk)
803 /* Nothing to do here, unix socket does not need a ->unhash().
804 * This is merely for sockmap.
/* struct proto for dgram (and seqpacket) unix sockets; BPF sockmap may
 * swap in its own sk_prot via psock_update_sk_prot. */
808 struct proto unix_dgram_proto = {
809 .name = "UNIX-DGRAM",
810 .owner = THIS_MODULE,
811 .obj_size = sizeof(struct unix_sock),
813 #ifdef CONFIG_BPF_SYSCALL
814 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
/* struct proto for stream unix sockets. */
818 struct proto unix_stream_proto = {
819 .name = "UNIX-STREAM",
820 .owner = THIS_MODULE,
821 .obj_size = sizeof(struct unix_sock),
823 .unhash = unix_unhash,
824 #ifdef CONFIG_BPF_SYSCALL
825 .psock_update_sk_prot = unix_stream_bpf_update_proto,
/* Allocate and initialize one unix sock of the given type, enforcing
 * the global 2*max_files socket limit, and hash it onto the unbound
 * chain. Returns the new sock (error paths and the final return are
 * partly elided in this extract). */
829 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
831 struct sock *sk = NULL;
834 atomic_long_inc(&unix_nr_socks);
835 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files())
/* Stream sockets get their own proto so sockmap can hook them. */
838 if (type == SOCK_STREAM)
839 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
840 else /*dgram and seqpacket */
841 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
846 sock_init_data(sock, sk);
848 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
849 sk->sk_write_space = unix_write_space;
/* For dgram sockets the ack backlog doubles as the receive-queue cap. */
850 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
851 sk->sk_destruct = unix_sock_destructor;
853 u->path.dentry = NULL;
855 spin_lock_init(&u->lock);
856 atomic_long_set(&u->inflight, 0);
857 INIT_LIST_HEAD(&u->link);
858 mutex_init(&u->iolock); /* single task reading lock */
859 mutex_init(&u->bindlock); /* single task binding lock */
860 init_waitqueue_head(&u->peer_wait);
861 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
862 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
863 unix_insert_socket(unix_sockets_unbound(sk), sk);
/* Failure path: undo the optimistic counter bump. */
866 atomic_long_dec(&unix_nr_socks);
869 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
/* PF_UNIX socket(2) entry point: validate protocol, pick the proto_ops
 * table for the requested type (SOCK_RAW is silently treated as
 * SOCK_DGRAM, matching BSD) and allocate the sock. */
875 static int unix_create(struct net *net, struct socket *sock, int protocol,
878 if (protocol && protocol != PF_UNIX)
879 return -EPROTONOSUPPORT;
881 sock->state = SS_UNCONNECTED;
883 switch (sock->type) {
885 sock->ops = &unix_stream_ops;
888 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
892 sock->type = SOCK_DGRAM;
895 sock->ops = &unix_dgram_ops;
898 sock->ops = &unix_seqpacket_ops;
901 return -ESOCKTNOSUPPORT;
904 return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM;
/* release(2) hook: run the (sockmap-supplied or no-op) sk_prot close,
 * then the real teardown in unix_release_sock() with embrion == 0. */
907 static int unix_release(struct socket *sock)
909 struct sock *sk = sock->sk;
914 sk->sk_prot->close(sk, 0);
915 unix_release_sock(sk, 0);
/* Autobind an unbound socket to a fresh abstract name of the form
 * "\0XXXXX" (5 hex digits from the rotating ordernum), retrying until
 * an unused name is found or the whole 20-bit space is exhausted.
 * Serialized against explicit bind by u->bindlock. NOTE(review): the
 * already-bound check, retry loop edges and error returns are elided
 * in this extract. */
921 static int unix_autobind(struct socket *sock)
923 struct sock *sk = sock->sk;
924 struct net *net = sock_net(sk);
925 struct unix_sock *u = unix_sk(sk);
926 static u32 ordernum = 1;
927 struct unix_address *addr;
929 unsigned int retries = 0;
931 err = mutex_lock_interruptible(&u->bindlock);
/* sizeof(short) family + '\0' + 5 hex digits fits in the +16 slack. */
939 addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
943 addr->name->sun_family = AF_UNIX;
944 refcount_set(&addr->refcnt, 1);
947 addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
948 addr->hash = unix_hash_fold(csum_partial(addr->name, addr->len, 0));
949 addr->hash ^= sk->sk_type;
951 spin_lock(&unix_table_lock);
952 ordernum = (ordernum+1)&0xFFFFF;
954 if (__unix_find_socket_byname(net, addr->name, addr->len, addr->hash)) {
955 spin_unlock(&unix_table_lock);
957 * __unix_find_socket_byname() may take long time if many names
958 * are already in use.
961 /* Give up if all names seems to be in use. */
962 if (retries++ == 0xFFFFF) {
970 __unix_set_addr(sk, addr, addr->hash);
971 spin_unlock(&unix_table_lock);
974 out: mutex_unlock(&u->bindlock);
/* Resolve a sockaddr_un to the target socket: filesystem names go via
 * kern_path()/inode lookup with a MAY_WRITE permission check; abstract
 * names go via the name hash. Verifies the socket type matches and
 * touches the atime of filesystem sockets. On failure *error is set
 * (most error/goto lines are elided in this extract). */
978 static struct sock *unix_find_other(struct net *net,
979 struct sockaddr_un *sunname, int len,
980 int type, unsigned int hash, int *error)
986 if (sunname->sun_path[0]) {
988 err = kern_path(sunname->sun_path, LOOKUP_FOLLOW, &path);
991 inode = d_backing_inode(path.dentry);
992 err = path_permission(&path, MAY_WRITE);
997 if (!S_ISSOCK(inode->i_mode))
999 u = unix_find_socket_byinode(inode);
1003 if (u->sk_type == type)
1009 if (u->sk_type != type) {
/* Abstract lookup: no filesystem object, ECONNREFUSED if absent. */
1014 err = -ECONNREFUSED;
1015 u = unix_find_socket_byname(net, sunname, len, type ^ hash);
1017 struct dentry *dentry;
1018 dentry = unix_sk(u)->path.dentry;
1020 touch_atime(&unix_sk(u)->path);
/* Bind to a filesystem path: mknod an S_IFSOCK inode (mode masked by
 * the caller's umask), then, under u->bindlock and unix_table_lock,
 * attach the path to the socket and rehash it by inode number. On a
 * post-mknod failure the freshly created node is unlinked again.
 * NOTE(review): several error-check/goto lines are elided in this
 * extract. */
1033 static int unix_bind_bsd(struct sock *sk, struct unix_address *addr)
1035 struct unix_sock *u = unix_sk(sk);
1036 umode_t mode = S_IFSOCK |
1037 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1038 struct user_namespace *ns; // barf...
1040 struct dentry *dentry;
1045 * Get the parent directory, calculate the hash for last
1048 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1050 return PTR_ERR(dentry);
1051 ns = mnt_user_ns(parent.mnt);
1054 * All right, let's create it.
1056 err = security_path_mknod(&parent, dentry, mode, 0);
1058 err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1061 err = mutex_lock_interruptible(&u->bindlock);
/* Filesystem binds always live in the "bound" half; hash by inode. */
1067 addr->hash = UNIX_HASH_SIZE;
1068 hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
1069 spin_lock(&unix_table_lock);
1070 u->path.mnt = mntget(parent.mnt);
1071 u->path.dentry = dget(dentry);
1072 __unix_set_addr(sk, addr, hash);
1073 spin_unlock(&unix_table_lock);
1074 mutex_unlock(&u->bindlock);
1075 done_path_create(&parent, dentry);
1079 mutex_unlock(&u->bindlock);
1082 /* failed after successful mknod? unlink what we'd created... */
1083 vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1085 done_path_create(&parent, dentry);
/* Bind to an abstract name: under u->bindlock and unix_table_lock,
 * fail if the name is already taken in this netns, otherwise publish
 * the address and rehash. NOTE(review): the already-bound check and
 * error returns are elided in this extract. */
1089 static int unix_bind_abstract(struct sock *sk, struct unix_address *addr)
1091 struct unix_sock *u = unix_sk(sk);
1094 err = mutex_lock_interruptible(&u->bindlock);
1099 mutex_unlock(&u->bindlock);
1103 spin_lock(&unix_table_lock);
1104 if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len,
1106 spin_unlock(&unix_table_lock);
1107 mutex_unlock(&u->bindlock);
1110 __unix_set_addr(sk, addr, addr->hash);
1111 spin_unlock(&unix_table_lock);
1112 mutex_unlock(&u->bindlock);
/* bind(2) entry point: validate the address, autobind when given only
 * the family, otherwise build a refcounted unix_address and dispatch to
 * the BSD (filesystem) or abstract bind path. -EEXIST from the helpers
 * is translated to the user-visible -EADDRINUSE. */
1116 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1118 struct sock *sk = sock->sk;
1119 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1120 char *sun_path = sunaddr->sun_path;
1123 struct unix_address *addr;
1125 if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
1126 sunaddr->sun_family != AF_UNIX)
/* Family only, no name: pick an abstract name automatically. */
1129 if (addr_len == sizeof(short))
1130 return unix_autobind(sock);
1132 err = unix_mkname(sunaddr, addr_len, &hash);
1136 addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
1140 memcpy(addr->name, sunaddr, addr_len);
1141 addr->len = addr_len;
1142 addr->hash = hash ^ sk->sk_type;
1143 refcount_set(&addr->refcnt, 1);
1146 err = unix_bind_bsd(sk, addr);
1148 err = unix_bind_abstract(sk, addr);
/* On failure drop the reference we created above. */
1150 unix_release_addr(addr);
1151 return err == -EEXIST ? -EADDRINUSE : err;
/* Lock two socket state locks in a stable (address) order to avoid
 * ABBA deadlock; degenerates to a single lock when sk2 is NULL or the
 * same socket. NOTE(review): the address-comparison branch line is
 * elided in this extract. */
1154 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1156 if (unlikely(sk1 == sk2) || !sk2) {
1157 unix_state_lock(sk1);
1161 unix_state_lock(sk1);
1162 unix_state_lock_nested(sk2);
1164 unix_state_lock(sk2);
1165 unix_state_lock_nested(sk1);
/* Companion unlock; order of unlocking does not matter. */
1169 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1171 if (unlikely(sk1 == sk2) || !sk2) {
1172 unix_state_unlock(sk1);
1175 unix_state_unlock(sk1);
1176 unix_state_unlock(sk2);
/* connect(2) for dgram sockets: resolve the target (autobinding first
 * if SO_PASSCRED requires an address), take both state locks, verify
 * the peer is alive and willing, then install it as unix_peer(sk) —
 * disconnecting and notifying any previous peer. AF_UNSPEC dissolves
 * the association. NOTE(review): the AF_UNSPEC branch header, sock_put
 * on the old peer and several error paths are elided in this extract. */
1179 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1180 int alen, int flags)
1182 struct sock *sk = sock->sk;
1183 struct net *net = sock_net(sk);
1184 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1190 if (alen < offsetofend(struct sockaddr, sa_family))
1193 if (addr->sa_family != AF_UNSPEC) {
1194 err = unix_mkname(sunaddr, alen, &hash);
/* SO_PASSCRED senders must have an address the peer can see. */
1199 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1200 !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
1204 other = unix_find_other(net, sunaddr, alen, sock->type, hash, &err);
1208 unix_state_double_lock(sk, other);
1210 /* Apparently VFS overslept socket death. Retry. */
1211 if (sock_flag(other, SOCK_DEAD)) {
1212 unix_state_double_unlock(sk, other);
1218 if (!unix_may_send(sk, other))
1221 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1225 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1228 * 1003.1g breaking connected state with AF_UNSPEC
1231 unix_state_double_lock(sk, other);
1235 * If it was connected, reconnect.
1237 if (unix_peer(sk)) {
1238 struct sock *old_peer = unix_peer(sk);
1240 unix_peer(sk) = other;
1242 sk->sk_state = TCP_CLOSE;
1243 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1245 unix_state_double_unlock(sk, other);
1247 if (other != old_peer)
1248 unix_dgram_disconnected(sk, old_peer);
1251 unix_peer(sk) = other;
1252 unix_state_double_unlock(sk, other);
1258 unix_state_double_unlock(sk, other);
/* Sleep (up to timeo) on the listener's peer_wait queue until its
 * accept backlog drains, it dies, or it shuts down receive. Called
 * with other's state lock held and releases it, per the __releases
 * annotation. Returns the remaining timeout. */
1264 static long unix_wait_for_peer(struct sock *other, long timeo)
1265 __releases(&unix_sk(other)->lock)
1267 struct unix_sock *u = unix_sk(other);
1271 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
/* Only actually sleep while the peer is alive, accepting, and full. */
1273 sched = !sock_flag(other, SOCK_DEAD) &&
1274 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1275 unix_recvq_full(other);
1277 unix_state_unlock(other);
1280 timeo = schedule_timeout(timeo);
1282 finish_wait(&u->peer_wait, &wait);
/* Connect a SOCK_STREAM/SOCK_SEQPACKET AF_UNIX socket to a listening peer:
 * allocate the server-side child socket and a notification skb up front,
 * look up the listener, wait for backlog room, then splice the two ends
 * together under both state locks.
 * NOTE(review): this listing is missing interleaved lines (error labels,
 * returns, closing braces); comments describe only the visible code. */
1286 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1287 int addr_len, int flags)
1289 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1290 struct sock *sk = sock->sk;
1291 struct net *net = sock_net(sk);
1292 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1293 struct sock *newsk = NULL;
1294 struct sock *other = NULL;
1295 struct sk_buff *skb = NULL;
1301 err = unix_mkname(sunaddr, addr_len, &hash);
/* Autobind a local address when credentials will be passed but none bound. */
1306 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr &&
1307 (err = unix_autobind(sock)) != 0)
1310 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1312 /* First of all allocate resources.
1313 If we will make it after state is locked,
1314 we will have to recheck all again in any case.
1319 /* create new sock for complete connection */
1320 newsk = unix_create1(sock_net(sk), NULL, 0, sock->type);
1324 /* Allocate skb for sending to listening sock */
1325 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1330 /* Find listening sock. */
1331 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err);
1335 /* Latch state of peer */
1336 unix_state_lock(other);
1338 /* Apparently VFS overslept socket death. Retry. */
1339 if (sock_flag(other, SOCK_DEAD)) {
1340 unix_state_unlock(other);
1345 err = -ECONNREFUSED;
1346 if (other->sk_state != TCP_LISTEN)
1348 if (other->sk_shutdown & RCV_SHUTDOWN)
/* Listener backlog is full: wait for room (unix_wait_for_peer drops the
 * peer lock while sleeping), honouring the send timeout and signals. */
1351 if (unix_recvq_full(other)) {
1356 timeo = unix_wait_for_peer(other, timeo);
1358 err = sock_intr_errno(timeo);
1359 if (signal_pending(current))
1367 It is tricky place. We need to grab our state lock and cannot
1368 drop lock on peer. It is dangerous because deadlock is
1369 possible. Connect to self case and simultaneous
1370 attempt to connect are eliminated by checking socket
1371 state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1372 check this before attempt to grab lock.
1374 Well, and we have to recheck the state after socket locked.
1380 /* This is ok... continue with connect */
1382 case TCP_ESTABLISHED:
1383 /* Socket is already connected */
1391 unix_state_lock_nested(sk);
/* State changed while we were unlocked: unwind and (presumably) restart. */
1393 if (sk->sk_state != st) {
1394 unix_state_unlock(sk);
1395 unix_state_unlock(other);
1400 err = security_unix_stream_connect(sk, other, newsk);
1402 unix_state_unlock(sk);
1406 /* The way is open! Fastly set all the necessary fields... */
1409 unix_peer(newsk) = sk;
1410 newsk->sk_state = TCP_ESTABLISHED;
1411 newsk->sk_type = sk->sk_type;
1412 init_peercred(newsk);
1413 newu = unix_sk(newsk);
1414 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1415 otheru = unix_sk(other);
1417 /* copy address information from listening to new sock
1419 * The contents of *(otheru->addr) and otheru->path
1420 * are seen fully set up here, since we have found
1421 * otheru in hash under unix_table_lock. Insertion
1422 * into the hash chain we'd found it in had been done
1423 * in an earlier critical area protected by unix_table_lock,
1424 * the same one where we'd set *(otheru->addr) contents,
1425 * as well as otheru->path and otheru->addr itself.
1427 * Using smp_store_release() here to set newu->addr
1428 * is enough to make those stores, as well as stores
1429 * to newu->path visible to anyone who gets newu->addr
1430 * by smp_load_acquire(). IOW, the same warranties
1431 * as for unix_sock instances bound in unix_bind() or
1432 * in unix_autobind().
1434 if (otheru->path.dentry) {
1435 path_get(&otheru->path);
1436 newu->path = otheru->path;
1438 refcount_inc(&otheru->addr->refcnt);
1439 smp_store_release(&newu->addr, otheru->addr);
1441 /* Set credentials */
1442 copy_peercred(sk, other);
1444 sock->state = SS_CONNECTED;
1445 sk->sk_state = TCP_ESTABLISHED;
1448 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1449 unix_peer(sk) = newsk;
1451 unix_state_unlock(sk);
1453 /* take ten and send info to listening sock */
1454 spin_lock(&other->sk_receive_queue.lock);
1455 __skb_queue_tail(&other->sk_receive_queue, skb);
1456 spin_unlock(&other->sk_receive_queue.lock);
1457 unix_state_unlock(other);
1458 other->sk_data_ready(other);
/* Error path (label not visible here): release resources on failure. */
1464 unix_state_unlock(other);
1469 unix_release_sock(newsk, 0);
/* socketpair(2) backend: wire two freshly created AF_UNIX sockets to each
 * other and mark both ends connected/established. */
1475 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1477 struct sock *ska = socka->sk, *skb = sockb->sk;
1479 /* Join our sockets back to back */
1482 unix_peer(ska) = skb;
1483 unix_peer(skb) = ska;
1487 ska->sk_state = TCP_ESTABLISHED;
1488 skb->sk_state = TCP_ESTABLISHED;
1489 socka->state = SS_CONNECTED;
1490 sockb->state = SS_CONNECTED;
/* Propagate SOCK_PASSCRED/SOCK_PASSSEC from a listening socket to the
 * socket returned by accept(), so the child keeps the parent's
 * credential/security-passing behavior. */
1494 static void unix_sock_inherit_flags(const struct socket *old,
1497 if (test_bit(SOCK_PASSCRED, &old->flags))
1498 set_bit(SOCK_PASSCRED, &new->flags);
1499 if (test_bit(SOCK_PASSSEC, &old->flags))
1500 set_bit(SOCK_PASSSEC, &new->flags);
/* accept(2): dequeue one connection-notification skb from the listener's
 * receive queue, wake writers waiting on peer_wait, and graft the embryo
 * socket (tsk, set between the visible lines) onto newsock.
 * NOTE(review): lines are missing from this listing (skb->sk extraction,
 * error paths); only visible code is documented. */
1503 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1506 struct sock *sk = sock->sk;
1508 struct sk_buff *skb;
1512 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1516 if (sk->sk_state != TCP_LISTEN)
1519 /* If socket state is TCP_LISTEN it cannot change (for now...),
1520 * so that no locks are necessary.
1523 skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1525 /* This means receive shutdown. */
1532 skb_free_datagram(sk, skb);
1533 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1535 /* attach accepted sock to socket */
1536 unix_state_lock(tsk);
1537 newsock->state = SS_CONNECTED;
1538 unix_sock_inherit_flags(sock, newsock);
1539 sock_graft(tsk, newsock);
1540 unix_state_unlock(tsk);
/* getsockname/getpeername: copy the bound address (or the peer's, when
 * `peer` is set — the unix_peer_get() call) into uaddr. An unbound socket
 * reports just the AF_UNIX family (sizeof(short)). */
1548 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1550 struct sock *sk = sock->sk;
1551 struct unix_address *addr;
1552 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1556 sk = unix_peer_get(sk);
/* Pairs with smp_store_release() when the address was published at bind. */
1566 addr = smp_load_acquire(&unix_sk(sk)->addr);
1568 sunaddr->sun_family = AF_UNIX;
1569 sunaddr->sun_path[0] = 0;
1570 err = sizeof(short);
1573 memcpy(sunaddr, addr->name, addr->len);
/* MSG_PEEK path for SCM_RIGHTS: duplicate the skb's file-descriptor list
 * into the scm cookie, then take/release unix_gc_lock purely as a barrier
 * against a concurrent garbage collection (rationale below). */
1580 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1582 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1585 * Garbage collection of unix sockets starts by selecting a set of
1586 * candidate sockets which have reference only from being in flight
1587 * (total_refs == inflight_refs). This condition is checked once during
1588 * the candidate collection phase, and candidates are marked as such, so
1589 * that non-candidates can later be ignored. While inflight_refs is
1590 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1591 * is an instantaneous decision.
1593 * Once a candidate, however, the socket must not be reinstalled into a
1594 * file descriptor while the garbage collection is in progress.
1596 * If the above conditions are met, then the directed graph of
1597 * candidates (*) does not change while unix_gc_lock is held.
1599 * Any operations that changes the file count through file descriptors
1600 * (dup, close, sendmsg) does not change the graph since candidates are
1601 * not installed in fds.
1603 * Dequeing a candidate via recvmsg would install it into an fd, but
1604 * that takes unix_gc_lock to decrement the inflight count, so it's
1605 * serialized with garbage collection.
1607 * MSG_PEEK is special in that it does not change the inflight count,
1608 * yet does install the socket into an fd. The following lock/unlock
1609 * pair is to ensure serialization with garbage collection. It must be
1610 * done between incrementing the file count and installing the file into
1613 * If garbage collection starts after the barrier provided by the
1614 * lock/unlock, then it will see the elevated refcount and not mark this
1615 * as a candidate. If a garbage collection is already in progress
1616 * before the file count was incremented, then the lock/unlock pair will
1617 * ensure that garbage collection is finished before progressing to
1618 * installing the fd.
1620 * (*) A -> B where B is on the queue of A or B is on the queue of C
1621 * which is on the queue of listening socket A.
1623 spin_lock(&unix_gc_lock);
1624 spin_unlock(&unix_gc_lock);
/* Transfer control-message state (pid, creds, security data and — when
 * send_fds — the SCM_RIGHTS fd list) from the scm cookie into the skb's
 * control block, and install the destructor that releases it. */
1627 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1631 UNIXCB(skb).pid = get_pid(scm->pid);
1632 UNIXCB(skb).uid = scm->creds.uid;
1633 UNIXCB(skb).gid = scm->creds.gid;
1634 UNIXCB(skb).fp = NULL;
1635 unix_get_secdata(scm, skb);
1636 if (scm->fp && send_fds)
1637 err = unix_attach_fds(scm, skb);
1639 skb->destructor = unix_destruct_scm;
/* True when credentials should accompany data: either end asked for
 * SO_PASSCRED, or the receiver has no socket attached yet (embryo). */
1643 static bool unix_passcred_enabled(const struct socket *sock,
1644 const struct sock *other)
1646 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1647 !other->sk_socket ||
1648 test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1652 * Some apps rely on write() giving SCM_CREDENTIALS
1653 * We include credentials if source or destination socket
1654 * asserted SOCK_PASSCRED.
1656 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1657 const struct sock *other)
/* A pid already attached means creds were set by scm_send(); keep them. */
1659 if (UNIXCB(skb).pid)
1661 if (unix_passcred_enabled(sock, other)) {
1662 UNIXCB(skb).pid = get_pid(task_tgid(current));
1663 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
/* Initialize an scm cookie for the sendpage path (which has no msghdr of
 * its own — note the empty dummy msg), filling in the current task's
 * pid/uid/gid when credential passing is enabled. */
1667 static int maybe_init_creds(struct scm_cookie *scm,
1668 struct socket *socket,
1669 const struct sock *other)
1672 struct msghdr msg = { .msg_controllen = 0 };
1674 err = scm_send(socket, &msg, scm, false);
1678 if (unix_passcred_enabled(socket, other)) {
1679 scm->pid = get_pid(task_tgid(current));
1680 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
/* True when the skb carries the same pid/uid/gid and security data as the
 * scm cookie — used to decide whether stream data from the same writer may
 * be coalesced into one skb. */
1685 static bool unix_skb_scm_eq(struct sk_buff *skb,
1686 struct scm_cookie *scm)
1688 const struct unix_skb_parms *u = &UNIXCB(skb);
1690 return u->pid == scm->pid &&
1691 uid_eq(u->uid, scm->creds.uid) &&
1692 gid_eq(u->gid, scm->creds.gid) &&
1693 unix_secdata_eq(scm, skb);
/* Account the fds attached to an skb being queued on sk: bump the
 * receiver's in-queue fd counter (read by diag/monitoring code). */
1696 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1698 struct scm_fp_list *fp = UNIXCB(skb).fp;
1699 struct unix_sock *u = unix_sk(sk);
1701 if (unlikely(fp && fp->count))
1702 atomic_add(fp->count, &u->scm_stat.nr_fds);
/* Counterpart of scm_stat_add(): drop the fd count when an skb leaves
 * sk's receive queue. */
1705 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1707 struct scm_fp_list *fp = UNIXCB(skb).fp;
1708 struct unix_sock *u = unix_sk(sk);
1710 if (unlikely(fp && fp->count))
1711 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1715 * Send AF_UNIX data.
/* Datagram sendmsg: resolve the destination (explicit address or connected
 * peer), build a paged skb, and queue it on the receiver, handling dead
 * peers, shutdown, and receiver-queue backpressure along the way.
 * NOTE(review): many lines (labels, returns, braces) are missing from this
 * listing; comments cover only the visible code. */
1718 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1721 struct sock *sk = sock->sk;
1722 struct net *net = sock_net(sk);
1723 struct unix_sock *u = unix_sk(sk);
1724 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1725 struct sock *other = NULL;
1726 int namelen = 0; /* fake GCC */
1729 struct sk_buff *skb;
1731 struct scm_cookie scm;
1736 err = scm_send(sock, msg, &scm, false);
/* Datagram sockets have no out-of-band data. */
1741 if (msg->msg_flags&MSG_OOB)
1744 if (msg->msg_namelen) {
1745 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1752 other = unix_peer_get(sk);
1757 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr
1758 && (err = unix_autobind(sock)) != 0)
1762 if (len > sk->sk_sndbuf - 32)
/* Large payloads: put everything beyond SKB_MAX_ALLOC into page frags. */
1765 if (len > SKB_MAX_ALLOC) {
1766 data_len = min_t(size_t,
1767 len - SKB_MAX_ALLOC,
1768 MAX_SKB_FRAGS * PAGE_SIZE);
1769 data_len = PAGE_ALIGN(data_len);
1771 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1774 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1775 msg->msg_flags & MSG_DONTWAIT, &err,
1776 PAGE_ALLOC_COSTLY_ORDER);
1780 err = unix_scm_to_skb(&scm, skb, true);
1784 skb_put(skb, len - data_len);
1785 skb->data_len = data_len;
1787 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1791 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1796 if (sunaddr == NULL)
1799 other = unix_find_other(net, sunaddr, namelen, sk->sk_type,
1805 if (sk_filter(other, skb) < 0) {
1806 /* Toss the packet but do not return any error to the sender */
1812 unix_state_lock(other);
1815 if (!unix_may_send(sk, other))
/* Peer died: disconnect ourselves from it and report ECONNREFUSED. */
1818 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1820 * Check with 1003.1g - what should
1823 unix_state_unlock(other);
1827 unix_state_lock(sk);
1830 if (unix_peer(sk) == other) {
1831 unix_peer(sk) = NULL;
1832 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1834 unix_state_unlock(sk);
1836 sk->sk_state = TCP_CLOSE;
1837 unix_dgram_disconnected(sk, other);
1839 err = -ECONNREFUSED;
1841 unix_state_unlock(sk);
1851 if (other->sk_shutdown & RCV_SHUTDOWN)
1854 if (sk->sk_type != SOCK_SEQPACKET) {
1855 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1860 /* other == sk && unix_peer(other) != sk if
1861 * - unix_peer(sk) == NULL, destination address bound to sk
1862 * - unix_peer(sk) == sk by time of get but disconnected before lock
/* Receiver queue full: either sleep on the peer's queue or register for a
 * wakeup via unix_dgram_peer_wake_me() and retry. */
1865 unlikely(unix_peer(other) != sk &&
1866 unix_recvq_full_lockless(other))) {
1868 timeo = unix_wait_for_peer(other, timeo);
1870 err = sock_intr_errno(timeo);
1871 if (signal_pending(current))
1878 unix_state_unlock(other);
1879 unix_state_double_lock(sk, other);
1882 if (unix_peer(sk) != other ||
1883 unix_dgram_peer_wake_me(sk, other)) {
1891 goto restart_locked;
1895 if (unlikely(sk_locked))
1896 unix_state_unlock(sk);
/* Delivery: timestamp, creds, fd accounting, queue, then wake the reader. */
1898 if (sock_flag(other, SOCK_RCVTSTAMP))
1899 __net_timestamp(skb);
1900 maybe_add_creds(skb, sock, other);
1901 scm_stat_add(other, skb);
1902 skb_queue_tail(&other->sk_receive_queue, skb);
1903 unix_state_unlock(other);
1904 other->sk_data_ready(other);
1911 unix_state_unlock(sk);
1912 unix_state_unlock(other);
1922 /* We use paged skbs for stream sockets, and limit occupancy to 32768
1923 * bytes, and a minimum of a full page.
1925 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
#if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
/* Queue a single out-of-band byte on the peer: a 1-byte skb becomes the
 * peer's oob_skb (replacing any previous one) and SIGURG is raised.
 * NOTE(review): error-path lines are missing from this listing. */
1928 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
1930 struct unix_sock *ousk = unix_sk(other);
1931 struct sk_buff *skb;
1934 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
1940 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
1947 unix_state_lock(other);
1949 if (sock_flag(other, SOCK_DEAD) ||
1950 (other->sk_shutdown & RCV_SHUTDOWN)) {
1951 unix_state_unlock(other);
1956 maybe_add_creds(skb, sock, other);
/* An unread OOB byte is superseded by the new one. */
1959 consume_skb(ousk->oob_skb);
1962 ousk->oob_skb = skb;
1964 scm_stat_add(other, skb);
1965 skb_queue_tail(&other->sk_receive_queue, skb);
1966 sk_send_sigurg(other);
1967 unix_state_unlock(other);
1968 other->sk_data_ready(other);
/* Stream sendmsg: split the payload into paged skbs (fds attached only to
 * the first), queue each on the connected peer, and finish with the OOB
 * byte if MSG_OOB was requested. SIGPIPE is raised when nothing was sent
 * to a closed peer. NOTE(review): lines are missing from this listing. */
1974 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
1977 struct sock *sk = sock->sk;
1978 struct sock *other = NULL;
1980 struct sk_buff *skb;
1982 struct scm_cookie scm;
1983 bool fds_sent = false;
1987 err = scm_send(sock, msg, &scm, false);
1992 if (msg->msg_flags & MSG_OOB) {
1993 #if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
/* Stream sockets reject an explicit destination address. */
2001 if (msg->msg_namelen) {
2002 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2006 other = unix_peer(sk);
2011 if (sk->sk_shutdown & SEND_SHUTDOWN)
2014 while (sent < len) {
2017 /* Keep two messages in the pipe so it schedules better */
2018 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2020 /* allow fallback to order-0 allocations */
2021 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2023 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2025 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2027 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2028 msg->msg_flags & MSG_DONTWAIT, &err,
2029 get_order(UNIX_SKB_FRAGS_SZ));
2033 /* Only send the fds in the first buffer */
2034 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2041 skb_put(skb, size - data_len);
2042 skb->data_len = data_len;
2044 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2050 unix_state_lock(other);
2052 if (sock_flag(other, SOCK_DEAD) ||
2053 (other->sk_shutdown & RCV_SHUTDOWN))
2056 maybe_add_creds(skb, sock, other);
2057 scm_stat_add(other, skb);
2058 skb_queue_tail(&other->sk_receive_queue, skb);
2059 unix_state_unlock(other);
2060 other->sk_data_ready(other);
2064 #if (IS_ENABLED(CONFIG_AF_UNIX_OOB))
2065 if (msg->msg_flags & MSG_OOB) {
2066 err = queue_oob(sock, msg, other);
2078 unix_state_unlock(other);
2081 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2082 send_sig(SIGPIPE, current, 0);
2086 return sent ? : err;
/* sendpage for stream sockets: append page fragments either to the tail
 * skb already on the peer's queue (when it came from the same writer —
 * unix_skb_scm_eq) or to a freshly allocated zero-length skb. The peer's
 * iolock serializes against readers that also adjust skb->len.
 * NOTE(review): lines are missing from this listing (labels, allocation
 * retry loop); comments cover only the visible code. */
2089 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2090 int offset, size_t size, int flags)
2093 bool send_sigpipe = false;
2094 bool init_scm = true;
2095 struct scm_cookie scm;
2096 struct sock *other, *sk = socket->sk;
2097 struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2099 if (flags & MSG_OOB)
2102 other = unix_peer(sk);
2103 if (!other || sk->sk_state != TCP_ESTABLISHED)
/* Allocation path: drop locks before a possibly-sleeping allocation. */
2108 unix_state_unlock(other);
2109 mutex_unlock(&unix_sk(other)->iolock);
2110 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2116 /* we must acquire iolock as we modify already present
2117 * skbs in the sk_receive_queue and mess with skb->len
2119 err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2121 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2125 if (sk->sk_shutdown & SEND_SHUTDOWN) {
2127 send_sigpipe = true;
2131 unix_state_lock(other);
2133 if (sock_flag(other, SOCK_DEAD) ||
2134 other->sk_shutdown & RCV_SHUTDOWN) {
2136 send_sigpipe = true;
2137 goto err_state_unlock;
2141 err = maybe_init_creds(&scm, socket, other);
2143 goto err_state_unlock;
2147 skb = skb_peek_tail(&other->sk_receive_queue);
2148 if (tail && tail == skb) {
2150 } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2157 } else if (newskb) {
2158 /* this is fast path, we don't necessarily need to
2159 * call to kfree_skb even though with newskb == NULL
2160 * this - does no harm
2162 consume_skb(newskb);
2166 if (skb_append_pagefrags(skb, page, offset, size)) {
2172 skb->data_len += size;
2173 skb->truesize += size;
2174 refcount_add(size, &sk->sk_wmem_alloc);
2177 err = unix_scm_to_skb(&scm, skb, false);
2179 goto err_state_unlock;
2180 spin_lock(&other->sk_receive_queue.lock);
2181 __skb_queue_tail(&other->sk_receive_queue, newskb);
2182 spin_unlock(&other->sk_receive_queue.lock);
2185 unix_state_unlock(other);
2186 mutex_unlock(&unix_sk(other)->iolock);
2188 other->sk_data_ready(other);
/* Error path: unwind locks, optionally raise SIGPIPE. */
2193 unix_state_unlock(other);
2195 mutex_unlock(&unix_sk(other)->iolock);
2198 if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2199 send_sig(SIGPIPE, current, 0);
/* SOCK_SEQPACKET send: must already be connected; ignores any supplied
 * destination address and delegates to the datagram send path. */
2205 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2209 struct sock *sk = sock->sk;
2211 err = sock_error(sk);
2215 if (sk->sk_state != TCP_ESTABLISHED)
2218 if (msg->msg_namelen)
2219 msg->msg_namelen = 0;
2221 return unix_dgram_sendmsg(sock, msg, len);
/* SOCK_SEQPACKET receive: requires an established connection, then uses
 * the datagram receive path (message boundaries preserved). */
2224 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2225 size_t size, int flags)
2227 struct sock *sk = sock->sk;
2229 if (sk->sk_state != TCP_ESTABLISHED)
2232 return unix_dgram_recvmsg(sock, msg, size, flags);
/* Copy the sender's bound address (if any) into msg->msg_name; the
 * acquire-load pairs with the release-store made when the address was
 * published at bind/connect time. */
2235 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2237 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2240 msg->msg_namelen = addr->len;
2241 memcpy(msg->msg_name, addr->name, addr->len);
/* Core datagram receive: dequeue (or peek) one skb under u->iolock, copy
 * it out with truncation handling, rebuild scm credentials/fds, and wake
 * writers blocked on our full queue.
 * NOTE(review): lines are missing from this listing (loop structure,
 * labels); only the visible code is documented. */
2245 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2248 struct scm_cookie scm;
2249 struct socket *sock = sk->sk_socket;
2250 struct unix_sock *u = unix_sk(sk);
2251 struct sk_buff *skb, *last;
2260 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2263 mutex_lock(&u->iolock);
2265 skip = sk_peek_offset(sk, flags);
2266 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2267 &skip, &err, &last);
/* Actually dequeued (not PEEK): the fds leave the queue with the skb. */
2269 if (!(flags & MSG_PEEK))
2270 scm_stat_del(sk, skb);
2274 mutex_unlock(&u->iolock);
2279 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2280 &err, &timeo, last));
2282 if (!skb) { /* implies iolock unlocked */
2283 unix_state_lock(sk);
2284 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2285 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2286 (sk->sk_shutdown & RCV_SHUTDOWN))
2288 unix_state_unlock(sk);
/* Our queue drained: let writers blocked on it continue. */
2292 if (wq_has_sleeper(&u->peer_wait))
2293 wake_up_interruptible_sync_poll(&u->peer_wait,
2294 EPOLLOUT | EPOLLWRNORM |
2298 unix_copy_addr(msg, skb->sk);
2300 if (size > skb->len - skip)
2301 size = skb->len - skip;
2302 else if (size < skb->len - skip)
2303 msg->msg_flags |= MSG_TRUNC;
2305 err = skb_copy_datagram_msg(skb, skip, msg, size);
2309 if (sock_flag(sk, SOCK_RCVTSTAMP))
2310 __sock_recv_timestamp(msg, sk, skb);
2312 memset(&scm, 0, sizeof(scm));
2314 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2315 unix_set_secdata(&scm, skb);
2317 if (!(flags & MSG_PEEK)) {
2319 unix_detach_fds(&scm, skb);
2321 sk_peek_offset_bwd(sk, skb->len);
2323 /* It is questionable: on PEEK we could:
2324 - do not return fds - good, but too simple 8)
2325 - return fds, and do not return them on read (old strategy,
2327 - clone fds (I chose it for now, it is the most universal
2330 POSIX 1003.1g does not actually define this clearly
2331 at all. POSIX 1003.1g doesn't define a lot of things
2336 sk_peek_offset_fwd(sk, size);
2339 unix_peek_fds(&scm, skb);
2341 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2343 scm_recv(sock, msg, &scm, flags);
2346 skb_free_datagram(sk, skb);
2347 mutex_unlock(&u->iolock);
/* proto_ops recvmsg wrapper: divert to a BPF-installed protocol's recvmsg
 * (sockmap) when sk_prot was replaced, else the regular datagram path. */
2352 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2355 struct sock *sk = sock->sk;
2357 #ifdef CONFIG_BPF_SYSCALL
2358 const struct proto *prot = READ_ONCE(sk->sk_prot);
2360 if (prot != &unix_dgram_proto)
2361 return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2362 flags & ~MSG_DONTWAIT, NULL);
2364 return __unix_dgram_recvmsg(sk, msg, size, flags);
/* read_sock callback (used by sockmap/strparser): pull one datagram under
 * iolock and hand it to recv_actor.
 * NOTE(review): the loop/consumption logic between the visible lines is
 * missing from this listing. */
2367 static int unix_read_sock(struct sock *sk, read_descriptor_t *desc,
2368 sk_read_actor_t recv_actor)
2373 struct unix_sock *u = unix_sk(sk);
2374 struct sk_buff *skb;
2377 mutex_lock(&u->iolock);
2378 skb = skb_recv_datagram(sk, 0, 1, &err);
2379 mutex_unlock(&u->iolock);
2383 used = recv_actor(desc, skb, 0, skb->len);
2389 } else if (used <= skb->len) {
2402 * Sleep until more data has arrived. But check for races..
/* Wait for new stream data: sleeps until the tail of the receive queue
 * changes (new skb or the last skb grew), shutdown/signal/error occurs, or
 * the timeout expires. Uses the freezable sleep variant when requested. */
2404 static long unix_stream_data_wait(struct sock *sk, long timeo,
2405 struct sk_buff *last, unsigned int last_len,
2408 struct sk_buff *tail;
2411 unix_state_lock(sk);
2414 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Wake conditions: queue tail moved/grew, shutdown, signal, ... */
2416 tail = skb_peek_tail(&sk->sk_receive_queue);
2418 (tail && tail->len != last_len) ||
2420 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2421 signal_pending(current) ||
2425 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2426 unix_state_unlock(sk);
2428 timeo = freezable_schedule_timeout(timeo);
2430 timeo = schedule_timeout(timeo);
2431 unix_state_lock(sk);
2433 if (sock_flag(sk, SOCK_DEAD))
2436 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2439 finish_wait(sk_sleep(sk), &wait);
2440 unix_state_unlock(sk);
/* Bytes of this stream skb not yet consumed by previous reads. */
2444 static unsigned int unix_skb_len(const struct sk_buff *skb)
2446 return skb->len - UNIXCB(skb).consumed;
/* Shared state for the generic stream-read loop; recv_actor abstracts the
 * destination (copy to a msghdr for recvmsg, splice to a pipe for
 * splice_read). */
2449 struct unix_stream_read_state {
2450 int (*recv_actor)(struct sk_buff *, int, int,
2451 struct unix_stream_read_state *);
2452 struct socket *socket;
2454 struct pipe_inode_info *pipe;
2457 unsigned int splice_flags;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
/* MSG_OOB receive: hand the pending out-of-band skb to the recv_actor;
 * when not peeking, consume it. Fails (-EINVAL presumably, lines missing)
 * when SO_OOBINLINE is set or no OOB byte is pending. */
2461 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2463 struct socket *sock = state->socket;
2464 struct sock *sk = sock->sk;
2465 struct unix_sock *u = unix_sk(sk);
2467 struct sk_buff *oob_skb;
2469 mutex_lock(&u->iolock);
2470 unix_state_lock(sk);
2472 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2473 unix_state_unlock(sk);
2474 mutex_unlock(&u->iolock);
2478 oob_skb = u->oob_skb;
2480 if (!(state->flags & MSG_PEEK)) {
2484 unix_state_unlock(sk);
2486 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2488 if (!(state->flags & MSG_PEEK)) {
2489 UNIXCB(oob_skb).consumed += 1;
2493 mutex_unlock(&u->iolock);
2498 state->msg->msg_flags |= MSG_OOB;
/* Decide how the skb at the head of the queue interacts with a pending
 * OOB byte during a normal (non-MSG_OOB) stream read: skip over or unlink
 * the OOB skb unless SO_OOBINLINE delivers it in-band.
 * NOTE(review): several branch bodies are missing from this listing. */
2502 static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
2503 int flags, int copied)
2505 struct unix_sock *u = unix_sk(sk);
2507 if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
2508 skb_unlink(skb, &sk->sk_receive_queue);
2512 if (skb == u->oob_skb) {
2515 } else if (sock_flag(sk, SOCK_URGINLINE)) {
2516 if (!(flags & MSG_PEEK)) {
2520 } else if (!(flags & MSG_PEEK)) {
2521 skb_unlink(skb, &sk->sk_receive_queue);
2523 skb = skb_peek(&sk->sk_receive_queue);
/* read_sock for stream sockets: only valid once connected, then reuses the
 * datagram-style reader. */
2531 static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc,
2532 sk_read_actor_t recv_actor)
2534 if (unlikely(sk->sk_state != TCP_ESTABLISHED))
2537 return unix_read_sock(sk, desc, recv_actor);
/* Generic stream receive loop shared by recvmsg and splice_read: walk the
 * receive queue under u->iolock, copy/splice chunks via state->recv_actor,
 * never merging data from writers with different credentials, sleeping in
 * unix_stream_data_wait() when below the rcvlowat target.
 * NOTE(review): loop braces, labels and several statements are missing
 * from this listing; comments cover only the visible code. */
2540 static int unix_stream_read_generic(struct unix_stream_read_state *state,
2543 struct scm_cookie scm;
2544 struct socket *sock = state->socket;
2545 struct sock *sk = sock->sk;
2546 struct unix_sock *u = unix_sk(sk);
2548 int flags = state->flags;
2549 int noblock = flags & MSG_DONTWAIT;
2550 bool check_creds = false;
2555 size_t size = state->size;
2556 unsigned int last_len;
2558 if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
2563 if (unlikely(flags & MSG_OOB)) {
2565 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2566 err = unix_stream_recv_urg(state);
2571 target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
2572 timeo = sock_rcvtimeo(sk, noblock);
2574 memset(&scm, 0, sizeof(scm));
2576 /* Lock the socket to prevent queue disordering
2577 * while sleeps in memcpy_tomsg
2579 mutex_lock(&u->iolock);
2581 skip = max(sk_peek_offset(sk, flags), 0);
2586 struct sk_buff *skb, *last;
2589 unix_state_lock(sk);
2590 if (sock_flag(sk, SOCK_DEAD)) {
2594 last = skb = skb_peek(&sk->sk_receive_queue);
2595 last_len = last ? last->len : 0;
2597 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2599 skb = manage_oob(skb, sk, flags, copied);
2601 unix_state_unlock(sk);
/* Queue empty: stop if we met the target, else report pending error,
 * shutdown, or sleep for more data. */
2610 if (copied >= target)
2614 * POSIX 1003.1g mandates this order.
2617 err = sock_error(sk);
2620 if (sk->sk_shutdown & RCV_SHUTDOWN)
2623 unix_state_unlock(sk);
2629 mutex_unlock(&u->iolock);
2631 timeo = unix_stream_data_wait(sk, timeo, last,
2632 last_len, freezable);
2634 if (signal_pending(current)) {
2635 err = sock_intr_errno(timeo);
2640 mutex_lock(&u->iolock);
2643 unix_state_unlock(sk);
/* Honour a MSG_PEEK offset: skip whole consumed skbs first. */
2647 while (skip >= unix_skb_len(skb)) {
2648 skip -= unix_skb_len(skb);
2650 last_len = skb->len;
2651 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2656 unix_state_unlock(sk);
2659 /* Never glue messages from different writers */
2660 if (!unix_skb_scm_eq(skb, &scm))
2662 } else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
2663 /* Copy credentials */
2664 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2665 unix_set_secdata(&scm, skb);
2669 /* Copy address just once */
2670 if (state->msg && state->msg->msg_name) {
2671 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
2672 state->msg->msg_name);
2673 unix_copy_addr(state->msg, skb->sk);
2677 chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
2679 chunk = state->recv_actor(skb, skip, chunk, state);
2680 drop_skb = !unix_skb_len(skb);
2681 /* skb is only safe to use if !drop_skb */
2692 /* the skb was touched by a concurrent reader;
2693 * we should not expect anything from this skb
2694 * anymore and assume it invalid - we can be
2695 * sure it was dropped from the socket queue
2697 * let's report a short read
2703 /* Mark read part of skb as used */
2704 if (!(flags & MSG_PEEK)) {
2705 UNIXCB(skb).consumed += chunk;
2707 sk_peek_offset_bwd(sk, chunk);
2709 if (UNIXCB(skb).fp) {
2710 scm_stat_del(sk, skb);
2711 unix_detach_fds(&scm, skb);
2714 if (unix_skb_len(skb))
2717 skb_unlink(skb, &sk->sk_receive_queue);
2723 /* It is questionable, see note in unix_dgram_recvmsg.
2726 unix_peek_fds(&scm, skb);
2728 sk_peek_offset_fwd(sk, chunk);
2735 last_len = skb->len;
2736 unix_state_lock(sk);
2737 skb = skb_peek_next(skb, &sk->sk_receive_queue);
2740 unix_state_unlock(sk);
2745 mutex_unlock(&u->iolock);
2747 scm_recv(sock, state->msg, &scm, flags);
2751 return copied ? : err;
/* recv_actor for recvmsg: copy `chunk` bytes (past what was already
 * consumed plus `skip`) into the user's msghdr; on success report the
 * chunk size. */
2754 static int unix_stream_read_actor(struct sk_buff *skb,
2755 int skip, int chunk,
2756 struct unix_stream_read_state *state)
2760 ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
2762 return ret ?: chunk;
/* recvmsg entry used by the BPF/sockmap layer: run the generic stream
 * reader with the copying actor (freezable sleep enabled). */
2765 int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
2766 size_t size, int flags)
2768 struct unix_stream_read_state state = {
2769 .recv_actor = unix_stream_read_actor,
2770 .socket = sk->sk_socket,
2776 return unix_stream_read_generic(&state, true);
/* proto_ops recvmsg for stream sockets: divert to a BPF-installed
 * protocol (sockmap) when sk_prot was replaced, else run the generic
 * stream reader. */
2779 static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
2780 size_t size, int flags)
2782 struct unix_stream_read_state state = {
2783 .recv_actor = unix_stream_read_actor,
2790 #ifdef CONFIG_BPF_SYSCALL
2791 struct sock *sk = sock->sk;
2792 const struct proto *prot = READ_ONCE(sk->sk_prot);
2794 if (prot != &unix_stream_proto)
2795 return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
2796 flags & ~MSG_DONTWAIT, NULL);
2798 return unix_stream_read_generic(&state, true);
/* recv_actor for splice(2): move skb bytes directly into the target pipe
 * instead of copying to a msghdr. */
2801 static int unix_stream_splice_actor(struct sk_buff *skb,
2802 int skip, int chunk,
2803 struct unix_stream_read_state *state)
2805 return skb_splice_bits(skb, state->socket->sk,
2806 UNIXCB(skb).consumed + skip,
2807 state->pipe, chunk, state->splice_flags);
/* splice(2) from a stream socket into a pipe: rejects a non-zero offset,
 * maps O_NONBLOCK/SPLICE_F_NONBLOCK onto MSG_DONTWAIT, and runs the
 * generic reader with the splice actor (non-freezable sleep). */
2810 static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
2811 struct pipe_inode_info *pipe,
2812 size_t size, unsigned int flags)
2814 struct unix_stream_read_state state = {
2815 .recv_actor = unix_stream_splice_actor,
2819 .splice_flags = flags,
2822 if (unlikely(*ppos))
2825 if (sock->file->f_flags & O_NONBLOCK ||
2826 flags & SPLICE_F_NONBLOCK)
2827 state.flags = MSG_DONTWAIT;
2829 return unix_stream_read_generic(&state, false);
/* shutdown(2): record the shutdown direction on this socket and mirror it
 * (swapped: our RCV becomes the peer's SEND) onto a connected stream or
 * seqpacket peer, waking both ends.
 * NOTE(review): some lines (mode remapping, sock_hold of other) are
 * missing from this listing. */
2832 static int unix_shutdown(struct socket *sock, int mode)
2834 struct sock *sk = sock->sk;
2837 if (mode < SHUT_RD || mode > SHUT_RDWR)
2840 * SHUT_RD (0) -> RCV_SHUTDOWN (1)
2841 * SHUT_WR (1) -> SEND_SHUTDOWN (2)
2842 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
2846 unix_state_lock(sk);
2847 sk->sk_shutdown |= mode;
2848 other = unix_peer(sk);
2851 unix_state_unlock(sk);
2852 sk->sk_state_change(sk);
2855 (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
2858 const struct proto *prot = READ_ONCE(other->sk_prot);
/* Detach the peer from any BPF/sockmap protocol before shutting it down. */
2861 prot->unhash(other);
2862 if (mode&RCV_SHUTDOWN)
2863 peer_mode |= SEND_SHUTDOWN;
2864 if (mode&SEND_SHUTDOWN)
2865 peer_mode |= RCV_SHUTDOWN;
2866 unix_state_lock(other);
2867 other->sk_shutdown |= peer_mode;
2868 unix_state_unlock(other);
2869 other->sk_state_change(other);
2870 if (peer_mode == SHUTDOWN_MASK) {
2871 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
2872 other->sk_state = TCP_CLOSE;
2873 } else if (peer_mode & RCV_SHUTDOWN) {
2874 sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
/* SIOCINQ helper: bytes readable right now. For stream/seqpacket, sum the
 * unconsumed length of every queued skb; for datagram, just the head skb
 * (lines for that branch partly missing). Listening sockets return an
 * error (value set on a line not visible here). */
2883 long unix_inq_len(struct sock *sk)
2885 struct sk_buff *skb;
2888 if (sk->sk_state == TCP_LISTEN)
2891 spin_lock(&sk->sk_receive_queue.lock);
2892 if (sk->sk_type == SOCK_STREAM ||
2893 sk->sk_type == SOCK_SEQPACKET) {
2894 skb_queue_walk(&sk->sk_receive_queue, skb)
2895 amount += unix_skb_len(skb);
2897 skb = skb_peek(&sk->sk_receive_queue);
2901 spin_unlock(&sk->sk_receive_queue.lock);
2905 EXPORT_SYMBOL_GPL(unix_inq_len);
/* SIOCOUTQ helper: bytes queued but not yet consumed by the receiver. */
2907 long unix_outq_len(struct sock *sk)
2909 return sk_wmem_alloc_get(sk);
2911 EXPORT_SYMBOL_GPL(unix_outq_len);
/* SIOCUNIXFILE: open the filesystem object a bound socket sits on with
 * O_PATH and return it as a new fd; restricted to CAP_NET_ADMIN.
 * NOTE(review): the path_get/fd_install/error lines are missing from this
 * listing. */
2913 static int unix_open_file(struct sock *sk)
2919 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2922 if (!smp_load_acquire(&unix_sk(sk)->addr))
2925 path = unix_sk(sk)->path;
2931 fd = get_unused_fd_flags(O_CLOEXEC);
2935 f = dentry_open(&path, O_PATH, current_cred());
/* ioctl(2): SIOCOUTQ/SIOCINQ queue sizes, SIOCUNIXFILE, and (with OOB
 * support) SIOCATMARK — whether the next byte to read is the OOB byte.
 * NOTE(review): the switch/case framing lines are missing from this
 * listing. */
2949 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2951 struct sock *sk = sock->sk;
2957 amount = unix_outq_len(sk);
2958 err = put_user(amount, (int __user *)arg);
2961 amount = unix_inq_len(sk);
2965 err = put_user(amount, (int __user *)arg);
2968 err = unix_open_file(sk);
2970 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2973 struct sk_buff *skb;
2974 struct unix_sock *u = unix_sk(sk);
2977 skb = skb_peek(&sk->sk_receive_queue);
2978 if (skb && skb == u->oob_skb)
2980 err = put_user(answ, (int __user *)arg);
#ifdef CONFIG_COMPAT
/* 32-bit compat ioctl: all AF_UNIX ioctls take an int pointer, so just
 * translate the user pointer and reuse the native handler. */
2992 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
2994 return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
/* poll for stream/seqpacket sockets: readable when the queue is non-empty
 * or on shutdown/hangup; writability only requires sndbuf space (no peer
 * backpressure check here, unlike unix_dgram_poll). */
2998 static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
3000 struct sock *sk = sock->sk;
3003 sock_poll_wait(file, sock, wait);
3006 /* exceptional events? */
3009 if (sk->sk_shutdown == SHUTDOWN_MASK)
3011 if (sk->sk_shutdown & RCV_SHUTDOWN)
3012 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3015 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3016 mask |= EPOLLIN | EPOLLRDNORM;
3018 /* Connection-based need to check for termination and startup */
3019 if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
3020 sk->sk_state == TCP_CLOSE)
3024 * we set writable also when the other side has shut down the
3025 * connection. This prevents stuck sockets.
3027 if (unix_writable(sk))
3028 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
/* poll for datagram sockets: like unix_poll, but writability additionally
 * requires room in the connected peer's receive queue; if the peer is
 * full, register on its wakeup list (unix_dgram_peer_wake_me) instead of
 * reporting writable. */
3033 static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
3036 struct sock *sk = sock->sk, *other;
3037 unsigned int writable;
3040 sock_poll_wait(file, sock, wait);
3043 /* exceptional events? */
3044 if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
3046 (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);
3048 if (sk->sk_shutdown & RCV_SHUTDOWN)
3049 mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
3050 if (sk->sk_shutdown == SHUTDOWN_MASK)
3054 if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
3055 mask |= EPOLLIN | EPOLLRDNORM;
3057 /* Connection-based need to check for termination and startup */
3058 if (sk->sk_type == SOCK_SEQPACKET) {
3059 if (sk->sk_state == TCP_CLOSE)
3061 /* connection hasn't started yet? */
3062 if (sk->sk_state == TCP_SYN_SENT)
3066 /* No write status requested, avoid expensive OUT tests. */
3067 if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
3070 writable = unix_writable(sk);
3072 unix_state_lock(sk);
3074 other = unix_peer(sk);
3075 if (other && unix_peer(other) != sk &&
3076 unix_recvq_full(other) &&
3077 unix_dgram_peer_wake_me(sk, other))
3080 unix_state_unlock(sk);
3084 mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
3086 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
3091 #ifdef CONFIG_PROC_FS
/* The /proc/net/unix iterator encodes its position (hash bucket +
 * 1-based offset within that bucket) into a single loff_t cursor:
 * the high bits select the bucket, the low BUCKET_SPACE bits hold
 * the offset. */
3093 #define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)
3095 #define get_bucket(x) ((x) >> BUCKET_SPACE)
3096 #define get_offset(x) ((x) & ((1L << BUCKET_SPACE) - 1))
3097 #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
/* Return the socket at position get_offset(*pos) within hash bucket
 * get_bucket(*pos), counting only sockets that belong to this
 * seq_file's network namespace.
 * NOTE(review): the declaration of @sk and the function's return
 * lines are elided in this chunk.
 */
3099 static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
3101 unsigned long offset = get_offset(*pos);
3102 unsigned long bucket = get_bucket(*pos);
3104 unsigned long count = 0;
3106 for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) {
/* skip sockets from other network namespaces */
3107 if (sock_net(sk) != seq_file_net(seq))
3109 if (++count == offset)
/* Advance the iterator to the next in-namespace socket after @sk,
 * walking into subsequent hash buckets once the current bucket is
 * exhausted, and update *pos accordingly.
 * NOTE(review): parameter list continuation and several body lines
 * are elided in this chunk.
 */
3116 static struct sock *unix_next_socket(struct seq_file *seq,
3120 unsigned long bucket;
/* sk > SEQ_START_TOKEN: we are resuming from a real socket */
3122 while (sk > (struct sock *)SEQ_START_TOKEN) {
3126 if (sock_net(sk) == seq_file_net(seq))
3131 sk = unix_from_bucket(seq, pos);
/* bucket exhausted: restart at offset 1 of the next bucket */
3136 bucket = get_bucket(*pos) + 1;
3137 *pos = set_bucket_offset(bucket, 1);
3138 } while (bucket < ARRAY_SIZE(unix_socket_table));
/* seq_file ->start: take unix_table_lock (held until ->stop) and
 * position the iterator; position 0 yields SEQ_START_TOKEN so
 * ->show can emit the header line. */
3143 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
3144 __acquires(unix_table_lock)
3146 spin_lock(&unix_table_lock);
3149 return SEQ_START_TOKEN;
/* cursor is past the last hash bucket: iteration complete */
3151 if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table))
3154 return unix_next_socket(seq, NULL, pos);
/* seq_file ->next: step the cursor to the following socket. */
3157 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3160 return unix_next_socket(seq, v, pos);
/* seq_file ->stop: drop the table lock taken in unix_seq_start(). */
3163 static void unix_seq_stop(struct seq_file *seq, void *v)
3164 __releases(unix_table_lock)
3166 spin_unlock(&unix_table_lock);
/* Emit one /proc/net/unix row: the column header for SEQ_START_TOKEN,
 * otherwise refcount/flags/type/state fields followed by the bound
 * pathname (if the socket has an address).
 * NOTE(review): several lines (header continuation, state lock
 * acquisition, some printf arguments, return) are elided in this
 * chunk.
 */
3169 static int unix_seq_show(struct seq_file *seq, void *v)
3172 if (v == SEQ_START_TOKEN)
3173 seq_puts(seq, "Num RefCount Protocol Flags Type St "
3177 struct unix_sock *u = unix_sk(s);
3180 seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
3182 refcount_read(&s->sk_refcnt),
3184 s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
/* Two alternative TCP_* -> SS_* state mappings; the selecting
 * condition between them is on a line elided from this chunk. */
3187 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
3188 (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
3191 if (u->addr) { // under unix_table_lock here
/* name length excludes the leading sun_family field */
3196 len = u->addr->len - sizeof(short);
3197 if (!UNIX_ABSTRACT(s))
3203 for ( ; i < len; i++)
/* the ?: substitutes a printable character for NUL bytes; the
 * replacement character is on an elided continuation line */
3204 seq_putc(seq, u->addr->name->sun_path[i] ?:
3207 unix_state_unlock(s);
3208 seq_putc(seq, '\n');
/* seq_file operations backing /proc/net/unix. */
3214 static const struct seq_operations unix_seq_ops = {
3215 .start = unix_seq_start,
3216 .next = unix_seq_next,
3217 .stop = unix_seq_stop,
3218 .show = unix_seq_show,
3221 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
/* Context structure handed to BPF programs attached to the "unix"
 * iterator: iteration metadata, the current socket, and its owner's
 * uid. */
3222 struct bpf_iter__unix {
3223 __bpf_md_ptr(struct bpf_iter_meta *, meta);
3224 __bpf_md_ptr(struct unix_sock *, unix_sk);
3225 uid_t uid __aligned(8);
/* Build a bpf_iter__unix context from the arguments and run @prog on
 * it.  seq_num is decremented first so BPF programs never see the
 * SEQ_START_TOKEN slot in their numbering. */
3228 static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
3229 struct unix_sock *unix_sk, uid_t uid)
3231 struct bpf_iter__unix ctx;
3233 meta->seq_num--; /* skip SEQ_START_TOKEN */
3235 ctx.unix_sk = unix_sk;
3237 return bpf_iter_run_prog(prog, &ctx);
/* seq_file ->show for the BPF iterator: resolve the socket owner's
 * uid in the reader's user namespace, then pass the element to the
 * attached BPF program.
 * NOTE(review): the uid declaration and the SEQ_START_TOKEN early
 * return are elided in this chunk.
 */
3240 static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
3242 struct bpf_iter_meta meta;
3243 struct bpf_prog *prog;
3244 struct sock *sk = v;
3247 if (v == SEQ_START_TOKEN)
/* translate kuid into the namespace of the seq_file's opener */
3250 uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
3252 prog = bpf_iter_get_info(&meta, false);
3253 return unix_prog_seq_show(prog, &meta, v, uid);
/* seq_file ->stop for the BPF iterator: run the program one final
 * time (return value deliberately ignored), then fall through to
 * unix_seq_stop() to release the table lock.
 * NOTE(review): guard lines between the declarations and the
 * bpf_iter_get_info() call are elided in this chunk.
 */
3256 static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
3258 struct bpf_iter_meta meta;
3259 struct bpf_prog *prog;
3263 prog = bpf_iter_get_info(&meta, true);
3265 (void)unix_prog_seq_show(prog, &meta, v, 0);
3268 unix_seq_stop(seq, v);
/* seq_file operations for the BPF "unix" iterator: reuses the
 * /proc/net/unix start/next, with BPF-aware show/stop. */
3271 static const struct seq_operations bpf_iter_unix_seq_ops = {
3272 .start = unix_seq_start,
3273 .next = unix_seq_next,
3274 .stop = bpf_iter_unix_seq_stop,
3275 .show = bpf_iter_unix_seq_show,
/* PF_UNIX protocol-family registration record; unix_create()
 * constructs new sockets for this family. */
3280 static const struct net_proto_family unix_family_ops = {
3282 .create = unix_create,
3283 .owner = THIS_MODULE,
/* Per-network-namespace init: set the default datagram queue length,
 * register sysctls, and (with CONFIG_PROC_FS) create the
 * /proc/net/unix entry.  Sysctls are rolled back if proc creation
 * fails.
 * NOTE(review): the goto/return and error-path labels are elided in
 * this chunk.
 */
3287 static int __net_init unix_net_init(struct net *net)
3289 int error = -ENOMEM;
3291 net->unx.sysctl_max_dgram_qlen = 10;
3292 if (unix_sysctl_register(net))
3295 #ifdef CONFIG_PROC_FS
3296 if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
3297 sizeof(struct seq_net_private))) {
/* proc entry failed: undo the sysctl registration above */
3298 unix_sysctl_unregister(net);
/* Per-network-namespace teardown: reverse unix_net_init(). */
3307 static void __net_exit unix_net_exit(struct net *net)
3309 unix_sysctl_unregister(net);
3310 remove_proc_entry("unix", net->proc_net);
/* Hooks invoked on network-namespace creation/destruction. */
3313 static struct pernet_operations unix_net_ops = {
3314 .init = unix_net_init,
3315 .exit = unix_net_exit,
3318 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
/* Declare the "unix" BPF iterator target with its program signature. */
3319 DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
3320 struct unix_sock *unix_sk, uid_t uid)
/* seq_file glue for the iterator; private state is the standard
 * per-netns seq private area. */
3322 static const struct bpf_iter_seq_info unix_seq_info = {
3323 .seq_ops = &bpf_iter_unix_seq_ops,
3324 .init_seq_private = bpf_iter_init_seq_net,
3325 .fini_seq_private = bpf_iter_fini_seq_net,
3326 .seq_priv_size = sizeof(struct seq_net_private),
/* Registration record: a single ctx argument (unix_sk) that may be
 * NULL (PTR_TO_BTF_ID_OR_NULL), e.g. on the final ->stop call. */
3329 static struct bpf_iter_reg unix_reg_info = {
3331 .ctx_arg_info_size = 1,
3333 { offsetof(struct bpf_iter__unix, unix_sk),
3334 PTR_TO_BTF_ID_OR_NULL },
3336 .seq_info = &unix_seq_info,
/* Fill in the BTF id for struct unix_sock and register the "unix"
 * iterator target; failure is only logged, not fatal. */
3339 static void __init bpf_iter_register(void)
3341 unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
3342 if (bpf_iter_reg_target(&unix_reg_info))
3343 pr_warn("Warning: could not register bpf iterator unix\n")
/* Module init: register the dgram and stream protos, the PF_UNIX
 * socket family, the per-netns operations, and (when built in with
 * BPF and procfs) the BPF iterator.
 * NOTE(review): the rc declaration, goto/error-path lines and the
 * final return are elided in this chunk.
 */
3347 static int __init af_unix_init(void)
/* unix_skb_parms must fit inside skb->cb */
3351 BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
3353 rc = proto_register(&unix_dgram_proto, 1);
3355 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3359 rc = proto_register(&unix_stream_proto, 1);
3361 pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
3365 sock_register(&unix_family_ops);
3366 register_pernet_subsys(&unix_net_ops);
3367 unix_bpf_build_proto();
3369 #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
3370 bpf_iter_register();
/* Module exit: unwind the registrations made in af_unix_init(). */
3377 static void __exit af_unix_exit(void)
3379 sock_unregister(PF_UNIX);
3380 proto_unregister(&unix_dgram_proto);
3381 proto_unregister(&unix_stream_proto);
3382 unregister_pernet_subsys(&unix_net_ops);
3385 /* Earlier than device_initcall() so that other drivers invoking
3386 request_module() don't end up in a loop when modprobe tries
3387 to use a UNIX socket. But later than subsys_initcall() because
3388 we depend on stuff initialised there */
/* Module entry/exit hooks and metadata; see the initcall-ordering
 * comment directly above for why fs_initcall() is used. */
3389 fs_initcall(af_unix_init);
3390 module_exit(af_unix_exit);
3392 MODULE_LICENSE("GPL");
3393 MODULE_ALIAS_NETPROTO(PF_UNIX);