1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * NET4: Implementation of BSD Unix domain sockets.
5 * Authors: Alan Cox, <alan@lxorguk.ukuu.org.uk>
8 * Linus Torvalds : Assorted bug cures.
9 * Niibe Yutaka : async I/O support.
10 * Carsten Paeth : PF_UNIX check, address fixes.
11 * Alan Cox : Limit size of allocated blocks.
12 * Alan Cox : Fixed the stupid socketpair bug.
13 * Alan Cox : BSD compatibility fine tuning.
14 * Alan Cox : Fixed a bug in connect when interrupted.
15 * Alan Cox : Sorted out a proper draft version of
16 * file descriptor passing hacked up from
18 * Marty Leisner : Fixes to fd passing
19 * Nick Nevin : recvmsg bugfix.
20 * Alan Cox : Started proper garbage collector
21 * Heiko Eißfeldt : Missing verify_area check
22 * Alan Cox : Started POSIXisms
23 * Andreas Schwab : Replace inode by dentry for proper
24 * reference counting.
25 * Kirk Petersen : Made this a module
26 * Christoph Rohland : Elegant non-blocking accept/connect algorithm.
28 * Alexey Kuznetsov : Repaired (I hope) bugs introduced
29 * by above two patches.
30 * Andrea Arcangeli : If possible we block in connect(2)
31 * if the max backlog of the listen socket
32 * has been reached. This won't break
33 * old apps and it avoids hashing a
34 * huge number of socks (for unix_gc()
35 * performance reasons).
36 * Security fix that limits the max
37 * number of socks to 2*max_files and
38 * the number of skbs queueable in the
39 * dgram receiver.
40 * Artur Skawina : Hash function optimizations
41 * Alexey Kuznetsov : Full scale SMP. Lot of bugs are introduced 8)
42 * Malcolm Beattie : Set peercred for socketpair
43 * Michal Ostrowski : Module initialization cleanup.
44 * Arnaldo C. Melo : Remove MOD_{INC,DEC}_USE_COUNT,
45 * the core infrastructure is doing that
46 * for all net proto families now (2.5.69+)
48 * Known differences from reference BSD that was tested:
51 * ECONNREFUSED is not returned from one end of a connected() socket to the
52 * other the moment one end closes.
53 * fstat() doesn't return st_dev=0, give the blksize as a high water mark,
54 * or return a fake inode identifier (nor does it have the BSD first-socket fstat-twice bug).
56 * accept() returns a path name even if the connecting socket has closed
57 * in the meantime (BSD loses the path and gives up).
58 * accept() returns 0 length path for an unbound connector. BSD returns 16
59 * and a null first byte in the path (but not for gethost/peername - BSD bug ??)
60 * socketpair(...SOCK_RAW..) doesn't panic the kernel.
61 * BSD af_unix's connect apparently forgets to block properly.
62 * (need to check this with the POSIX spec in detail)
64 * Differences from 2.0.0-11-... (ANK)
65 * Bug fixes and improvements.
66 * - client shutdown killed server socket.
67 * - removed all useless cli/sti pairs.
69 * Semantic changes/extensions.
70 * - generic control message passing.
71 * - SCM_CREDENTIALS control message.
72 * - "Abstract" (not FS based) socket bindings.
73 * Abstract names are sequences of bytes (not zero terminated)
74 * started by 0, so that this name space does not intersect
75 * with BSD names.
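/* Illustrative userspace sketch (not part of this file; the helper name
 * and the name "example" are made up): binding an abstract socket as
 * described above.  sun_path starts with a NUL byte and the address
 * length covers only the bytes actually used.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	int bind_abstract(int fd)
 *	{
 *		struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *
 *		sun.sun_path[0] = '\0';			// abstract marker
 *		memcpy(sun.sun_path + 1, "example", 7);	// no trailing NUL
 *		return bind(fd, (struct sockaddr *)&sun,
 *			    offsetof(struct sockaddr_un, sun_path) + 1 + 7);
 *	}
 */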
78 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
80 #include <linux/module.h>
81 #include <linux/kernel.h>
82 #include <linux/signal.h>
83 #include <linux/sched/signal.h>
84 #include <linux/errno.h>
85 #include <linux/string.h>
86 #include <linux/stat.h>
87 #include <linux/dcache.h>
88 #include <linux/namei.h>
89 #include <linux/socket.h>
91 #include <linux/fcntl.h>
92 #include <linux/filter.h>
93 #include <linux/termios.h>
94 #include <linux/sockios.h>
95 #include <linux/net.h>
98 #include <linux/slab.h>
99 #include <linux/uaccess.h>
100 #include <linux/skbuff.h>
101 #include <linux/netdevice.h>
102 #include <net/net_namespace.h>
103 #include <net/sock.h>
104 #include <net/tcp_states.h>
105 #include <net/af_unix.h>
106 #include <linux/proc_fs.h>
107 #include <linux/seq_file.h>
109 #include <linux/init.h>
110 #include <linux/poll.h>
111 #include <linux/rtnetlink.h>
112 #include <linux/mount.h>
113 #include <net/checksum.h>
114 #include <linux/security.h>
115 #include <linux/freezer.h>
116 #include <linux/file.h>
117 #include <linux/btf_ids.h>
121 static atomic_long_t unix_nr_socks;
122 static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
123 static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
125 /* SMP locking strategy:
126 * the hash table is protected by a spinlock.
127 * each socket's state is protected by a separate spinlock.
130 static unsigned int unix_unbound_hash(struct sock *sk)
132 unsigned long hash = (unsigned long)sk;
134 hash ^= hash >> 16;
135 hash ^= hash >> 8;
136 hash ^= sk->sk_type;
138 return hash & UNIX_HASH_MOD;
141 static unsigned int unix_bsd_hash(struct inode *i)
143 return i->i_ino & UNIX_HASH_MOD;
146 static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
147 int addr_len, int type)
149 __wsum csum = csum_partial(sunaddr, addr_len, 0);
150 unsigned int hash;
152 hash = (__force unsigned int)csum_fold(csum);
153 hash ^= hash >> 8;
154 hash ^= type;
156 return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
159 static void unix_table_double_lock(struct net *net,
160 unsigned int hash1, unsigned int hash2)
162 if (hash1 == hash2) {
163 spin_lock(&net->unx.table.locks[hash1]);
164 return;
167 if (hash1 > hash2)
168 swap(hash1, hash2);
170 spin_lock(&net->unx.table.locks[hash1]);
171 spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING);
174 static void unix_table_double_unlock(struct net *net,
175 unsigned int hash1, unsigned int hash2)
177 if (hash1 == hash2) {
178 spin_unlock(&net->unx.table.locks[hash1]);
179 return;
182 spin_unlock(&net->unx.table.locks[hash1]);
183 spin_unlock(&net->unx.table.locks[hash2]);
186 #ifdef CONFIG_SECURITY_NETWORK
187 static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
189 UNIXCB(skb).secid = scm->secid;
192 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
194 scm->secid = UNIXCB(skb).secid;
197 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
199 return (scm->secid == UNIXCB(skb).secid);
202 static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
205 static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
208 static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
212 #endif /* CONFIG_SECURITY_NETWORK */
214 #define unix_peer(sk) (unix_sk(sk)->peer)
216 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
218 return unix_peer(osk) == sk;
221 static inline int unix_may_send(struct sock *sk, struct sock *osk)
223 return unix_peer(osk) == NULL || unix_our_peer(sk, osk);
226 static inline int unix_recvq_full(const struct sock *sk)
228 return skb_queue_len(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
231 static inline int unix_recvq_full_lockless(const struct sock *sk)
233 return skb_queue_len_lockless(&sk->sk_receive_queue) >
234 READ_ONCE(sk->sk_max_ack_backlog);
237 struct sock *unix_peer_get(struct sock *s)
245 unix_state_unlock(s);
248 EXPORT_SYMBOL_GPL(unix_peer_get);
250 static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
253 struct unix_address *addr;
255 addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
259 refcount_set(&addr->refcnt, 1);
260 addr->len = addr_len;
261 memcpy(addr->name, sunaddr, addr_len);
266 static inline void unix_release_addr(struct unix_address *addr)
268 if (refcount_dec_and_test(&addr->refcnt))
273 * Check unix socket name:
274 * - should not be zero length.
275 * - if it starts with a non-zero byte, it should be NUL-terminated (FS object)
276 * - if it starts with a zero byte, it is an abstract name.
279 static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
281 if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
282 addr_len > sizeof(*sunaddr))
283 return -EINVAL;
285 if (sunaddr->sun_family != AF_UNIX)
286 return -EINVAL;
288 return 0;
291 static void unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
293 /* This may look like an off by one error but it is a bit more
294 * subtle. 108 is the longest valid AF_UNIX path for a binding.
295 * sun_path[108] doesn't as such exist. However in kernel space
296 * we are guaranteed that it is a valid memory location in our
297 * kernel address buffer because syscall functions always pass
298 * a pointer of struct sockaddr_storage which has a bigger buffer
299 * than 108.
301 ((char *)sunaddr)[addr_len] = 0;
304 static void __unix_remove_socket(struct sock *sk)
306 sk_del_node_init(sk);
309 static void __unix_insert_socket(struct net *net, struct sock *sk)
311 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
312 sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]);
315 static void __unix_set_addr_hash(struct net *net, struct sock *sk,
316 struct unix_address *addr, unsigned int hash)
318 __unix_remove_socket(sk);
319 smp_store_release(&unix_sk(sk)->addr, addr);
322 __unix_insert_socket(net, sk);
325 static void unix_remove_socket(struct net *net, struct sock *sk)
327 spin_lock(&net->unx.table.locks[sk->sk_hash]);
328 __unix_remove_socket(sk);
329 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
332 static void unix_insert_unbound_socket(struct net *net, struct sock *sk)
334 spin_lock(&net->unx.table.locks[sk->sk_hash]);
335 __unix_insert_socket(net, sk);
336 spin_unlock(&net->unx.table.locks[sk->sk_hash]);
339 static void unix_insert_bsd_socket(struct sock *sk)
341 spin_lock(&bsd_socket_locks[sk->sk_hash]);
342 sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]);
343 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
346 static void unix_remove_bsd_socket(struct sock *sk)
348 if (!hlist_unhashed(&sk->sk_bind_node)) {
349 spin_lock(&bsd_socket_locks[sk->sk_hash]);
350 __sk_del_bind_node(sk);
351 spin_unlock(&bsd_socket_locks[sk->sk_hash]);
353 sk_node_init(&sk->sk_bind_node);
357 static struct sock *__unix_find_socket_byname(struct net *net,
358 struct sockaddr_un *sunname,
359 int len, unsigned int hash)
363 sk_for_each(s, &net->unx.table.buckets[hash]) {
364 struct unix_sock *u = unix_sk(s);
366 if (u->addr->len == len &&
367 !memcmp(u->addr->name, sunname, len))
368 return s;
371 return NULL;
373 static inline struct sock *unix_find_socket_byname(struct net *net,
374 struct sockaddr_un *sunname,
375 int len, unsigned int hash)
379 spin_lock(&net->unx.table.locks[hash]);
380 s = __unix_find_socket_byname(net, sunname, len, hash);
383 spin_unlock(&net->unx.table.locks[hash]);
387 static struct sock *unix_find_socket_byinode(struct inode *i)
389 unsigned int hash = unix_bsd_hash(i);
392 spin_lock(&bsd_socket_locks[hash]);
393 sk_for_each_bound(s, &bsd_socket_buckets[hash]) {
394 struct dentry *dentry = unix_sk(s)->path.dentry;
396 if (dentry && d_backing_inode(dentry) == i) {
398 spin_unlock(&bsd_socket_locks[hash]);
402 spin_unlock(&bsd_socket_locks[hash]);
406 /* Support code for asymmetrically connected dgram sockets
408 * If a datagram socket is connected to a socket not itself connected
409 * to the first socket (eg, /dev/log), clients may only enqueue more
410 * messages if the present receive queue of the server socket is not
411 * "too large". This means there's a second writeability condition
412 * poll and sendmsg need to test. The dgram recv code will do a wake
413 * up on the peer_wait wait queue of a socket upon reception of a
414 * datagram which needs to be propagated to sleeping would-be writers
415 * since these might not have sent anything so far. This can't be
416 * accomplished via poll_wait because the lifetime of the server
417 * socket might be less than that of its clients if these break their
418 * association with it or if the server socket is closed while clients
419 * are still connected to it and there's no way to inform "a polling
420 * implementation" that it should let go of a certain wait queue
422 * In order to propagate a wake up, a wait_queue_entry_t of the client
423 * socket is enqueued on the peer_wait queue of the server socket
424 * whose wake function does a wake_up on the ordinary client socket
425 * wait queue. This connection is established whenever a write (or
426 * poll for write) hits the flow control condition and broken when the
427 * association to the server socket is dissolved or after a wake up
428 * was relayed.
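/* Illustrative userspace sketch (names made up) of the second
 * writeability condition described above: poll() on a connected dgram
 * socket reflects the receiver's queue, not just our own send buffer.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *
 *	connect(fd, ...);	// e.g. a /dev/log-style receiver
 *	poll(&pfd, 1, -1);	// blocks while the receiver's queue is
 *				// full; the peer_wait relay below wakes
 *				// us once the receiver drains it
 *	send(fd, buf, len, MSG_DONTWAIT);
 */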
431 static int unix_dgram_peer_wake_relay(wait_queue_entry_t *q, unsigned mode, int flags,
435 wait_queue_head_t *u_sleep;
437 u = container_of(q, struct unix_sock, peer_wake);
439 __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
441 u->peer_wake.private = NULL;
443 /* relaying can only happen while the wq still exists */
444 u_sleep = sk_sleep(&u->sk);
446 wake_up_interruptible_poll(u_sleep, key_to_poll(key));
451 static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
453 struct unix_sock *u, *u_other;
457 u_other = unix_sk(other);
459 spin_lock(&u_other->peer_wait.lock);
461 if (!u->peer_wake.private) {
462 u->peer_wake.private = other;
463 __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
468 spin_unlock(&u_other->peer_wait.lock);
472 static void unix_dgram_peer_wake_disconnect(struct sock *sk,
475 struct unix_sock *u, *u_other;
478 u_other = unix_sk(other);
479 spin_lock(&u_other->peer_wait.lock);
481 if (u->peer_wake.private == other) {
482 __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
483 u->peer_wake.private = NULL;
486 spin_unlock(&u_other->peer_wait.lock);
489 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
492 unix_dgram_peer_wake_disconnect(sk, other);
493 wake_up_interruptible_poll(sk_sleep(sk),
500 * - unix_peer(sk) == other
501 * - association is stable
503 static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
507 connected = unix_dgram_peer_wake_connect(sk, other);
509 /* If other is SOCK_DEAD, we want to make sure we signal
510 * POLLOUT, such that a subsequent write() can get a
511 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
512 * to other and it's full, we will hang waiting for POLLOUT.
514 if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD))
515 return 1;
517 if (connected)
518 unix_dgram_peer_wake_disconnect(sk, other);
520 return 0;
523 static int unix_writable(const struct sock *sk)
525 return sk->sk_state != TCP_LISTEN &&
526 (refcount_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
529 static void unix_write_space(struct sock *sk)
531 struct socket_wq *wq;
534 if (unix_writable(sk)) {
535 wq = rcu_dereference(sk->sk_wq);
536 if (skwq_has_sleeper(wq))
537 wake_up_interruptible_sync_poll(&wq->wait,
538 EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND);
539 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
544 /* When a dgram socket disconnects (or changes its peer), we clear its
545 * receive queue of packets that arrived from the previous peer. First, this
546 * allows us to do flow control based only on wmem_alloc; second, sk connected
547 * to peer may receive messages only from that peer. */
548 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
550 if (!skb_queue_empty(&sk->sk_receive_queue)) {
551 skb_queue_purge(&sk->sk_receive_queue);
552 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
554 /* If one link of a bidirectional dgram pipe is disconnected,
555 * we signal an error. Messages are lost. Do not do this
556 * when the peer was not connected to us.
558 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
559 other->sk_err = ECONNRESET;
560 sk_error_report(other);
563 other->sk_state = TCP_CLOSE;
566 static void unix_sock_destructor(struct sock *sk)
568 struct unix_sock *u = unix_sk(sk);
570 skb_queue_purge(&sk->sk_receive_queue);
572 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
574 kfree_skb(u->oob_skb);
578 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
579 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
580 DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
581 if (!sock_flag(sk, SOCK_DEAD)) {
582 pr_info("Attempt to release alive unix socket: %p\n", sk);
587 unix_release_addr(u->addr);
589 atomic_long_dec(&unix_nr_socks);
590 sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
591 #ifdef UNIX_REFCNT_DEBUG
592 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk,
593 atomic_long_read(&unix_nr_socks));
597 static void unix_release_sock(struct sock *sk, int embrion)
599 struct unix_sock *u = unix_sk(sk);
605 unix_remove_socket(sock_net(sk), sk);
606 unix_remove_bsd_socket(sk);
611 sk->sk_shutdown = SHUTDOWN_MASK;
613 u->path.dentry = NULL;
615 state = sk->sk_state;
616 sk->sk_state = TCP_CLOSE;
618 skpair = unix_peer(sk);
619 unix_peer(sk) = NULL;
621 unix_state_unlock(sk);
623 wake_up_interruptible_all(&u->peer_wait);
625 if (skpair != NULL) {
626 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
627 unix_state_lock(skpair);
629 skpair->sk_shutdown = SHUTDOWN_MASK;
630 if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
631 skpair->sk_err = ECONNRESET;
632 unix_state_unlock(skpair);
633 skpair->sk_state_change(skpair);
634 sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
637 unix_dgram_peer_wake_disconnect(sk, skpair);
638 sock_put(skpair); /* It may now die */
641 /* Try to flush out this socket. Throw out buffers at least */
643 while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
644 if (state == TCP_LISTEN)
645 unix_release_sock(skb->sk, 1);
646 /* passed fds are erased in the kfree_skb hook */
647 UNIXCB(skb).consumed = skb->len;
656 /* ---- Socket is dead now and most probably destroyed ---- */
659 * Fixme: BSD difference: In BSD all sockets connected to us get
660 * ECONNRESET and we die on the spot. In Linux we behave
661 * like files and pipes do and wait for the last
662 * dereference.
664 * Can't we simply set sock->err?
666 * What is the above comment talking about? --ANK(980817)
669 if (unix_tot_inflight)
670 unix_gc(); /* Garbage collect fds */
673 static void init_peercred(struct sock *sk)
675 const struct cred *old_cred;
678 spin_lock(&sk->sk_peer_lock);
679 old_pid = sk->sk_peer_pid;
680 old_cred = sk->sk_peer_cred;
681 sk->sk_peer_pid = get_pid(task_tgid(current));
682 sk->sk_peer_cred = get_current_cred();
683 spin_unlock(&sk->sk_peer_lock);
689 static void copy_peercred(struct sock *sk, struct sock *peersk)
691 const struct cred *old_cred;
695 spin_lock(&sk->sk_peer_lock);
696 spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING);
698 spin_lock(&peersk->sk_peer_lock);
699 spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING);
701 old_pid = sk->sk_peer_pid;
702 old_cred = sk->sk_peer_cred;
703 sk->sk_peer_pid = get_pid(peersk->sk_peer_pid);
704 sk->sk_peer_cred = get_cred(peersk->sk_peer_cred);
706 spin_unlock(&sk->sk_peer_lock);
707 spin_unlock(&peersk->sk_peer_lock);
713 static int unix_listen(struct socket *sock, int backlog)
716 struct sock *sk = sock->sk;
717 struct unix_sock *u = unix_sk(sk);
720 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
721 goto out; /* Only stream/seqpacket sockets accept */
724 goto out; /* No listens on an unbound socket */
726 if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
728 if (backlog > sk->sk_max_ack_backlog)
729 wake_up_interruptible_all(&u->peer_wait);
730 sk->sk_max_ack_backlog = backlog;
731 sk->sk_state = TCP_LISTEN;
732 /* set credentials so connect can copy them */
737 unix_state_unlock(sk);
742 static int unix_release(struct socket *);
743 static int unix_bind(struct socket *, struct sockaddr *, int);
744 static int unix_stream_connect(struct socket *, struct sockaddr *,
745 int addr_len, int flags);
746 static int unix_socketpair(struct socket *, struct socket *);
747 static int unix_accept(struct socket *, struct socket *, int, bool);
748 static int unix_getname(struct socket *, struct sockaddr *, int);
749 static __poll_t unix_poll(struct file *, struct socket *, poll_table *);
750 static __poll_t unix_dgram_poll(struct file *, struct socket *,
752 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
754 static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg);
756 static int unix_shutdown(struct socket *, int);
757 static int unix_stream_sendmsg(struct socket *, struct msghdr *, size_t);
758 static int unix_stream_recvmsg(struct socket *, struct msghdr *, size_t, int);
759 static ssize_t unix_stream_sendpage(struct socket *, struct page *, int offset,
760 size_t size, int flags);
761 static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos,
762 struct pipe_inode_info *, size_t size,
764 static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t);
765 static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int);
766 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
767 static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor);
768 static int unix_dgram_connect(struct socket *, struct sockaddr *,
770 static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t);
771 static int unix_seqpacket_recvmsg(struct socket *, struct msghdr *, size_t,
774 static int unix_set_peek_off(struct sock *sk, int val)
776 struct unix_sock *u = unix_sk(sk);
778 if (mutex_lock_interruptible(&u->iolock))
779 return -EINTR;
781 sk->sk_peek_off = val;
782 mutex_unlock(&u->iolock);
784 return 0;
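/* Illustrative userspace sketch of the peek offset managed above; a
 * hedged example, not a complete program.
 *
 *	int off = 0;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PEEK_OFF, &off, sizeof(off));
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 0..15
 *	recv(fd, buf, 16, MSG_PEEK);	// peeks bytes 16..31
 *	recv(fd, buf, 32, 0);		// consuming data moves the peek
 *					// offset back accordingly
 */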
787 #ifdef CONFIG_PROC_FS
788 static void unix_show_fdinfo(struct seq_file *m, struct socket *sock)
790 struct sock *sk = sock->sk;
794 u = unix_sk(sock->sk);
795 seq_printf(m, "scm_fds: %u\n",
796 atomic_read(&u->scm_stat.nr_fds));
800 #define unix_show_fdinfo NULL
803 static const struct proto_ops unix_stream_ops = {
805 .owner = THIS_MODULE,
806 .release = unix_release,
808 .connect = unix_stream_connect,
809 .socketpair = unix_socketpair,
810 .accept = unix_accept,
811 .getname = unix_getname,
815 .compat_ioctl = unix_compat_ioctl,
817 .listen = unix_listen,
818 .shutdown = unix_shutdown,
819 .sendmsg = unix_stream_sendmsg,
820 .recvmsg = unix_stream_recvmsg,
821 .read_skb = unix_stream_read_skb,
822 .mmap = sock_no_mmap,
823 .sendpage = unix_stream_sendpage,
824 .splice_read = unix_stream_splice_read,
825 .set_peek_off = unix_set_peek_off,
826 .show_fdinfo = unix_show_fdinfo,
829 static const struct proto_ops unix_dgram_ops = {
831 .owner = THIS_MODULE,
832 .release = unix_release,
834 .connect = unix_dgram_connect,
835 .socketpair = unix_socketpair,
836 .accept = sock_no_accept,
837 .getname = unix_getname,
838 .poll = unix_dgram_poll,
841 .compat_ioctl = unix_compat_ioctl,
843 .listen = sock_no_listen,
844 .shutdown = unix_shutdown,
845 .sendmsg = unix_dgram_sendmsg,
846 .read_skb = unix_read_skb,
847 .recvmsg = unix_dgram_recvmsg,
848 .mmap = sock_no_mmap,
849 .sendpage = sock_no_sendpage,
850 .set_peek_off = unix_set_peek_off,
851 .show_fdinfo = unix_show_fdinfo,
854 static const struct proto_ops unix_seqpacket_ops = {
856 .owner = THIS_MODULE,
857 .release = unix_release,
859 .connect = unix_stream_connect,
860 .socketpair = unix_socketpair,
861 .accept = unix_accept,
862 .getname = unix_getname,
863 .poll = unix_dgram_poll,
866 .compat_ioctl = unix_compat_ioctl,
868 .listen = unix_listen,
869 .shutdown = unix_shutdown,
870 .sendmsg = unix_seqpacket_sendmsg,
871 .recvmsg = unix_seqpacket_recvmsg,
872 .mmap = sock_no_mmap,
873 .sendpage = sock_no_sendpage,
874 .set_peek_off = unix_set_peek_off,
875 .show_fdinfo = unix_show_fdinfo,
878 static void unix_close(struct sock *sk, long timeout)
880 /* Nothing to do here, unix socket does not need a ->close().
881 * This is merely for sockmap.
885 static void unix_unhash(struct sock *sk)
887 /* Nothing to do here, unix socket does not need a ->unhash().
888 * This is merely for sockmap.
892 struct proto unix_dgram_proto = {
894 .owner = THIS_MODULE,
895 .obj_size = sizeof(struct unix_sock),
897 #ifdef CONFIG_BPF_SYSCALL
898 .psock_update_sk_prot = unix_dgram_bpf_update_proto,
902 struct proto unix_stream_proto = {
903 .name = "UNIX-STREAM",
904 .owner = THIS_MODULE,
905 .obj_size = sizeof(struct unix_sock),
907 .unhash = unix_unhash,
908 #ifdef CONFIG_BPF_SYSCALL
909 .psock_update_sk_prot = unix_stream_bpf_update_proto,
913 static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type)
919 atomic_long_inc(&unix_nr_socks);
920 if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) {
925 if (type == SOCK_STREAM)
926 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern);
927 else /* dgram and seqpacket */
928 sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern);
935 sock_init_data(sock, sk);
937 sk->sk_hash = unix_unbound_hash(sk);
938 sk->sk_allocation = GFP_KERNEL_ACCOUNT;
939 sk->sk_write_space = unix_write_space;
940 sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
941 sk->sk_destruct = unix_sock_destructor;
943 u->path.dentry = NULL;
945 spin_lock_init(&u->lock);
946 atomic_long_set(&u->inflight, 0);
947 INIT_LIST_HEAD(&u->link);
948 mutex_init(&u->iolock); /* single task reading lock */
949 mutex_init(&u->bindlock); /* single task binding lock */
950 init_waitqueue_head(&u->peer_wait);
951 init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
952 memset(&u->scm_stat, 0, sizeof(struct scm_stat));
953 unix_insert_unbound_socket(net, sk);
955 sock_prot_inuse_add(net, sk->sk_prot, 1);
960 atomic_long_dec(&unix_nr_socks);
964 static int unix_create(struct net *net, struct socket *sock, int protocol,
969 if (protocol && protocol != PF_UNIX)
970 return -EPROTONOSUPPORT;
972 sock->state = SS_UNCONNECTED;
974 switch (sock->type) {
976 sock->ops = &unix_stream_ops;
979 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
980 * nothing uses it.
983 sock->type = SOCK_DGRAM;
986 sock->ops = &unix_dgram_ops;
989 sock->ops = &unix_seqpacket_ops;
992 return -ESOCKTNOSUPPORT;
995 sk = unix_create1(net, sock, kern, sock->type);
1002 static int unix_release(struct socket *sock)
1004 struct sock *sk = sock->sk;
1009 sk->sk_prot->close(sk, 0);
1010 unix_release_sock(sk, 0);
1016 static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len,
1019 struct inode *inode;
1024 unix_mkname_bsd(sunaddr, addr_len);
1025 err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path);
1029 err = path_permission(&path, MAY_WRITE);
1033 err = -ECONNREFUSED;
1034 inode = d_backing_inode(path.dentry);
1035 if (!S_ISSOCK(inode->i_mode))
1038 sk = unix_find_socket_byinode(inode);
1043 if (sk->sk_type == type)
1057 return ERR_PTR(err);
1060 static struct sock *unix_find_abstract(struct net *net,
1061 struct sockaddr_un *sunaddr,
1062 int addr_len, int type)
1064 unsigned int hash = unix_abstract_hash(sunaddr, addr_len, type);
1065 struct dentry *dentry;
1068 sk = unix_find_socket_byname(net, sunaddr, addr_len, hash);
1070 return ERR_PTR(-ECONNREFUSED);
1072 dentry = unix_sk(sk)->path.dentry;
1074 touch_atime(&unix_sk(sk)->path);
1079 static struct sock *unix_find_other(struct net *net,
1080 struct sockaddr_un *sunaddr,
1081 int addr_len, int type)
1085 if (sunaddr->sun_path[0])
1086 sk = unix_find_bsd(sunaddr, addr_len, type);
1088 sk = unix_find_abstract(net, sunaddr, addr_len, type);
1093 static int unix_autobind(struct sock *sk)
1095 unsigned int new_hash, old_hash = sk->sk_hash;
1096 struct unix_sock *u = unix_sk(sk);
1097 struct net *net = sock_net(sk);
1098 struct unix_address *addr;
1099 u32 lastnum, ordernum;
1102 err = mutex_lock_interruptible(&u->bindlock);
1110 addr = kzalloc(sizeof(*addr) +
1111 offsetof(struct sockaddr_un, sun_path) + 16, GFP_KERNEL);
1115 addr->len = offsetof(struct sockaddr_un, sun_path) + 6;
1116 addr->name->sun_family = AF_UNIX;
1117 refcount_set(&addr->refcnt, 1);
1119 ordernum = prandom_u32();
1120 lastnum = ordernum & 0xFFFFF;
1122 ordernum = (ordernum + 1) & 0xFFFFF;
1123 sprintf(addr->name->sun_path + 1, "%05x", ordernum);
1125 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1126 unix_table_double_lock(net, old_hash, new_hash);
1128 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) {
1129 unix_table_double_unlock(net, old_hash, new_hash);
1131 /* __unix_find_socket_byname() may take a long time if many names
1132 * are already in use.
1136 if (ordernum == lastnum) {
1137 /* Give up if all names seem to be in use. */
1138 err = -ENOSPC;
1139 unix_release_addr(addr);
1140 goto out;
1146 __unix_set_addr_hash(net, sk, addr, new_hash);
1147 unix_table_double_unlock(net, old_hash, new_hash);
1150 out: mutex_unlock(&u->bindlock);
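/* Illustrative userspace sketch: autobind as implemented above is
 * triggered by binding with only the address family (see unix_bind()
 * below); the kernel picks a free 5-hex-digit abstract name.
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len = sizeof(sa_family_t);
 *
 *	bind(fd, (struct sockaddr *)&sun, len);
 *	getsockname(fd, (struct sockaddr *)&sun, &len);
 *	// sun.sun_path now holds a NUL byte plus five hex digits; len == 8
 */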
1154 static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
1157 umode_t mode = S_IFSOCK |
1158 (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask());
1159 unsigned int new_hash, old_hash = sk->sk_hash;
1160 struct unix_sock *u = unix_sk(sk);
1161 struct net *net = sock_net(sk);
1162 struct user_namespace *ns; // barf...
1163 struct unix_address *addr;
1164 struct dentry *dentry;
1168 unix_mkname_bsd(sunaddr, addr_len);
1169 addr_len = strlen(sunaddr->sun_path) +
1170 offsetof(struct sockaddr_un, sun_path) + 1;
1172 addr = unix_create_addr(sunaddr, addr_len);
1177 * Get the parent directory, calculate the hash for last
1178 * component.
1180 dentry = kern_path_create(AT_FDCWD, addr->name->sun_path, &parent, 0);
1181 if (IS_ERR(dentry)) {
1182 err = PTR_ERR(dentry);
1187 * All right, let's create it.
1189 ns = mnt_user_ns(parent.mnt);
1190 err = security_path_mknod(&parent, dentry, mode, 0);
1192 err = vfs_mknod(ns, d_inode(parent.dentry), dentry, mode, 0);
1195 err = mutex_lock_interruptible(&u->bindlock);
1201 new_hash = unix_bsd_hash(d_backing_inode(dentry));
1202 unix_table_double_lock(net, old_hash, new_hash);
1203 u->path.mnt = mntget(parent.mnt);
1204 u->path.dentry = dget(dentry);
1205 __unix_set_addr_hash(net, sk, addr, new_hash);
1206 unix_table_double_unlock(net, old_hash, new_hash);
1207 unix_insert_bsd_socket(sk);
1208 mutex_unlock(&u->bindlock);
1209 done_path_create(&parent, dentry);
1213 mutex_unlock(&u->bindlock);
1216 /* failed after successful mknod? unlink what we'd created... */
1217 vfs_unlink(ns, d_inode(parent.dentry), dentry, NULL);
1219 done_path_create(&parent, dentry);
1221 unix_release_addr(addr);
1222 return err == -EEXIST ? -EADDRINUSE : err;
1225 static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr,
1228 unsigned int new_hash, old_hash = sk->sk_hash;
1229 struct unix_sock *u = unix_sk(sk);
1230 struct net *net = sock_net(sk);
1231 struct unix_address *addr;
1234 addr = unix_create_addr(sunaddr, addr_len);
1238 err = mutex_lock_interruptible(&u->bindlock);
1247 new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type);
1248 unix_table_double_lock(net, old_hash, new_hash);
1250 if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash))
1253 __unix_set_addr_hash(net, sk, addr, new_hash);
1254 unix_table_double_unlock(net, old_hash, new_hash);
1255 mutex_unlock(&u->bindlock);
1259 unix_table_double_unlock(net, old_hash, new_hash);
1262 mutex_unlock(&u->bindlock);
1264 unix_release_addr(addr);
1268 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1270 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1271 struct sock *sk = sock->sk;
1274 if (addr_len == offsetof(struct sockaddr_un, sun_path) &&
1275 sunaddr->sun_family == AF_UNIX)
1276 return unix_autobind(sk);
1278 err = unix_validate_addr(sunaddr, addr_len);
1282 if (sunaddr->sun_path[0])
1283 err = unix_bind_bsd(sk, sunaddr, addr_len);
1285 err = unix_bind_abstract(sk, sunaddr, addr_len);
1290 static void unix_state_double_lock(struct sock *sk1, struct sock *sk2)
1292 if (unlikely(sk1 == sk2) || !sk2) {
1293 unix_state_lock(sk1);
1297 unix_state_lock(sk1);
1298 unix_state_lock_nested(sk2);
1300 unix_state_lock(sk2);
1301 unix_state_lock_nested(sk1);
1305 static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2)
1307 if (unlikely(sk1 == sk2) || !sk2) {
1308 unix_state_unlock(sk1);
1311 unix_state_unlock(sk1);
1312 unix_state_unlock(sk2);
1315 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
1316 int alen, int flags)
1318 struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr;
1319 struct sock *sk = sock->sk;
1324 if (alen < offsetofend(struct sockaddr, sa_family))
1327 if (addr->sa_family != AF_UNSPEC) {
1328 err = unix_validate_addr(sunaddr, alen);
1332 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
1333 !unix_sk(sk)->addr) {
1334 err = unix_autobind(sk);
1340 other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type);
1341 if (IS_ERR(other)) {
1342 err = PTR_ERR(other);
1346 unix_state_double_lock(sk, other);
1348 /* Apparently VFS overslept socket death. Retry. */
1349 if (sock_flag(other, SOCK_DEAD)) {
1350 unix_state_double_unlock(sk, other);
1356 if (!unix_may_send(sk, other))
1359 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1363 sk->sk_state = other->sk_state = TCP_ESTABLISHED;
1366 * 1003.1g breaking connected state with AF_UNSPEC
1369 unix_state_double_lock(sk, other);
1373 * If it was connected, reconnect.
1375 if (unix_peer(sk)) {
1376 struct sock *old_peer = unix_peer(sk);
1378 unix_peer(sk) = other;
1380 sk->sk_state = TCP_CLOSE;
1381 unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
1383 unix_state_double_unlock(sk, other);
1385 if (other != old_peer)
1386 unix_dgram_disconnected(sk, old_peer);
1389 unix_peer(sk) = other;
1390 unix_state_double_unlock(sk, other);
1396 unix_state_double_unlock(sk, other);
1402 static long unix_wait_for_peer(struct sock *other, long timeo)
1403 __releases(&unix_sk(other)->lock)
1405 struct unix_sock *u = unix_sk(other);
1409 prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
1411 sched = !sock_flag(other, SOCK_DEAD) &&
1412 !(other->sk_shutdown & RCV_SHUTDOWN) &&
1413 unix_recvq_full(other);
1415 unix_state_unlock(other);
1418 timeo = schedule_timeout(timeo);
1420 finish_wait(&u->peer_wait, &wait);
1424 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
1425 int addr_len, int flags)
1427 struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
1428 struct sock *sk = sock->sk, *newsk = NULL, *other = NULL;
1429 struct unix_sock *u = unix_sk(sk), *newu, *otheru;
1430 struct net *net = sock_net(sk);
1431 struct sk_buff *skb = NULL;
1436 err = unix_validate_addr(sunaddr, addr_len);
1440 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1441 err = unix_autobind(sk);
1446 timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
1448 /* First of all allocate resources.
1449 If we do it after the state is locked,
1450 we will have to recheck everything again in any case.
1453 /* create new sock for complete connection */
1454 newsk = unix_create1(net, NULL, 0, sock->type);
1455 if (IS_ERR(newsk)) {
1456 err = PTR_ERR(newsk);
1463 /* Allocate skb for sending to listening sock */
1464 skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
1469 /* Find listening sock. */
1470 other = unix_find_other(net, sunaddr, addr_len, sk->sk_type);
1471 if (IS_ERR(other)) {
1472 err = PTR_ERR(other);
1477 /* Latch state of peer */
1478 unix_state_lock(other);
1480 /* Apparently VFS overslept socket death. Retry. */
1481 if (sock_flag(other, SOCK_DEAD)) {
1482 unix_state_unlock(other);
1487 err = -ECONNREFUSED;
1488 if (other->sk_state != TCP_LISTEN)
1490 if (other->sk_shutdown & RCV_SHUTDOWN)
1493 if (unix_recvq_full(other)) {
1498 timeo = unix_wait_for_peer(other, timeo);
1500 err = sock_intr_errno(timeo);
1501 if (signal_pending(current))
1509 It is a tricky place. We need to grab our state lock and cannot
1510 drop the lock on the peer. It is dangerous because deadlock is
1511 possible. The connect-to-self case and a simultaneous
1512 attempt to connect are eliminated by checking the socket
1513 state. other is TCP_LISTEN; if sk were TCP_LISTEN, we would have
1514 checked this before attempting to grab the lock.
1516 Well, and we have to recheck the state after the socket is locked.
1522 /* This is ok... continue with connect */
1524 case TCP_ESTABLISHED:
1525 /* Socket is already connected */
1533 unix_state_lock_nested(sk);
1535 if (sk->sk_state != st) {
1536 unix_state_unlock(sk);
1537 unix_state_unlock(other);
1542 err = security_unix_stream_connect(sk, other, newsk);
1544 unix_state_unlock(sk);
1548 /* The way is open! Quickly set all the necessary fields... */
1551 unix_peer(newsk) = sk;
1552 newsk->sk_state = TCP_ESTABLISHED;
1553 newsk->sk_type = sk->sk_type;
1554 init_peercred(newsk);
1555 newu = unix_sk(newsk);
1556 RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
1557 otheru = unix_sk(other);
1559 /* copy address information from listening to new sock
1561 * The contents of *(otheru->addr) and otheru->path
1562 * are seen fully set up here, since we have found
1563 * otheru in hash under its lock. Insertion into the
1564 * hash chain we'd found it in had been done in an
1565 * earlier critical area protected by the chain's lock,
1566 * the same one where we'd set *(otheru->addr) contents,
1567 * as well as otheru->path and otheru->addr itself.
1569 * Using smp_store_release() here to set newu->addr
1570 * is enough to make those stores, as well as stores
1571 * to newu->path visible to anyone who gets newu->addr
1572 * by smp_load_acquire(). IOW, the same guarantees
1573 * as for unix_sock instances bound in unix_bind() or
1574 * in unix_autobind().
1576 if (otheru->path.dentry) {
1577 path_get(&otheru->path);
1578 newu->path = otheru->path;
1580 refcount_inc(&otheru->addr->refcnt);
1581 smp_store_release(&newu->addr, otheru->addr);
1583 /* Set credentials */
1584 copy_peercred(sk, other);
1586 sock->state = SS_CONNECTED;
1587 sk->sk_state = TCP_ESTABLISHED;
1590 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1591 unix_peer(sk) = newsk;
1593 unix_state_unlock(sk);
1595 /* take ten and send info to listening sock */
1596 spin_lock(&other->sk_receive_queue.lock);
1597 __skb_queue_tail(&other->sk_receive_queue, skb);
1598 spin_unlock(&other->sk_receive_queue.lock);
1599 unix_state_unlock(other);
1600 other->sk_data_ready(other);
1606 unix_state_unlock(other);
1611 unix_release_sock(newsk, 0);
1617 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1619 struct sock *ska = socka->sk, *skb = sockb->sk;
1621 /* Join our sockets back to back */
1624 unix_peer(ska) = skb;
1625 unix_peer(skb) = ska;
1629 ska->sk_state = TCP_ESTABLISHED;
1630 skb->sk_state = TCP_ESTABLISHED;
1631 socka->state = SS_CONNECTED;
1632 sockb->state = SS_CONNECTED;
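/* Illustrative userspace sketch: the back-to-back join above is what
 * socketpair(2) exposes; either end may be used first.
 *
 *	int sv[2];
 *	char buf[4];
 *
 *	socketpair(AF_UNIX, SOCK_STREAM, 0, sv);
 *	write(sv[0], "ping", 4);
 *	read(sv[1], buf, 4);	// receives "ping"
 */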
1636 static void unix_sock_inherit_flags(const struct socket *old,
1639 if (test_bit(SOCK_PASSCRED, &old->flags))
1640 set_bit(SOCK_PASSCRED, &new->flags);
1641 if (test_bit(SOCK_PASSSEC, &old->flags))
1642 set_bit(SOCK_PASSSEC, &new->flags);
1645 static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
1648 struct sock *sk = sock->sk;
1650 struct sk_buff *skb;
1654 if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET)
1658 if (sk->sk_state != TCP_LISTEN)
1661 /* If socket state is TCP_LISTEN it cannot change (for now...),
1662 * so no locks are necessary.
1665 skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0,
1668 /* This means receive shutdown. */
1675 skb_free_datagram(sk, skb);
1676 wake_up_interruptible(&unix_sk(sk)->peer_wait);
1678 /* attach accepted sock to socket */
1679 unix_state_lock(tsk);
1680 newsock->state = SS_CONNECTED;
1681 unix_sock_inherit_flags(sock, newsock);
1682 sock_graft(tsk, newsock);
1683 unix_state_unlock(tsk);
1691 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int peer)
1693 struct sock *sk = sock->sk;
1694 struct unix_address *addr;
1695 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, uaddr);
1699 sk = unix_peer_get(sk);
1709 addr = smp_load_acquire(&unix_sk(sk)->addr);
1711 sunaddr->sun_family = AF_UNIX;
1712 sunaddr->sun_path[0] = 0;
1713 err = offsetof(struct sockaddr_un, sun_path);
1716 memcpy(sunaddr, addr->name, addr->len);
1723 static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
1725 scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1728 * Garbage collection of unix sockets starts by selecting a set of
1729 * candidate sockets which have reference only from being in flight
1730 * (total_refs == inflight_refs). This condition is checked once during
1731 * the candidate collection phase, and candidates are marked as such, so
1732 * that non-candidates can later be ignored. While inflight_refs is
1733 * protected by unix_gc_lock, total_refs (file count) is not, hence this
1734 * is an instantaneous decision.
1736 * Once a candidate, however, the socket must not be reinstalled into a
1737 * file descriptor while the garbage collection is in progress.
1739 * If the above conditions are met, then the directed graph of
1740 * candidates (*) does not change while unix_gc_lock is held.
1742 * Any operation that changes the file count through file descriptors
1743 * (dup, close, sendmsg) does not change the graph since candidates are
1744 * not installed in fds.
1746 * Dequeuing a candidate via recvmsg would install it into an fd, but
1747 * that takes unix_gc_lock to decrement the inflight count, so it's
1748 * serialized with garbage collection.
1750 * MSG_PEEK is special in that it does not change the inflight count,
1751 * yet does install the socket into an fd. The following lock/unlock
1752 * pair is to ensure serialization with garbage collection. It must be
1753 * done between incrementing the file count and installing the file into
1756 * If garbage collection starts after the barrier provided by the
1757 * lock/unlock, then it will see the elevated refcount and not mark this
1758 * as a candidate. If a garbage collection is already in progress
1759 * before the file count was incremented, then the lock/unlock pair will
1760 * ensure that garbage collection is finished before progressing to
1761 * installing the fd.
1763 * (*) A -> B where B is on the queue of A or B is on the queue of C
1764 * which is on the queue of listening socket A.
1766 spin_lock(&unix_gc_lock);
1767 spin_unlock(&unix_gc_lock);
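/* Illustrative userspace sketch (hedged, names made up): the in-flight
 * fds discussed above enter the system via SCM_RIGHTS, roughly:
 *
 *	struct iovec iov = { .iov_base = "x", .iov_len = 1 };
 *	char cbuf[CMSG_SPACE(sizeof(int))];
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf,
 *			      .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cm = CMSG_FIRSTHDR(&msg);
 *
 *	cm->cmsg_level = SOL_SOCKET;
 *	cm->cmsg_type = SCM_RIGHTS;
 *	cm->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cm), &fd_to_pass, sizeof(int));
 *	sendmsg(sock, &msg, 0);	// fd_to_pass is now "in flight"
 */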
1770 static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
1774 UNIXCB(skb).pid = get_pid(scm->pid);
1775 UNIXCB(skb).uid = scm->creds.uid;
1776 UNIXCB(skb).gid = scm->creds.gid;
1777 UNIXCB(skb).fp = NULL;
1778 unix_get_secdata(scm, skb);
1779 if (scm->fp && send_fds)
1780 err = unix_attach_fds(scm, skb);
1782 skb->destructor = unix_destruct_scm;
1786 static bool unix_passcred_enabled(const struct socket *sock,
1787 const struct sock *other)
1789 return test_bit(SOCK_PASSCRED, &sock->flags) ||
1790 !other->sk_socket ||
1791 test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
1795 * Some apps rely on write() giving SCM_CREDENTIALS.
1796 * We include credentials if source or destination socket
1797 * asserted SOCK_PASSCRED.
1799 static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
1800 const struct sock *other)
1802 if (UNIXCB(skb).pid)
1804 if (unix_passcred_enabled(sock, other)) {
1805 UNIXCB(skb).pid = get_pid(task_tgid(current));
1806 current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
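/* Illustrative userspace sketch (hedged): a receiver opts in to the
 * credentials attached above with SO_PASSCRED and reads them back as an
 * SCM_CREDENTIALS control message.
 *
 *	int one = 1;
 *	struct ucred *uc;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cm = CMSG_FIRSTHDR(&msg); cm; cm = CMSG_NXTHDR(&msg, cm))
 *		if (cm->cmsg_level == SOL_SOCKET &&
 *		    cm->cmsg_type == SCM_CREDENTIALS)
 *			uc = (struct ucred *)CMSG_DATA(cm);	// pid/uid/gid
 */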
1810 static int maybe_init_creds(struct scm_cookie *scm,
1811 struct socket *socket,
1812 const struct sock *other)
1815 struct msghdr msg = { .msg_controllen = 0 };
1817 err = scm_send(socket, &msg, scm, false);
1821 if (unix_passcred_enabled(socket, other)) {
1822 scm->pid = get_pid(task_tgid(current));
1823 current_uid_gid(&scm->creds.uid, &scm->creds.gid);
1828 static bool unix_skb_scm_eq(struct sk_buff *skb,
1829 struct scm_cookie *scm)
1831 return UNIXCB(skb).pid == scm->pid &&
1832 uid_eq(UNIXCB(skb).uid, scm->creds.uid) &&
1833 gid_eq(UNIXCB(skb).gid, scm->creds.gid) &&
1834 unix_secdata_eq(scm, skb);
1837 static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
1839 struct scm_fp_list *fp = UNIXCB(skb).fp;
1840 struct unix_sock *u = unix_sk(sk);
1842 if (unlikely(fp && fp->count))
1843 atomic_add(fp->count, &u->scm_stat.nr_fds);
1846 static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
1848 struct scm_fp_list *fp = UNIXCB(skb).fp;
1849 struct unix_sock *u = unix_sk(sk);
1851 if (unlikely(fp && fp->count))
1852 atomic_sub(fp->count, &u->scm_stat.nr_fds);
1856 * Send AF_UNIX data.
1859 static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
1862 DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name);
1863 struct sock *sk = sock->sk, *other = NULL;
1864 struct unix_sock *u = unix_sk(sk);
1865 struct scm_cookie scm;
1866 struct sk_buff *skb;
1873 err = scm_send(sock, msg, &scm, false);
1878 if (msg->msg_flags&MSG_OOB)
1881 if (msg->msg_namelen) {
1882 err = unix_validate_addr(sunaddr, msg->msg_namelen);
1888 other = unix_peer_get(sk);
1893 if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
1894 err = unix_autobind(sk);
1900 if (len > sk->sk_sndbuf - 32)
1903 if (len > SKB_MAX_ALLOC) {
1904 data_len = min_t(size_t,
1905 len - SKB_MAX_ALLOC,
1906 MAX_SKB_FRAGS * PAGE_SIZE);
1907 data_len = PAGE_ALIGN(data_len);
1909 BUILD_BUG_ON(SKB_MAX_ALLOC < PAGE_SIZE);
1912 skb = sock_alloc_send_pskb(sk, len - data_len, data_len,
1913 msg->msg_flags & MSG_DONTWAIT, &err,
1914 PAGE_ALLOC_COSTLY_ORDER);
1918 err = unix_scm_to_skb(&scm, skb, true);
1922 skb_put(skb, len - data_len);
1923 skb->data_len = data_len;
1925 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, len);
1929 timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1934 if (sunaddr == NULL)
1937 other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen,
1939 if (IS_ERR(other)) {
1940 err = PTR_ERR(other);
1946 if (sk_filter(other, skb) < 0) {
1947 /* Toss the packet but do not return any error to the sender */
1953 unix_state_lock(other);
1956 if (!unix_may_send(sk, other))
1959 if (unlikely(sock_flag(other, SOCK_DEAD))) {
1961 * Check with 1003.1g - what should
1962 * datagram error
1964 unix_state_unlock(other);
1968 unix_state_lock(sk);
1971 if (unix_peer(sk) == other) {
1972 unix_peer(sk) = NULL;
1973 unix_dgram_peer_wake_disconnect_wakeup(sk, other);
1975 unix_state_unlock(sk);
1977 sk->sk_state = TCP_CLOSE;
1978 unix_dgram_disconnected(sk, other);
1980 err = -ECONNREFUSED;
1982 unix_state_unlock(sk);
1992 if (other->sk_shutdown & RCV_SHUTDOWN)
1995 if (sk->sk_type != SOCK_SEQPACKET) {
1996 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
2001 /* other == sk && unix_peer(other) != sk if
2002 * - unix_peer(sk) == NULL, destination address bound to sk
2003 * - unix_peer(sk) == sk by time of get but disconnected before lock
2006 unlikely(unix_peer(other) != sk &&
2007 unix_recvq_full_lockless(other))) {
2009 timeo = unix_wait_for_peer(other, timeo);
2011 err = sock_intr_errno(timeo);
2012 if (signal_pending(current))
2019 unix_state_unlock(other);
2020 unix_state_double_lock(sk, other);
2023 if (unix_peer(sk) != other ||
2024 unix_dgram_peer_wake_me(sk, other)) {
2032 goto restart_locked;
2036 if (unlikely(sk_locked))
2037 unix_state_unlock(sk);
2039 if (sock_flag(other, SOCK_RCVTSTAMP))
2040 __net_timestamp(skb);
2041 maybe_add_creds(skb, sock, other);
2042 scm_stat_add(other, skb);
2043 skb_queue_tail(&other->sk_receive_queue, skb);
2044 unix_state_unlock(other);
2045 other->sk_data_ready(other);
2052 unix_state_unlock(sk);
2053 unix_state_unlock(other);
2063 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2064 * bytes, with a minimum of a full page.
2066 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
2068 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2069 static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other)
2071 struct unix_sock *ousk = unix_sk(other);
2072 struct sk_buff *skb;
2075 skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err);
2081 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1);
2088 unix_state_lock(other);
2090 if (sock_flag(other, SOCK_DEAD) ||
2091 (other->sk_shutdown & RCV_SHUTDOWN)) {
2092 unix_state_unlock(other);
2097 maybe_add_creds(skb, sock, other);
2101 consume_skb(ousk->oob_skb);
2103 WRITE_ONCE(ousk->oob_skb, skb);
2105 scm_stat_add(other, skb);
2106 skb_queue_tail(&other->sk_receive_queue, skb);
2107 sk_send_sigurg(other);
2108 unix_state_unlock(other);
2109 other->sk_data_ready(other);
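/* Illustrative userspace sketch: the single out-of-band byte queued by
 * queue_oob() above is produced and consumed like this (c and s are the
 * two ends of a connected SOCK_STREAM pair).
 *
 *	send(c, "ab", 2, 0);
 *	send(c, "c", 1, MSG_OOB);	// "c" becomes the OOB byte
 *
 *	recv(s, buf, 2, 0);		// reads "ab", stops at the OOB mark
 *	recv(s, buf, 1, MSG_OOB);	// reads "c"
 */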
2115 static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg,
2118 struct sock *sk = sock->sk;
2119 struct sock *other = NULL;
2121 struct sk_buff *skb;
2123 struct scm_cookie scm;
2124 bool fds_sent = false;
2128 err = scm_send(sock, msg, &scm, false);
2133 if (msg->msg_flags & MSG_OOB) {
2134 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2142 if (msg->msg_namelen) {
2143 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
2147 other = unix_peer(sk);
2152 if (sk->sk_shutdown & SEND_SHUTDOWN)
2155 while (sent < len) {
2158 /* Keep two messages in the pipe so it schedules better */
2159 size = min_t(int, size, (sk->sk_sndbuf >> 1) - 64);
2161 /* allow fallback to order-0 allocations */
2162 size = min_t(int, size, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ);
2164 data_len = max_t(int, 0, size - SKB_MAX_HEAD(0));
2166 data_len = min_t(size_t, size, PAGE_ALIGN(data_len));
2168 skb = sock_alloc_send_pskb(sk, size - data_len, data_len,
2169 msg->msg_flags & MSG_DONTWAIT, &err,
2170 get_order(UNIX_SKB_FRAGS_SZ));
2174 /* Only send the fds in the first buffer */
2175 err = unix_scm_to_skb(&scm, skb, !fds_sent);
2182 skb_put(skb, size - data_len);
2183 skb->data_len = data_len;
2185 err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
2191 unix_state_lock(other);
2193 if (sock_flag(other, SOCK_DEAD) ||
2194 (other->sk_shutdown & RCV_SHUTDOWN))
2197 maybe_add_creds(skb, sock, other);
2198 scm_stat_add(other, skb);
2199 skb_queue_tail(&other->sk_receive_queue, skb);
2200 unix_state_unlock(other);
2201 other->sk_data_ready(other);
2205 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2206 if (msg->msg_flags & MSG_OOB) {
2207 err = queue_oob(sock, msg, other);
2219 unix_state_unlock(other);
2222 if (sent == 0 && !(msg->msg_flags&MSG_NOSIGNAL))
2223 send_sig(SIGPIPE, current, 0);
2227 return sent ? : err;
2230 static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
2231 int offset, size_t size, int flags)
2234 bool send_sigpipe = false;
2235 bool init_scm = true;
2236 struct scm_cookie scm;
2237 struct sock *other, *sk = socket->sk;
2238 struct sk_buff *skb, *newskb = NULL, *tail = NULL;
2240 if (flags & MSG_OOB)
2243 other = unix_peer(sk);
2244 if (!other || sk->sk_state != TCP_ESTABLISHED)
2249 unix_state_unlock(other);
2250 mutex_unlock(&unix_sk(other)->iolock);
2251 newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
2257 /* we must acquire iolock as we modify already present
2258 * skbs in the sk_receive_queue and mess with skb->len
2260 err = mutex_lock_interruptible(&unix_sk(other)->iolock);
2262 err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
2266 if (sk->sk_shutdown & SEND_SHUTDOWN) {
2268 send_sigpipe = true;
2272 unix_state_lock(other);
2274 if (sock_flag(other, SOCK_DEAD) ||
2275 other->sk_shutdown & RCV_SHUTDOWN) {
2277 send_sigpipe = true;
2278 goto err_state_unlock;
2282 err = maybe_init_creds(&scm, socket, other);
2284 goto err_state_unlock;
2288 skb = skb_peek_tail(&other->sk_receive_queue);
2289 if (tail && tail == skb) {
2291 } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
2298 } else if (newskb) {
2299 /* this is the fast path; we don't necessarily need to
2300 * call kfree_skb here - even with newskb == NULL
2301 * this does no harm
2303 consume_skb(newskb);
2307 if (skb_append_pagefrags(skb, page, offset, size)) {
2313 skb->data_len += size;
2314 skb->truesize += size;
2315 refcount_add(size, &sk->sk_wmem_alloc);
2318 err = unix_scm_to_skb(&scm, skb, false);
2320 goto err_state_unlock;
2321 spin_lock(&other->sk_receive_queue.lock);
2322 __skb_queue_tail(&other->sk_receive_queue, newskb);
2323 spin_unlock(&other->sk_receive_queue.lock);
2326 unix_state_unlock(other);
2327 mutex_unlock(&unix_sk(other)->iolock);
2329 other->sk_data_ready(other);
2334 unix_state_unlock(other);
2336 mutex_unlock(&unix_sk(other)->iolock);
2339 if (send_sigpipe && !(flags & MSG_NOSIGNAL))
2340 send_sig(SIGPIPE, current, 0);
2346 static int unix_seqpacket_sendmsg(struct socket *sock, struct msghdr *msg,
2350 struct sock *sk = sock->sk;
2352 err = sock_error(sk);
2356 if (sk->sk_state != TCP_ESTABLISHED)
2359 if (msg->msg_namelen)
2360 msg->msg_namelen = 0;
2362 return unix_dgram_sendmsg(sock, msg, len);
2365 static int unix_seqpacket_recvmsg(struct socket *sock, struct msghdr *msg,
2366 size_t size, int flags)
2368 struct sock *sk = sock->sk;
2370 if (sk->sk_state != TCP_ESTABLISHED)
2373 return unix_dgram_recvmsg(sock, msg, size, flags);
2376 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
2378 struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
2381 msg->msg_namelen = addr->len;
2382 memcpy(msg->msg_name, addr->name, addr->len);
2386 int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
2389 struct scm_cookie scm;
2390 struct socket *sock = sk->sk_socket;
2391 struct unix_sock *u = unix_sk(sk);
2392 struct sk_buff *skb, *last;
2401 timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
2404 mutex_lock(&u->iolock);
2406 skip = sk_peek_offset(sk, flags);
2407 skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
2408 &skip, &err, &last);
2410 if (!(flags & MSG_PEEK))
2411 scm_stat_del(sk, skb);
2415 mutex_unlock(&u->iolock);
2420 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
2421 &err, &timeo, last));
2423 if (!skb) { /* implies iolock unlocked */
2424 unix_state_lock(sk);
2425 /* Signal EOF on disconnected non-blocking SEQPACKET socket. */
2426 if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
2427 (sk->sk_shutdown & RCV_SHUTDOWN))
2429 unix_state_unlock(sk);
2433 if (wq_has_sleeper(&u->peer_wait))
2434 wake_up_interruptible_sync_poll(&u->peer_wait,
2435 EPOLLOUT | EPOLLWRNORM |
2439 unix_copy_addr(msg, skb->sk);
2441 if (size > skb->len - skip)
2442 size = skb->len - skip;
2443 else if (size < skb->len - skip)
2444 msg->msg_flags |= MSG_TRUNC;
2446 err = skb_copy_datagram_msg(skb, skip, msg, size);
2450 if (sock_flag(sk, SOCK_RCVTSTAMP))
2451 __sock_recv_timestamp(msg, sk, skb);
2453 memset(&scm, 0, sizeof(scm));
2455 scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
2456 unix_set_secdata(&scm, skb);
2458 if (!(flags & MSG_PEEK)) {
2460 unix_detach_fds(&scm, skb);
2462 sk_peek_offset_bwd(sk, skb->len);
2464 /* It is questionable: on PEEK we could:
2465 - do not return fds - good, but too simple 8)
2466 - return fds, and do not return them on read (old strategy,
2467 apparently wrong)
2468 - clone fds (I chose it for now, it is the most universal
2469 solution)
2471 POSIX 1003.1g does not actually define this clearly
2472 at all. POSIX 1003.1g doesn't define a lot of things
2473 at all. :-)
2477 sk_peek_offset_fwd(sk, size);
2480 unix_peek_fds(&scm, skb);
2482 err = (flags & MSG_TRUNC) ? skb->len - skip : size;
2484 scm_recv(sock, msg, &scm, flags);
2487 skb_free_datagram(sk, skb);
2488 mutex_unlock(&u->iolock);
2493 static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
2496 struct sock *sk = sock->sk;
2498 #ifdef CONFIG_BPF_SYSCALL
2499 const struct proto *prot = READ_ONCE(sk->sk_prot);
2501 if (prot != &unix_dgram_proto)
2502 return prot->recvmsg(sk, msg, size, flags, NULL);
2504 return __unix_dgram_recvmsg(sk, msg, size, flags);
2507 static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
2512 struct unix_sock *u = unix_sk(sk);
2513 struct sk_buff *skb;
2516 mutex_lock(&u->iolock);
2517 skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
2518 mutex_unlock(&u->iolock);
2522 used = recv_actor(sk, skb);
2528 } else if (used <= skb->len) {
2540 * Sleep until more data has arrived. But check for races..
2542 static long unix_stream_data_wait(struct sock *sk, long timeo,
2543 struct sk_buff *last, unsigned int last_len,
2546 struct sk_buff *tail;
2549 unix_state_lock(sk);
2552 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2554 tail = skb_peek_tail(&sk->sk_receive_queue);
2556 (tail && tail->len != last_len) ||
2558 (sk->sk_shutdown & RCV_SHUTDOWN) ||
2559 signal_pending(current) ||
2563 sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2564 unix_state_unlock(sk);
2566 timeo = freezable_schedule_timeout(timeo);
2568 timeo = schedule_timeout(timeo);
2569 unix_state_lock(sk);
2571 if (sock_flag(sk, SOCK_DEAD))
2574 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2577 finish_wait(sk_sleep(sk), &wait);
2578 unix_state_unlock(sk);
2582 static unsigned int unix_skb_len(const struct sk_buff *skb)
2584 return skb->len - UNIXCB(skb).consumed;
2587 struct unix_stream_read_state {
2588 int (*recv_actor)(struct sk_buff *, int, int,
2589 struct unix_stream_read_state *);
2590 struct socket *socket;
2592 struct pipe_inode_info *pipe;
2595 unsigned int splice_flags;
2598 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2599 static int unix_stream_recv_urg(struct unix_stream_read_state *state)
2601 struct socket *sock = state->socket;
2602 struct sock *sk = sock->sk;
2603 struct unix_sock *u = unix_sk(sk);
2605 struct sk_buff *oob_skb;
2607 mutex_lock(&u->iolock);
2608 unix_state_lock(sk);
2610 if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
2611 unix_state_unlock(sk);
2612 mutex_unlock(&u->iolock);
2616 oob_skb = u->oob_skb;
2618 if (!(state->flags & MSG_PEEK))
2619 WRITE_ONCE(u->oob_skb, NULL);
2621 unix_state_unlock(sk);
2623 chunk = state->recv_actor(oob_skb, 0, chunk, state);
2625 if (!(state->flags & MSG_PEEK)) {
2626 UNIXCB(oob_skb).consumed += 1;
2630 mutex_unlock(&u->iolock);
2635 state->msg->msg_flags |= MSG_OOB;
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct unix_sock *u = unix_sk(sk);

	if (!unix_skb_len(skb) && !(flags & MSG_PEEK)) {
		skb_unlink(skb, &sk->sk_receive_queue);
		consume_skb(skb);
		skb = NULL;
	} else if (skb == u->oob_skb) {
		if (copied) {
			skb = NULL;
		} else if (sock_flag(sk, SOCK_URGINLINE)) {
			if (!(flags & MSG_PEEK)) {
				WRITE_ONCE(u->oob_skb, NULL);
				consume_skb(skb);
			}
		} else if (!(flags & MSG_PEEK)) {
			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);
			skb = skb_peek(&sk->sk_receive_queue);
		}
	}
	return skb;
}
#endif
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	if (unlikely(sk->sk_state != TCP_ESTABLISHED))
		return -ENOTCONN;
	return unix_read_skb(sk, recv_actor);
}
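/* Core of recvmsg()/splice() for stream and seqpacket sockets.  The
 * u->iolock mutex serializes readers so queued data stays ordered,
 * while unix_state_lock() guards the queue itself; the actual copy is
 * delegated to state->recv_actor so the same loop serves all callers.
 */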
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		int chunk;
		bool drop_skb;
		struct sk_buff *skb, *last;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb) {
				unix_state_unlock(sk);
				if (copied)
					break;
				goto redo;
			}
		}
#endif
again:
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */
			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);
			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		skb_get(skb);
		chunk = state->recv_actor(skb, skip, chunk, state);
		drop_skb = !unix_skb_len(skb);
		/* skb is only safe to use if !drop_skb */
		consume_skb(skb);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		if (drop_skb) {
			/* the skb was touched by a concurrent reader;
			 * we should not expect anything from this skb
			 * anymore and assume it invalid - we can be
			 * sure it was dropped from the socket queue
			 *
			 * let's report a short read
			 */
			err = 0;
			break;
		}

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (scm.fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
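/* recv_actor for recvmsg(): copy the next chunk to the user's msghdr,
 * starting past whatever earlier reads already consumed from this skb.
 */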
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};
	return unix_stream_read_generic(&state, true);
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock, loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
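/* shutdown(2): record the closed direction(s) locally, then mirror
 * them onto the connected peer (our RCV_SHUTDOWN stops the peer
 * sending and vice versa) and wake it so blocked callers notice.
 */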
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	sk->sk_shutdown |= mode;
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
		(sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode&RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode&SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		other->sk_shutdown |= peer_mode;
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
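/* Bytes available for reading, as reported by SIOCINQ: the sum of the
 * unread parts of all queued skbs for stream/seqpacket sockets, or the
 * size of the next datagram otherwise.
 */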
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (sk->sk_state == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);

long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;

	fd = get_unused_fd_flags(O_CLOEXEC);

	f = dentry_open(&path, O_PATH, current_cred());
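/* ioctl(2) handler: SIOCOUTQ/SIOCINQ report queued byte counts,
 * SIOCUNIXFILE opens the bound filesystem object (unix_open_file()
 * above) and SIOCATMARK tells whether the next byte is the OOB mark.
 *
 * A rough userspace sketch (hypothetical descriptor, error handling
 * elided):
 *
 *	int n;
 *	if (ioctl(fd, SIOCINQ, &n) == 0)
 *		printf("%d bytes queued\n", n);
 */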
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK: {
		struct sk_buff *skb;
		int answ = 0;

		skb = skb_peek(&sk->sk_receive_queue);
		if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb))
			answ = 1;
		err = put_user(answ, (int __user *)arg);
		break;
	}
#endif
	}
	return err;
}
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
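/* poll/epoll for connection-oriented sockets: readability follows the
 * receive queue and shutdown state, EPOLLPRI signals a pending OOB
 * byte, and writability is reported even after the peer shut down so
 * that callers cannot get stuck waiting on a dead connection.
 */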
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err)
		mask |= EPOLLERR;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    sk->sk_state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	__poll_t mask;

	sock_poll_wait(file, sock, wait);
	mask = 0;

	/* exceptional events? */
	if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (sk->sk_shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (sk->sk_shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET) {
		if (sk->sk_state == TCP_CLOSE)
			mask |= EPOLLHUP;
		/* connection hasn't started yet? */
		if (sk->sk_state == TCP_SYN_SENT)
			return mask;
	}

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
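/* /proc/net/unix iteration: the seq_file position *pos packs a hash
 * bucket index and an offset inside that bucket (get_bucket()/
 * get_offset() below), so a walk can drop the bucket lock between
 * reads and later resume exactly where it stopped.
 */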
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}
	return sk;
}
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);
		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;
		spin_unlock(&net->unx.table.locks[bucket]);
		*pos = set_bucket_offset(++bucket, 1);
	}
	return NULL;
}
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	for (sk = sk_next(sk); sk; sk = sk_next(sk))
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
	*pos = set_bucket_offset(++bucket, 1);
	return unix_get_first(seq, pos);
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;
	return unix_get_first(seq, pos);
}

static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);
	return unix_get_next(seq, v, pos);
}

static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);

		unix_state_lock(s);
		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	/* under a hash table lock here */
			int i, len;

			seq_putc(seq, ' ');
			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}
	return 0;
}
static const struct seq_operations unix_seq_ops = {
	.start = unix_seq_start,
	.next = unix_seq_next,
	.stop = unix_seq_stop,
	.show = unix_seq_show,
};
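/* The BPF iterator below walks the same hash table, but batches every
 * socket of a bucket into iter->batch with references held, so the BPF
 * program runs with the bucket lock already released.
 */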
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL)
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};

struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}
		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
	return expected;
}
static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;
	return 0;
}
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}
	return sk;
}
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;
	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);
	return sk;
}
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);
	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}
	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start = bpf_iter_unix_seq_start,
	.next = bpf_iter_unix_seq_next,
	.stop = bpf_iter_unix_seq_stop,
	.show = bpf_iter_unix_seq_show,
};
#endif
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner = THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err)
		bpf_iter_fini_seq_net(priv_data);
	return err;
}

static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops = &bpf_iter_unix_seq_ops,
	.init_seq_private = bpf_iter_init_unix,
	.fini_seq_private = bpf_iter_fini_unix,
	.seq_priv_size = sizeof(struct bpf_unix_iter_state),
};
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
static struct bpf_iter_reg unix_reg_info = {
	.target = "unix",
	.ctx_arg_info_size = 1,
	.ctx_arg_info = {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto = bpf_iter_unix_get_func_proto,
	.seq_info = &unix_seq_info,
};
static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif
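/* Module init: both protos are registered separately so that BPF
 * sockmap can override SOCK_DGRAM and SOCK_STREAM independently; then
 * the PF_UNIX family, per-netns state and (if built in) the BPF
 * iterator are wired up.
 */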
static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
static void __exit af_unix_exit(void)
{
	sock_unregister(PF_UNIX);
	proto_unregister(&unix_dgram_proto);
	proto_unregister(&unix_stream_proto);
	unregister_pernet_subsys(&unix_net_ops);
}
/* Earlier than device_initcall() so that other drivers invoking
   request_module() don't end up in a loop when modprobe tries
   to use a UNIX socket. But later than subsys_initcall() because
   we depend on stuff initialised there */
fs_initcall(af_unix_init);
module_exit(af_unix_exit);

MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_UNIX);