net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117
 118 #include <linux/uaccess.h>
 119
 120 #include <linux/netdevice.h>
 121 #include <net/protocol.h>
 122 #include <linux/skbuff.h>
 123 #include <net/net_namespace.h>
 124 #include <net/request_sock.h>
 125 #include <net/sock.h>
 126 #include <linux/net_tstamp.h>
 127 #include <net/xfrm.h>
 128 #include <linux/ipsec.h>
 129 #include <net/cls_cgroup.h>
 130 #include <net/netprio_cgroup.h>
 131 #include <linux/sock_diag.h>
 132
 133 #include <linux/filter.h>
 134 #include <net/sock_reuseport.h>
 135 #include <net/bpf_sk_storage.h>
 136
 137 #include <trace/events/sock.h>
 138
 139 #include <net/tcp.h>
 140 #include <net/busy_poll.h>
 141
 142 #include <linux/ethtool.h>
 143
 144 static DEFINE_MUTEX(proto_list_mutex);
 145 static LIST_HEAD(proto_list);
 146
 147 /**
 148  * sk_ns_capable - General socket capability test
 149  * @sk: Socket to use a capability on or through
 150  * @user_ns: The user namespace of the capability to use
 151  * @cap: The capability to use
 152  *
 153  * Test to see if the opener of the socket had when the socket was
 154  * created and the current process has the capability @cap in the user
 155  * namespace @user_ns.
 156  */
 157 bool sk_ns_capable(const struct sock *sk,
 158                    struct user_namespace *user_ns, int cap)
 159 {
 160         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 161                 ns_capable(user_ns, cap);
 162 }
 163 EXPORT_SYMBOL(sk_ns_capable);
 164
 165 /**
 166  * sk_capable - Socket global capability test
 167  * @sk: Socket to use a capability on or through
 168  * @cap: The global capability to use
 169  *
 170  * Test to see if the opener of the socket had when the socket was
 171  * created and the current process has the capability @cap in all user
 172  * namespaces.
 173  */
 174 bool sk_capable(const struct sock *sk, int cap)
 175 {
 176         return sk_ns_capable(sk, &init_user_ns, cap);
 177 }
 178 EXPORT_SYMBOL(sk_capable);
 179
 180 /**
 181  * sk_net_capable - Network namespace socket capability test
 182  * @sk: Socket to use a capability on or through
 183  * @cap: The capability to use
 184  *
 185  * Test to see if the opener of the socket had when the socket was created
 186  * and the current process has the capability @cap over the network namespace
 187  * the socket is a member of.
 188  */
 189 bool sk_net_capable(const struct sock *sk, int cap)
 190 {
 191         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 192 }
 193 EXPORT_SYMBOL(sk_net_capable);
 194
 195 /*
 196  * Each address family might have different locking rules, so we have
 197  * one slock key per address family and separate keys for internal and
 198  * userspace sockets.
 199  */
 200 static struct lock_class_key af_family_keys[AF_MAX];
 201 static struct lock_class_key af_family_kern_keys[AF_MAX];
 202 static struct lock_class_key af_family_slock_keys[AF_MAX];
 203 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 204
 205 /*
 206  * Make lock validator output more readable. (we pre-construct these
 207  * strings build-time, so that runtime initialization of socket
 208  * locks is fast):
 209  */
 210
 211 #define _sock_locks(x)                                            \
 212   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 213   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 214   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 215   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 216   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 217   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 218   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 219   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 220   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 221   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 222   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 223   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 224   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 225   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 226   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 227   x "AF_MCTP"  , \
 228   x "AF_MAX"
 229
 230 static const char *const af_family_key_strings[AF_MAX+1] = {
 231         _sock_locks("sk_lock-")
 232 };
 233 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 234         _sock_locks("slock-")
 235 };
 236 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 237         _sock_locks("clock-")
 238 };
 239
 240 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 241         _sock_locks("k-sk_lock-")
 242 };
 243 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 244         _sock_locks("k-slock-")
 245 };
 246 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 247         _sock_locks("k-clock-")
 248 };
 249 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 250         _sock_locks("rlock-")
 251 };
 252 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 253         _sock_locks("wlock-")
 254 };
 255 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 256         _sock_locks("elock-")
 257 };
 258
 259 /*
 260  * sk_callback_lock and sk queues locking rules are per-address-family,
 261  * so split the lock classes by using a per-AF key:
 262  */
 263 static struct lock_class_key af_callback_keys[AF_MAX];
 264 static struct lock_class_key af_rlock_keys[AF_MAX];
 265 static struct lock_class_key af_wlock_keys[AF_MAX];
 266 static struct lock_class_key af_elock_keys[AF_MAX];
 267 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 268
 269 /* Run time adjustable parameters. */
 270 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 271 EXPORT_SYMBOL(sysctl_wmem_max);
 272 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 273 EXPORT_SYMBOL(sysctl_rmem_max);
 274 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 275 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 276
 277 /* Maximal space eaten by iovec or ancillary data plus some space */
 278 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 279 EXPORT_SYMBOL(sysctl_optmem_max);
 280
 281 int sysctl_tstamp_allow_data __read_mostly = 1;
 282
 283 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 284 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 285
 286 /**
 287  * sk_set_memalloc - sets %SOCK_MEMALLOC
 288  * @sk: socket to set it on
 289  *
 290  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 291  * It's the responsibility of the admin to adjust min_free_kbytes
 292  * to meet the requirements
 293  */
 294 void sk_set_memalloc(struct sock *sk)
 295 {
 296         sock_set_flag(sk, SOCK_MEMALLOC);
 297         sk->sk_allocation |= __GFP_MEMALLOC;
 298         static_branch_inc(&memalloc_socks_key);
 299 }
 300 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 301
 302 void sk_clear_memalloc(struct sock *sk)
 303 {
 304         sock_reset_flag(sk, SOCK_MEMALLOC);
 305         sk->sk_allocation &= ~__GFP_MEMALLOC;
 306         static_branch_dec(&memalloc_socks_key);
 307
 308         /*
 309          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 310          * progress of swapping. SOCK_MEMALLOC may be cleared while
 311          * it has rmem allocations due to the last swapfile being deactivated
 312          * but there is a risk that the socket is unusable due to exceeding
 313          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 314          */
 315         sk_mem_reclaim(sk);
 316 }
 317 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 318
 319 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 320 {
 321         int ret;
 322         unsigned int noreclaim_flag;
 323
 324         /* these should have been dropped before queueing */
 325         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 326
 327         noreclaim_flag = memalloc_noreclaim_save();
 328         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 329                                  tcp_v6_do_rcv,
 330                                  tcp_v4_do_rcv,
 331                                  sk, skb);
 332         memalloc_noreclaim_restore(noreclaim_flag);
 333
 334         return ret;
 335 }
 336 EXPORT_SYMBOL(__sk_backlog_rcv);
 337
 338 void sk_error_report(struct sock *sk)
 339 {
 340         sk->sk_error_report(sk);
 341
 342         switch (sk->sk_family) {
 343         case AF_INET:
 344                 fallthrough;
 345         case AF_INET6:
 346                 trace_inet_sk_error_report(sk);
 347                 break;
 348         default:
 349                 break;
 350         }
 351 }
 352 EXPORT_SYMBOL(sk_error_report);
 353
 354 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 355 {
 356         struct __kernel_sock_timeval tv;
 357
 358         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 359                 tv.tv_sec = 0;
 360                 tv.tv_usec = 0;
 361         } else {
 362                 tv.tv_sec = timeo / HZ;
 363                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 364         }
 365
 366         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 367                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 368                 *(struct old_timeval32 *)optval = tv32;
 369                 return sizeof(tv32);
 370         }
 371
 372         if (old_timeval) {
 373                 struct __kernel_old_timeval old_tv;
 374                 old_tv.tv_sec = tv.tv_sec;
 375                 old_tv.tv_usec = tv.tv_usec;
 376                 *(struct __kernel_old_timeval *)optval = old_tv;
 377                 return sizeof(old_tv);
 378         }
 379
 380         *(struct __kernel_sock_timeval *)optval = tv;
 381         return sizeof(tv);
 382 }
 383 EXPORT_SYMBOL(sock_get_timeout);
 384
 385 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 386                            sockptr_t optval, int optlen, bool old_timeval)
 387 {
 388         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 389                 struct old_timeval32 tv32;
 390
 391                 if (optlen < sizeof(tv32))
 392                         return -EINVAL;
 393
 394                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 395                         return -EFAULT;
 396                 tv->tv_sec = tv32.tv_sec;
 397                 tv->tv_usec = tv32.tv_usec;
 398         } else if (old_timeval) {
 399                 struct __kernel_old_timeval old_tv;
 400
 401                 if (optlen < sizeof(old_tv))
 402                         return -EINVAL;
 403                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 404                         return -EFAULT;
 405                 tv->tv_sec = old_tv.tv_sec;
 406                 tv->tv_usec = old_tv.tv_usec;
 407         } else {
 408                 if (optlen < sizeof(*tv))
 409                         return -EINVAL;
 410                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 411                         return -EFAULT;
 412         }
 413
 414         return 0;
 415 }
 416 EXPORT_SYMBOL(sock_copy_user_timeval);
 417
 418 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 419                             bool old_timeval)
 420 {
 421         struct __kernel_sock_timeval tv;
 422         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 423
 424         if (err)
 425                 return err;
 426
 427         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 428                 return -EDOM;
 429
 430         if (tv.tv_sec < 0) {
 431                 static int warned __read_mostly;
 432
 433                 *timeo_p = 0;
 434                 if (warned < 10 && net_ratelimit()) {
 435                         warned++;
 436                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 437                                 __func__, current->comm, task_pid_nr(current));
 438                 }
 439                 return 0;
 440         }
 441         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 442         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 443                 return 0;
 444         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1))
 445                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec, USEC_PER_SEC / HZ);
 446         return 0;
 447 }
 448
 449 static bool sock_needs_netstamp(const struct sock *sk)
 450 {
 451         switch (sk->sk_family) {
 452         case AF_UNSPEC:
 453         case AF_UNIX:
 454                 return false;
 455         default:
 456                 return true;
 457         }
 458 }
 459
 460 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 461 {
 462         if (sk->sk_flags & flags) {
 463                 sk->sk_flags &= ~flags;
 464                 if (sock_needs_netstamp(sk) &&
 465                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 466                         net_disable_timestamp();
 467         }
 468 }
 469
 470
 471 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 472 {
 473         unsigned long flags;
 474         struct sk_buff_head *list = &sk->sk_receive_queue;
 475
 476         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 477                 atomic_inc(&sk->sk_drops);
 478                 trace_sock_rcvqueue_full(sk, skb);
 479                 return -ENOMEM;
 480         }
 481
 482         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 483                 atomic_inc(&sk->sk_drops);
 484                 return -ENOBUFS;
 485         }
 486
 487         skb->dev = NULL;
 488         skb_set_owner_r(skb, sk);
 489
 490         /* we escape from rcu protected region, make sure we dont leak
 491          * a norefcounted dst
 492          */
 493         skb_dst_force(skb);
 494
 495         spin_lock_irqsave(&list->lock, flags);
 496         sock_skb_set_dropcount(sk, skb);
 497         __skb_queue_tail(list, skb);
 498         spin_unlock_irqrestore(&list->lock, flags);
 499
 500         if (!sock_flag(sk, SOCK_DEAD))
 501                 sk->sk_data_ready(sk);
 502         return 0;
 503 }
 504 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 505
 506 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 507 {
 508         int err;
 509
 510         err = sk_filter(sk, skb);
 511         if (err)
 512                 return err;
 513
 514         return __sock_queue_rcv_skb(sk, skb);
 515 }
 516 EXPORT_SYMBOL(sock_queue_rcv_skb);
 517
 518 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 519                      const int nested, unsigned int trim_cap, bool refcounted)
 520 {
 521         int rc = NET_RX_SUCCESS;
 522
 523         if (sk_filter_trim_cap(sk, skb, trim_cap))
 524                 goto discard_and_relse;
 525
 526         skb->dev = NULL;
 527
 528         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 529                 atomic_inc(&sk->sk_drops);
 530                 goto discard_and_relse;
 531         }
 532         if (nested)
 533                 bh_lock_sock_nested(sk);
 534         else
 535                 bh_lock_sock(sk);
 536         if (!sock_owned_by_user(sk)) {
 537                 /*
 538                  * trylock + unlock semantics:
 539                  */
 540                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 541
 542                 rc = sk_backlog_rcv(sk, skb);
 543
 544                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 545         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 546                 bh_unlock_sock(sk);
 547                 atomic_inc(&sk->sk_drops);
 548                 goto discard_and_relse;
 549         }
 550
 551         bh_unlock_sock(sk);
 552 out:
 553         if (refcounted)
 554                 sock_put(sk);
 555         return rc;
 556 discard_and_relse:
 557         kfree_skb(skb);
 558         goto out;
 559 }
 560 EXPORT_SYMBOL(__sk_receive_skb);
 561
 562 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 563                                                           u32));
 564 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 565                                                            u32));
 566 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 567 {
 568         struct dst_entry *dst = __sk_dst_get(sk);
 569
 570         if (dst && dst->obsolete &&
 571             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 572                                dst, cookie) == NULL) {
 573                 sk_tx_queue_clear(sk);
 574                 sk->sk_dst_pending_confirm = 0;
 575                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 576                 dst_release(dst);
 577                 return NULL;
 578         }
 579
 580         return dst;
 581 }
 582 EXPORT_SYMBOL(__sk_dst_check);
 583
 584 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 585 {
 586         struct dst_entry *dst = sk_dst_get(sk);
 587
 588         if (dst && dst->obsolete &&
 589             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 590                                dst, cookie) == NULL) {
 591                 sk_dst_reset(sk);
 592                 dst_release(dst);
 593                 return NULL;
 594         }
 595
 596         return dst;
 597 }
 598 EXPORT_SYMBOL(sk_dst_check);
 599
 600 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 601 {
 602         int ret = -ENOPROTOOPT;
 603 #ifdef CONFIG_NETDEVICES
 604         struct net *net = sock_net(sk);
 605
 606         /* Sorry... */
 607         ret = -EPERM;
 608         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 609                 goto out;
 610
 611         ret = -EINVAL;
 612         if (ifindex < 0)
 613                 goto out;
 614
 615         sk->sk_bound_dev_if = ifindex;
 616         if (sk->sk_prot->rehash)
 617                 sk->sk_prot->rehash(sk);
 618         sk_dst_reset(sk);
 619
 620         ret = 0;
 621
 622 out:
 623 #endif
 624
 625         return ret;
 626 }
 627
 628 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 629 {
 630         int ret;
 631
 632         if (lock_sk)
 633                 lock_sock(sk);
 634         ret = sock_bindtoindex_locked(sk, ifindex);
 635         if (lock_sk)
 636                 release_sock(sk);
 637
 638         return ret;
 639 }
 640 EXPORT_SYMBOL(sock_bindtoindex);
 641
 642 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 643 {
 644         int ret = -ENOPROTOOPT;
 645 #ifdef CONFIG_NETDEVICES
 646         struct net *net = sock_net(sk);
 647         char devname[IFNAMSIZ];
 648         int index;
 649
 650         ret = -EINVAL;
 651         if (optlen < 0)
 652                 goto out;
 653
 654         /* Bind this socket to a particular device like "eth0",
 655          * as specified in the passed interface name. If the
 656          * name is "" or the option length is zero the socket
 657          * is not bound.
 658          */
 659         if (optlen > IFNAMSIZ - 1)
 660                 optlen = IFNAMSIZ - 1;
 661         memset(devname, 0, sizeof(devname));
 662
 663         ret = -EFAULT;
 664         if (copy_from_sockptr(devname, optval, optlen))
 665                 goto out;
 666
 667         index = 0;
 668         if (devname[0] != '\0') {
 669                 struct net_device *dev;
 670
 671                 rcu_read_lock();
 672                 dev = dev_get_by_name_rcu(net, devname);
 673                 if (dev)
 674                         index = dev->ifindex;
 675                 rcu_read_unlock();
 676                 ret = -ENODEV;
 677                 if (!dev)
 678                         goto out;
 679         }
 680
 681         return sock_bindtoindex(sk, index, true);
 682 out:
 683 #endif
 684
 685         return ret;
 686 }
 687
 688 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 689                                 int __user *optlen, int len)
 690 {
 691         int ret = -ENOPROTOOPT;
 692 #ifdef CONFIG_NETDEVICES
 693         struct net *net = sock_net(sk);
 694         char devname[IFNAMSIZ];
 695
 696         if (sk->sk_bound_dev_if == 0) {
 697                 len = 0;
 698                 goto zero;
 699         }
 700
 701         ret = -EINVAL;
 702         if (len < IFNAMSIZ)
 703                 goto out;
 704
 705         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 706         if (ret)
 707                 goto out;
 708
 709         len = strlen(devname) + 1;
 710
 711         ret = -EFAULT;
 712         if (copy_to_user(optval, devname, len))
 713                 goto out;
 714
 715 zero:
 716         ret = -EFAULT;
 717         if (put_user(len, optlen))
 718                 goto out;
 719
 720         ret = 0;
 721
 722 out:
 723 #endif
 724
 725         return ret;
 726 }
 727
 728 bool sk_mc_loop(struct sock *sk)
 729 {
 730         if (dev_recursion_level())
 731                 return false;
 732         if (!sk)
 733                 return true;
 734         switch (sk->sk_family) {
 735         case AF_INET:
 736                 return inet_sk(sk)->mc_loop;
 737 #if IS_ENABLED(CONFIG_IPV6)
 738         case AF_INET6:
 739                 return inet6_sk(sk)->mc_loop;
 740 #endif
 741         }
 742         WARN_ON_ONCE(1);
 743         return true;
 744 }
 745 EXPORT_SYMBOL(sk_mc_loop);
 746
 747 void sock_set_reuseaddr(struct sock *sk)
 748 {
 749         lock_sock(sk);
 750         sk->sk_reuse = SK_CAN_REUSE;
 751         release_sock(sk);
 752 }
 753 EXPORT_SYMBOL(sock_set_reuseaddr);
 754
 755 void sock_set_reuseport(struct sock *sk)
 756 {
 757         lock_sock(sk);
 758         sk->sk_reuseport = true;
 759         release_sock(sk);
 760 }
 761 EXPORT_SYMBOL(sock_set_reuseport);
 762
 763 void sock_no_linger(struct sock *sk)
 764 {
 765         lock_sock(sk);
 766         sk->sk_lingertime = 0;
 767         sock_set_flag(sk, SOCK_LINGER);
 768         release_sock(sk);
 769 }
 770 EXPORT_SYMBOL(sock_no_linger);
 771
 772 void sock_set_priority(struct sock *sk, u32 priority)
 773 {
 774         lock_sock(sk);
 775         sk->sk_priority = priority;
 776         release_sock(sk);
 777 }
 778 EXPORT_SYMBOL(sock_set_priority);
 779
 780 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 781 {
 782         lock_sock(sk);
 783         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 784                 sk->sk_sndtimeo = secs * HZ;
 785         else
 786                 sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
 787         release_sock(sk);
 788 }
 789 EXPORT_SYMBOL(sock_set_sndtimeo);
 790
 791 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 792 {
 793         if (val)  {
 794                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 795                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 796                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 797                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 798         } else {
 799                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 800                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 801         }
 802 }
 803
 804 void sock_enable_timestamps(struct sock *sk)
 805 {
 806         lock_sock(sk);
 807         __sock_set_timestamps(sk, true, false, true);
 808         release_sock(sk);
 809 }
 810 EXPORT_SYMBOL(sock_enable_timestamps);
 811
 812 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 813 {
 814         switch (optname) {
 815         case SO_TIMESTAMP_OLD:
 816                 __sock_set_timestamps(sk, valbool, false, false);
 817                 break;
 818         case SO_TIMESTAMP_NEW:
 819                 __sock_set_timestamps(sk, valbool, true, false);
 820                 break;
 821         case SO_TIMESTAMPNS_OLD:
 822                 __sock_set_timestamps(sk, valbool, false, true);
 823                 break;
 824         case SO_TIMESTAMPNS_NEW:
 825                 __sock_set_timestamps(sk, valbool, true, true);
 826                 break;
 827         }
 828 }
 829
 830 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 831 {
 832         struct net *net = sock_net(sk);
 833         struct net_device *dev = NULL;
 834         bool match = false;
 835         int *vclock_index;
 836         int i, num;
 837
 838         if (sk->sk_bound_dev_if)
 839                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 840
 841         if (!dev) {
 842                 pr_err("%s: sock not bind to device\n", __func__);
 843                 return -EOPNOTSUPP;
 844         }
 845
 846         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 847         for (i = 0; i < num; i++) {
 848                 if (*(vclock_index + i) == phc_index) {
 849                         match = true;
 850                         break;
 851                 }
 852         }
 853
 854         if (num > 0)
 855                 kfree(vclock_index);
 856
 857         if (!match)
 858                 return -EINVAL;
 859
 860         sk->sk_bind_phc = phc_index;
 861
 862         return 0;
 863 }
 864
 865 int sock_set_timestamping(struct sock *sk, int optname,
 866                           struct so_timestamping timestamping)
 867 {
 868         int val = timestamping.flags;
 869         int ret;
 870
 871         if (val & ~SOF_TIMESTAMPING_MASK)
 872                 return -EINVAL;
 873
 874         if (val & SOF_TIMESTAMPING_OPT_ID &&
 875             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 876                 if (sk_is_tcp(sk)) {
 877                         if ((1 << sk->sk_state) &
 878                             (TCPF_CLOSE | TCPF_LISTEN))
 879                                 return -EINVAL;
 880                         sk->sk_tskey = tcp_sk(sk)->snd_una;
 881                 } else {
 882                         sk->sk_tskey = 0;
 883                 }
 884         }
 885
 886         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 887             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 888                 return -EINVAL;
 889
 890         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 891                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 892                 if (ret)
 893                         return ret;
 894         }
 895
 896         sk->sk_tsflags = val;
 897         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 898
 899         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 900                 sock_enable_timestamp(sk,
 901                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 902         else
 903                 sock_disable_timestamp(sk,
 904                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 905         return 0;
 906 }
 907
 908 void sock_set_keepalive(struct sock *sk)
 909 {
 910         lock_sock(sk);
 911         if (sk->sk_prot->keepalive)
 912                 sk->sk_prot->keepalive(sk, true);
 913         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 914         release_sock(sk);
 915 }
 916 EXPORT_SYMBOL(sock_set_keepalive);
 917
 918 static void __sock_set_rcvbuf(struct sock *sk, int val)
 919 {
 920         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 921          * as a negative value.
 922          */
 923         val = min_t(int, val, INT_MAX / 2);
 924         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 925
 926         /* We double it on the way in to account for "struct sk_buff" etc.
 927          * overhead.   Applications assume that the SO_RCVBUF setting they make
 928          * will allow that much actual data to be received on that socket.
 929          *
 930          * Applications are unaware that "struct sk_buff" and other overheads
 931          * allocate from the receive buffer during socket buffer allocation.
 932          *
 933          * And after considering the possible alternatives, returning the value
 934          * we actually used in getsockopt is the most desirable behavior.
 935          */
 936         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 937 }
 938
 939 void sock_set_rcvbuf(struct sock *sk, int val)
 940 {
 941         lock_sock(sk);
 942         __sock_set_rcvbuf(sk, val);
 943         release_sock(sk);
 944 }
 945 EXPORT_SYMBOL(sock_set_rcvbuf);
 946
 947 static void __sock_set_mark(struct sock *sk, u32 val)
 948 {
 949         if (val != sk->sk_mark) {
 950                 sk->sk_mark = val;
 951                 sk_dst_reset(sk);
 952         }
 953 }
 954
 955 void sock_set_mark(struct sock *sk, u32 val)
 956 {
 957         lock_sock(sk);
 958         __sock_set_mark(sk, val);
 959         release_sock(sk);
 960 }
 961 EXPORT_SYMBOL(sock_set_mark);
 962
 963 static void sock_release_reserved_memory(struct sock *sk, int bytes)
 964 {
 965         /* Round down bytes to multiple of pages */
 966         bytes &= ~(SK_MEM_QUANTUM - 1);
 967
 968         WARN_ON(bytes > sk->sk_reserved_mem);
 969         sk->sk_reserved_mem -= bytes;
 970         sk_mem_reclaim(sk);
 971 }
 972
 973 static int sock_reserve_memory(struct sock *sk, int bytes)
 974 {
 975         long allocated;
 976         bool charged;
 977         int pages;
 978
 979         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
 980                 return -EOPNOTSUPP;
 981
 982         if (!bytes)
 983                 return 0;
 984
 985         pages = sk_mem_pages(bytes);
 986
 987         /* pre-charge to memcg */
 988         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
 989                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
 990         if (!charged)
 991                 return -ENOMEM;
 992
 993         /* pre-charge to forward_alloc */
 994         allocated = sk_memory_allocated_add(sk, pages);
 995         /* If the system goes into memory pressure with this
 996          * precharge, give up and return error.
 997          */
 998         if (allocated > sk_prot_mem_limits(sk, 1)) {
 999                 sk_memory_allocated_sub(sk, pages);
1000                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1001                 return -ENOMEM;
1002         }
1003         sk->sk_forward_alloc += pages << SK_MEM_QUANTUM_SHIFT;
1004
1005         sk->sk_reserved_mem += pages << SK_MEM_QUANTUM_SHIFT;
1006
1007         return 0;
1008 }
1009
1010 /*
1011  *      This is meant for all protocols to use and covers goings on
1012  *      at the socket level. Everything here is generic.
1013  */
1014
1015 int sock_setsockopt(struct socket *sock, int level, int optname,
1016                     sockptr_t optval, unsigned int optlen)
1017 {
1018         struct so_timestamping timestamping;
1019         struct sock_txtime sk_txtime;
1020         struct sock *sk = sock->sk;
1021         int val;
1022         int valbool;
1023         struct linger ling;
1024         int ret = 0;
1025
1026         /*
1027          *      Options without arguments
1028          */
1029
1030         if (optname == SO_BINDTODEVICE)
1031                 return sock_setbindtodevice(sk, optval, optlen);
1032
1033         if (optlen < sizeof(int))
1034                 return -EINVAL;
1035
1036         if (copy_from_sockptr(&val, optval, sizeof(val)))
1037                 return -EFAULT;
1038
1039         valbool = val ? 1 : 0;
1040
1041         lock_sock(sk);
1042
1043         switch (optname) {
1044         case SO_DEBUG:
1045                 if (val && !capable(CAP_NET_ADMIN))
1046                         ret = -EACCES;
1047                 else
1048                         sock_valbool_flag(sk, SOCK_DBG, valbool);
1049                 break;
1050         case SO_REUSEADDR:
1051                 sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
1052                 break;
1053         case SO_REUSEPORT:
1054                 sk->sk_reuseport = valbool;
1055                 break;
1056         case SO_TYPE:
1057         case SO_PROTOCOL:
1058         case SO_DOMAIN:
1059         case SO_ERROR:
1060                 ret = -ENOPROTOOPT;
1061                 break;
1062         case SO_DONTROUTE:
1063                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
1064                 sk_dst_reset(sk);
1065                 break;
1066         case SO_BROADCAST:
1067                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
1068                 break;
1069         case SO_SNDBUF:
1070                 /* Don't error on this BSD doesn't and if you think
1071                  * about it this is right. Otherwise apps have to
1072                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1073                  * are treated in BSD as hints
1074                  */
1075                 val = min_t(u32, val, sysctl_wmem_max);
1076 set_sndbuf:
1077                 /* Ensure val * 2 fits into an int, to prevent max_t()
1078                  * from treating it as a negative value.
1079                  */
1080                 val = min_t(int, val, INT_MAX / 2);
1081                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
1082                 WRITE_ONCE(sk->sk_sndbuf,
1083                            max_t(int, val * 2, SOCK_MIN_SNDBUF));
1084                 /* Wake up sending tasks if we upped the value. */
1085                 sk->sk_write_space(sk);
1086                 break;
1087
1088         case SO_SNDBUFFORCE:
1089                 if (!capable(CAP_NET_ADMIN)) {
1090                         ret = -EPERM;
1091                         break;
1092                 }
1093
1094                 /* No negative values (to prevent underflow, as val will be
1095                  * multiplied by 2).
1096                  */
1097                 if (val < 0)
1098                         val = 0;
1099                 goto set_sndbuf;
1100
1101         case SO_RCVBUF:
1102                 /* Don't error on this BSD doesn't and if you think
1103                  * about it this is right. Otherwise apps have to
1104                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
1105                  * are treated in BSD as hints
1106                  */
1107                 __sock_set_rcvbuf(sk, min_t(u32, val, sysctl_rmem_max));
1108                 break;
1109
1110         case SO_RCVBUFFORCE:
1111                 if (!capable(CAP_NET_ADMIN)) {
1112                         ret = -EPERM;
1113                         break;
1114                 }
1115
1116                 /* No negative values (to prevent underflow, as val will be
1117                  * multiplied by 2).
1118                  */
1119                 __sock_set_rcvbuf(sk, max(val, 0));
1120                 break;
1121
1122         case SO_KEEPALIVE:
1123                 if (sk->sk_prot->keepalive)
1124                         sk->sk_prot->keepalive(sk, valbool);
1125                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
1126                 break;
1127
1128         case SO_OOBINLINE:
1129                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
1130                 break;
1131
1132         case SO_NO_CHECK:
1133                 sk->sk_no_check_tx = valbool;
1134                 break;
1135
1136         case SO_PRIORITY:
1137                 if ((val >= 0 && val <= 6) ||
1138                     ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
1139                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1140                         sk->sk_priority = val;
1141                 else
1142                         ret = -EPERM;
1143                 break;
1144
1145         case SO_LINGER:
1146                 if (optlen < sizeof(ling)) {
1147                         ret = -EINVAL;  /* 1003.1g */
1148                         break;
1149                 }
1150                 if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
1151                         ret = -EFAULT;
1152                         break;
1153                 }
1154                 if (!ling.l_onoff)
1155                         sock_reset_flag(sk, SOCK_LINGER);
1156                 else {
1157 #if (BITS_PER_LONG == 32)
1158                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
1159                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
1160                         else
1161 #endif
1162                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
1163                         sock_set_flag(sk, SOCK_LINGER);
1164                 }
1165                 break;
1166
1167         case SO_BSDCOMPAT:
1168                 break;
1169
1170         case SO_PASSCRED:
1171                 if (valbool)
1172                         set_bit(SOCK_PASSCRED, &sock->flags);
1173                 else
1174                         clear_bit(SOCK_PASSCRED, &sock->flags);
1175                 break;
1176
1177         case SO_TIMESTAMP_OLD:
1178         case SO_TIMESTAMP_NEW:
1179         case SO_TIMESTAMPNS_OLD:
1180         case SO_TIMESTAMPNS_NEW:
1181                 sock_set_timestamp(sk, optname, valbool);
1182                 break;
1183
1184         case SO_TIMESTAMPING_NEW:
1185         case SO_TIMESTAMPING_OLD:
1186                 if (optlen == sizeof(timestamping)) {
1187                         if (copy_from_sockptr(&timestamping, optval,
1188                                               sizeof(timestamping))) {
1189                                 ret = -EFAULT;
1190                                 break;
1191                         }
1192                 } else {
1193                         memset(&timestamping, 0, sizeof(timestamping));
1194                         timestamping.flags = val;
1195                 }
1196                 ret = sock_set_timestamping(sk, optname, timestamping);
1197                 break;
1198
1199         case SO_RCVLOWAT:
1200                 if (val < 0)
1201                         val = INT_MAX;
1202                 if (sock->ops->set_rcvlowat)
1203                         ret = sock->ops->set_rcvlowat(sk, val);
1204                 else
1205                         WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
1206                 break;
1207
1208         case SO_RCVTIMEO_OLD:
1209         case SO_RCVTIMEO_NEW:
1210                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
1211                                        optlen, optname == SO_RCVTIMEO_OLD);
1212                 break;
1213
1214         case SO_SNDTIMEO_OLD:
1215         case SO_SNDTIMEO_NEW:
1216                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
1217                                        optlen, optname == SO_SNDTIMEO_OLD);
1218                 break;
1219
1220         case SO_ATTACH_FILTER: {
1221                 struct sock_fprog fprog;
1222
1223                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1224                 if (!ret)
1225                         ret = sk_attach_filter(&fprog, sk);
1226                 break;
1227         }
1228         case SO_ATTACH_BPF:
1229                 ret = -EINVAL;
1230                 if (optlen == sizeof(u32)) {
1231                         u32 ufd;
1232
1233                         ret = -EFAULT;
1234                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1235                                 break;
1236
1237                         ret = sk_attach_bpf(ufd, sk);
1238                 }
1239                 break;
1240
1241         case SO_ATTACH_REUSEPORT_CBPF: {
1242                 struct sock_fprog fprog;
1243
1244                 ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
1245                 if (!ret)
1246                         ret = sk_reuseport_attach_filter(&fprog, sk);
1247                 break;
1248         }
1249         case SO_ATTACH_REUSEPORT_EBPF:
1250                 ret = -EINVAL;
1251                 if (optlen == sizeof(u32)) {
1252                         u32 ufd;
1253
1254                         ret = -EFAULT;
1255                         if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
1256                                 break;
1257
1258                         ret = sk_reuseport_attach_bpf(ufd, sk);
1259                 }
1260                 break;
1261
1262         case SO_DETACH_REUSEPORT_BPF:
1263                 ret = reuseport_detach_prog(sk);
1264                 break;
1265
1266         case SO_DETACH_FILTER:
1267                 ret = sk_detach_filter(sk);
1268                 break;
1269
1270         case SO_LOCK_FILTER:
1271                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
1272                         ret = -EPERM;
1273                 else
1274                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1275                 break;
1276
1277         case SO_PASSSEC:
1278                 if (valbool)
1279                         set_bit(SOCK_PASSSEC, &sock->flags);
1280                 else
1281                         clear_bit(SOCK_PASSSEC, &sock->flags);
1282                 break;
1283         case SO_MARK:
1284                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
1285                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1286                         ret = -EPERM;
1287                         break;
1288                 }
1289
1290                 __sock_set_mark(sk, val);
1291                 break;
1292
1293         case SO_RXQ_OVFL:
1294                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1295                 break;
1296
1297         case SO_WIFI_STATUS:
1298                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1299                 break;
1300
1301         case SO_PEEK_OFF:
1302                 if (sock->ops->set_peek_off)
1303                         ret = sock->ops->set_peek_off(sk, val);
1304                 else
1305                         ret = -EOPNOTSUPP;
1306                 break;
1307
1308         case SO_NOFCS:
1309                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1310                 break;
1311
1312         case SO_SELECT_ERR_QUEUE:
1313                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1314                 break;
1315
1316 #ifdef CONFIG_NET_RX_BUSY_POLL
1317         case SO_BUSY_POLL:
1318                 /* allow unprivileged users to decrease the value */
1319                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1320                         ret = -EPERM;
1321                 else {
1322                         if (val < 0)
1323                                 ret = -EINVAL;
1324                         else
1325                                 WRITE_ONCE(sk->sk_ll_usec, val);
1326                 }
1327                 break;
1328         case SO_PREFER_BUSY_POLL:
1329                 if (valbool && !capable(CAP_NET_ADMIN))
1330                         ret = -EPERM;
1331                 else
1332                         WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
1333                 break;
1334         case SO_BUSY_POLL_BUDGET:
1335                 if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
1336                         ret = -EPERM;
1337                 } else {
1338                         if (val < 0 || val > U16_MAX)
1339                                 ret = -EINVAL;
1340                         else
1341                                 WRITE_ONCE(sk->sk_busy_poll_budget, val);
1342                 }
1343                 break;
1344 #endif
1345
1346         case SO_MAX_PACING_RATE:
1347                 {
1348                 unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
1349
1350                 if (sizeof(ulval) != sizeof(val) &&
1351                     optlen >= sizeof(ulval) &&
1352                     copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
1353                         ret = -EFAULT;
1354                         break;
1355                 }
1356                 if (ulval != ~0UL)
1357                         cmpxchg(&sk->sk_pacing_status,
1358                                 SK_PACING_NONE,
1359                                 SK_PACING_NEEDED);
1360                 sk->sk_max_pacing_rate = ulval;
1361                 sk->sk_pacing_rate = min(sk->sk_pacing_rate, ulval);
1362                 break;
1363                 }
1364         case SO_INCOMING_CPU:
1365                 WRITE_ONCE(sk->sk_incoming_cpu, val);
1366                 break;
1367
1368         case SO_CNX_ADVICE:
1369                 if (val == 1)
1370                         dst_negative_advice(sk);
1371                 break;
1372
1373         case SO_ZEROCOPY:
1374                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1375                         if (!(sk_is_tcp(sk) ||
1376                               (sk->sk_type == SOCK_DGRAM &&
1377                                sk->sk_protocol == IPPROTO_UDP)))
1378                                 ret = -ENOTSUPP;
1379                 } else if (sk->sk_family != PF_RDS) {
1380                         ret = -ENOTSUPP;
1381                 }
1382                 if (!ret) {
1383                         if (val < 0 || val > 1)
1384                                 ret = -EINVAL;
1385                         else
1386                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1387                 }
1388                 break;
1389
1390         case SO_TXTIME:
1391                 if (optlen != sizeof(struct sock_txtime)) {
1392                         ret = -EINVAL;
1393                         break;
1394                 } else if (copy_from_sockptr(&sk_txtime, optval,
1395                            sizeof(struct sock_txtime))) {
1396                         ret = -EFAULT;
1397                         break;
1398                 } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
1399                         ret = -EINVAL;
1400                         break;
1401                 }
1402                 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1403                  * scheduler has enough safe guards.
1404                  */
1405                 if (sk_txtime.clockid != CLOCK_MONOTONIC &&
1406                     !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
1407                         ret = -EPERM;
1408                         break;
1409                 }
1410                 sock_valbool_flag(sk, SOCK_TXTIME, true);
1411                 sk->sk_clockid = sk_txtime.clockid;
1412                 sk->sk_txtime_deadline_mode =
1413                         !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
1414                 sk->sk_txtime_report_errors =
1415                         !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
1416                 break;
1417
1418         case SO_BINDTOIFINDEX:
1419                 ret = sock_bindtoindex_locked(sk, val);
1420                 break;
1421
1422         case SO_BUF_LOCK:
1423                 if (val & ~SOCK_BUF_LOCK_MASK) {
1424                         ret = -EINVAL;
1425                         break;
1426                 }
1427                 sk->sk_userlocks = val | (sk->sk_userlocks &
1428                                           ~SOCK_BUF_LOCK_MASK);
1429                 break;
1430
1431         case SO_RESERVE_MEM:
1432         {
1433                 int delta;
1434
1435                 if (val < 0) {
1436                         ret = -EINVAL;
1437                         break;
1438                 }
1439
1440                 delta = val - sk->sk_reserved_mem;
1441                 if (delta < 0)
1442                         sock_release_reserved_memory(sk, -delta);
1443                 else
1444                         ret = sock_reserve_memory(sk, delta);
1445                 break;
1446         }
1447
1448         default:
1449                 ret = -ENOPROTOOPT;
1450                 break;
1451         }
1452         release_sock(sk);
1453         return ret;
1454 }
1455 EXPORT_SYMBOL(sock_setsockopt);
1456
1457 static const struct cred *sk_get_peer_cred(struct sock *sk)
1458 {
1459         const struct cred *cred;
1460
1461         spin_lock(&sk->sk_peer_lock);
1462         cred = get_cred(sk->sk_peer_cred);
1463         spin_unlock(&sk->sk_peer_lock);
1464
1465         return cred;
1466 }
1467
1468 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1469                           struct ucred *ucred)
1470 {
1471         ucred->pid = pid_vnr(pid);
1472         ucred->uid = ucred->gid = -1;
1473         if (cred) {
1474                 struct user_namespace *current_ns = current_user_ns();
1475
1476                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1477                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1478         }
1479 }
1480
1481 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1482 {
1483         struct user_namespace *user_ns = current_user_ns();
1484         int i;
1485
1486         for (i = 0; i < src->ngroups; i++)
1487                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1488                         return -EFAULT;
1489
1490         return 0;
1491 }
1492
1493 int sock_getsockopt(struct socket *sock, int level, int optname,
1494                     char __user *optval, int __user *optlen)
1495 {
1496         struct sock *sk = sock->sk;
1497
1498         union {
1499                 int val;
1500                 u64 val64;
1501                 unsigned long ulval;
1502                 struct linger ling;
1503                 struct old_timeval32 tm32;
1504                 struct __kernel_old_timeval tm;
1505                 struct  __kernel_sock_timeval stm;
1506                 struct sock_txtime txtime;
1507                 struct so_timestamping timestamping;
1508         } v;
1509
1510         int lv = sizeof(int);
1511         int len;
1512
1513         if (get_user(len, optlen))
1514                 return -EFAULT;
1515         if (len < 0)
1516                 return -EINVAL;
1517
1518         memset(&v, 0, sizeof(v));
1519
1520         switch (optname) {
1521         case SO_DEBUG:
1522                 v.val = sock_flag(sk, SOCK_DBG);
1523                 break;
1524
1525         case SO_DONTROUTE:
1526                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1527                 break;
1528
1529         case SO_BROADCAST:
1530                 v.val = sock_flag(sk, SOCK_BROADCAST);
1531                 break;
1532
1533         case SO_SNDBUF:
1534                 v.val = sk->sk_sndbuf;
1535                 break;
1536
1537         case SO_RCVBUF:
1538                 v.val = sk->sk_rcvbuf;
1539                 break;
1540
1541         case SO_REUSEADDR:
1542                 v.val = sk->sk_reuse;
1543                 break;
1544
1545         case SO_REUSEPORT:
1546                 v.val = sk->sk_reuseport;
1547                 break;
1548
1549         case SO_KEEPALIVE:
1550                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1551                 break;
1552
1553         case SO_TYPE:
1554                 v.val = sk->sk_type;
1555                 break;
1556
1557         case SO_PROTOCOL:
1558                 v.val = sk->sk_protocol;
1559                 break;
1560
1561         case SO_DOMAIN:
1562                 v.val = sk->sk_family;
1563                 break;
1564
1565         case SO_ERROR:
1566                 v.val = -sock_error(sk);
1567                 if (v.val == 0)
1568                         v.val = xchg(&sk->sk_err_soft, 0);
1569                 break;
1570
1571         case SO_OOBINLINE:
1572                 v.val = sock_flag(sk, SOCK_URGINLINE);
1573                 break;
1574
1575         case SO_NO_CHECK:
1576                 v.val = sk->sk_no_check_tx;
1577                 break;
1578
1579         case SO_PRIORITY:
1580                 v.val = sk->sk_priority;
1581                 break;
1582
1583         case SO_LINGER:
1584                 lv              = sizeof(v.ling);
1585                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1586                 v.ling.l_linger = sk->sk_lingertime / HZ;
1587                 break;
1588
1589         case SO_BSDCOMPAT:
1590                 break;
1591
1592         case SO_TIMESTAMP_OLD:
1593                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1594                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1595                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1596                 break;
1597
1598         case SO_TIMESTAMPNS_OLD:
1599                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1600                 break;
1601
1602         case SO_TIMESTAMP_NEW:
1603                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1604                 break;
1605
1606         case SO_TIMESTAMPNS_NEW:
1607                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1608                 break;
1609
1610         case SO_TIMESTAMPING_OLD:
1611                 lv = sizeof(v.timestamping);
1612                 v.timestamping.flags = sk->sk_tsflags;
1613                 v.timestamping.bind_phc = sk->sk_bind_phc;
1614                 break;
1615
1616         case SO_RCVTIMEO_OLD:
1617         case SO_RCVTIMEO_NEW:
1618                 lv = sock_get_timeout(sk->sk_rcvtimeo, &v, SO_RCVTIMEO_OLD == optname);
1619                 break;
1620
1621         case SO_SNDTIMEO_OLD:
1622         case SO_SNDTIMEO_NEW:
1623                 lv = sock_get_timeout(sk->sk_sndtimeo, &v, SO_SNDTIMEO_OLD == optname);
1624                 break;
1625
1626         case SO_RCVLOWAT:
1627                 v.val = sk->sk_rcvlowat;
1628                 break;
1629
1630         case SO_SNDLOWAT:
1631                 v.val = 1;
1632                 break;
1633
1634         case SO_PASSCRED:
1635                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1636                 break;
1637
1638         case SO_PEERCRED:
1639         {
1640                 struct ucred peercred;
1641                 if (len > sizeof(peercred))
1642                         len = sizeof(peercred);
1643
1644                 spin_lock(&sk->sk_peer_lock);
1645                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1646                 spin_unlock(&sk->sk_peer_lock);
1647
1648                 if (copy_to_user(optval, &peercred, len))
1649                         return -EFAULT;
1650                 goto lenout;
1651         }
1652
1653         case SO_PEERGROUPS:
1654         {
1655                 const struct cred *cred;
1656                 int ret, n;
1657
1658                 cred = sk_get_peer_cred(sk);
1659                 if (!cred)
1660                         return -ENODATA;
1661
1662                 n = cred->group_info->ngroups;
1663                 if (len < n * sizeof(gid_t)) {
1664                         len = n * sizeof(gid_t);
1665                         put_cred(cred);
1666                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1667                 }
1668                 len = n * sizeof(gid_t);
1669
1670                 ret = groups_to_user((gid_t __user *)optval, cred->group_info);
1671                 put_cred(cred);
1672                 if (ret)
1673                         return ret;
1674                 goto lenout;
1675         }
1676
1677         case SO_PEERNAME:
1678         {
1679                 char address[128];
1680
1681                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1682                 if (lv < 0)
1683                         return -ENOTCONN;
1684                 if (lv < len)
1685                         return -EINVAL;
1686                 if (copy_to_user(optval, address, len))
1687                         return -EFAULT;
1688                 goto lenout;
1689         }
1690
1691         /* Dubious BSD thing... Probably nobody even uses it, but
1692          * the UNIX standard wants it for whatever reason... -DaveM
1693          */
1694         case SO_ACCEPTCONN:
1695                 v.val = sk->sk_state == TCP_LISTEN;
1696                 break;
1697
1698         case SO_PASSSEC:
1699                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1700                 break;
1701
1702         case SO_PEERSEC:
1703                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1704
1705         case SO_MARK:
1706                 v.val = sk->sk_mark;
1707                 break;
1708
1709         case SO_RXQ_OVFL:
1710                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1711                 break;
1712
1713         case SO_WIFI_STATUS:
1714                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1715                 break;
1716
1717         case SO_PEEK_OFF:
1718                 if (!sock->ops->set_peek_off)
1719                         return -EOPNOTSUPP;
1720
1721                 v.val = sk->sk_peek_off;
1722                 break;
1723         case SO_NOFCS:
1724                 v.val = sock_flag(sk, SOCK_NOFCS);
1725                 break;
1726
1727         case SO_BINDTODEVICE:
1728                 return sock_getbindtodevice(sk, optval, optlen, len);
1729
1730         case SO_GET_FILTER:
1731                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1732                 if (len < 0)
1733                         return len;
1734
1735                 goto lenout;
1736
1737         case SO_LOCK_FILTER:
1738                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1739                 break;
1740
1741         case SO_BPF_EXTENSIONS:
1742                 v.val = bpf_tell_extensions();
1743                 break;
1744
1745         case SO_SELECT_ERR_QUEUE:
1746                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1747                 break;
1748
1749 #ifdef CONFIG_NET_RX_BUSY_POLL
1750         case SO_BUSY_POLL:
1751                 v.val = sk->sk_ll_usec;
1752                 break;
1753         case SO_PREFER_BUSY_POLL:
1754                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1755                 break;
1756 #endif
1757
1758         case SO_MAX_PACING_RATE:
1759                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1760                         lv = sizeof(v.ulval);
1761                         v.ulval = sk->sk_max_pacing_rate;
1762                 } else {
1763                         /* 32bit version */
1764                         v.val = min_t(unsigned long, sk->sk_max_pacing_rate, ~0U);
1765                 }
1766                 break;
1767
1768         case SO_INCOMING_CPU:
1769                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1770                 break;
1771
1772         case SO_MEMINFO:
1773         {
1774                 u32 meminfo[SK_MEMINFO_VARS];
1775
1776                 sk_get_meminfo(sk, meminfo);
1777
1778                 len = min_t(unsigned int, len, sizeof(meminfo));
1779                 if (copy_to_user(optval, &meminfo, len))
1780                         return -EFAULT;
1781
1782                 goto lenout;
1783         }
1784
1785 #ifdef CONFIG_NET_RX_BUSY_POLL
1786         case SO_INCOMING_NAPI_ID:
1787                 v.val = READ_ONCE(sk->sk_napi_id);
1788
1789                 /* aggregate non-NAPI IDs down to 0 */
1790                 if (v.val < MIN_NAPI_ID)
1791                         v.val = 0;
1792
1793                 break;
1794 #endif
1795
1796         case SO_COOKIE:
1797                 lv = sizeof(u64);
1798                 if (len < lv)
1799                         return -EINVAL;
1800                 v.val64 = sock_gen_cookie(sk);
1801                 break;
1802
1803         case SO_ZEROCOPY:
1804                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1805                 break;
1806
1807         case SO_TXTIME:
1808                 lv = sizeof(v.txtime);
1809                 v.txtime.clockid = sk->sk_clockid;
1810                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1811                                   SOF_TXTIME_DEADLINE_MODE : 0;
1812                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1813                                   SOF_TXTIME_REPORT_ERRORS : 0;
1814                 break;
1815
1816         case SO_BINDTOIFINDEX:
1817                 v.val = sk->sk_bound_dev_if;
1818                 break;
1819
1820         case SO_NETNS_COOKIE:
1821                 lv = sizeof(u64);
1822                 if (len != lv)
1823                         return -EINVAL;
1824                 v.val64 = sock_net(sk)->net_cookie;
1825                 break;
1826
1827         case SO_BUF_LOCK:
1828                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1829                 break;
1830
1831         case SO_RESERVE_MEM:
1832                 v.val = sk->sk_reserved_mem;
1833                 break;
1834
1835         default:
1836                 /* We implement the SO_SNDLOWAT etc to not be settable
1837                  * (1003.1g 7).
1838                  */
1839                 return -ENOPROTOOPT;
1840         }
1841
1842         if (len > lv)
1843                 len = lv;
1844         if (copy_to_user(optval, &v, len))
1845                 return -EFAULT;
1846 lenout:
1847         if (put_user(len, optlen))
1848                 return -EFAULT;
1849         return 0;
1850 }
1851
1852 /*
1853  * Initialize an sk_lock.
1854  *
1855  * (We also register the sk_lock with the lock validator.)
1856  */
1857 static inline void sock_lock_init(struct sock *sk)
1858 {
1859         if (sk->sk_kern_sock)
1860                 sock_lock_init_class_and_name(
1861                         sk,
1862                         af_family_kern_slock_key_strings[sk->sk_family],
1863                         af_family_kern_slock_keys + sk->sk_family,
1864                         af_family_kern_key_strings[sk->sk_family],
1865                         af_family_kern_keys + sk->sk_family);
1866         else
1867                 sock_lock_init_class_and_name(
1868                         sk,
1869                         af_family_slock_key_strings[sk->sk_family],
1870                         af_family_slock_keys + sk->sk_family,
1871                         af_family_key_strings[sk->sk_family],
1872                         af_family_keys + sk->sk_family);
1873 }
1874
1875 /*
1876  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1877  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1878  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1879  */
1880 static void sock_copy(struct sock *nsk, const struct sock *osk)
1881 {
1882         const struct proto *prot = READ_ONCE(osk->sk_prot);
1883 #ifdef CONFIG_SECURITY_NETWORK
1884         void *sptr = nsk->sk_security;
1885 #endif
1886
1887         /* If we move sk_tx_queue_mapping out of the private section,
1888          * we must check if sk_tx_queue_clear() is called after
1889          * sock_copy() in sk_clone_lock().
1890          */
1891         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
1892                      offsetof(struct sock, sk_dontcopy_begin) ||
1893                      offsetof(struct sock, sk_tx_queue_mapping) >=
1894                      offsetof(struct sock, sk_dontcopy_end));
1895
1896         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1897
1898         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1899                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1900
1901 #ifdef CONFIG_SECURITY_NETWORK
1902         nsk->sk_security = sptr;
1903         security_sk_clone(osk, nsk);
1904 #endif
1905 }
1906
1907 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1908                 int family)
1909 {
1910         struct sock *sk;
1911         struct kmem_cache *slab;
1912
1913         slab = prot->slab;
1914         if (slab != NULL) {
1915                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1916                 if (!sk)
1917                         return sk;
1918                 if (want_init_on_alloc(priority))
1919                         sk_prot_clear_nulls(sk, prot->obj_size);
1920         } else
1921                 sk = kmalloc(prot->obj_size, priority);
1922
1923         if (sk != NULL) {
1924                 if (security_sk_alloc(sk, family, priority))
1925                         goto out_free;
1926
1927                 if (!try_module_get(prot->owner))
1928                         goto out_free_sec;
1929         }
1930
1931         return sk;
1932
1933 out_free_sec:
1934         security_sk_free(sk);
1935 out_free:
1936         if (slab != NULL)
1937                 kmem_cache_free(slab, sk);
1938         else
1939                 kfree(sk);
1940         return NULL;
1941 }
1942
1943 static void sk_prot_free(struct proto *prot, struct sock *sk)
1944 {
1945         struct kmem_cache *slab;
1946         struct module *owner;
1947
1948         owner = prot->owner;
1949         slab = prot->slab;
1950
1951         cgroup_sk_free(&sk->sk_cgrp_data);
1952         mem_cgroup_sk_free(sk);
1953         security_sk_free(sk);
1954         if (slab != NULL)
1955                 kmem_cache_free(slab, sk);
1956         else
1957                 kfree(sk);
1958         module_put(owner);
1959 }
1960
1961 /**
1962  *      sk_alloc - All socket objects are allocated here
1963  *      @net: the applicable net namespace
1964  *      @family: protocol family
1965  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1966  *      @prot: struct proto associated with this new sock instance
1967  *      @kern: is this to be a kernel socket?
1968  */
1969 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1970                       struct proto *prot, int kern)
1971 {
1972         struct sock *sk;
1973
1974         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1975         if (sk) {
1976                 sk->sk_family = family;
1977                 /*
1978                  * See comment in struct sock definition to understand
1979                  * why we need sk_prot_creator -acme
1980                  */
1981                 sk->sk_prot = sk->sk_prot_creator = prot;
1982                 sk->sk_kern_sock = kern;
1983                 sock_lock_init(sk);
1984                 sk->sk_net_refcnt = kern ? 0 : 1;
1985                 if (likely(sk->sk_net_refcnt)) {
1986                         get_net_track(net, &sk->ns_tracker, priority);
1987                         sock_inuse_add(net, 1);
1988                 }
1989
1990                 sock_net_set(sk, net);
1991                 refcount_set(&sk->sk_wmem_alloc, 1);
1992
1993                 mem_cgroup_sk_alloc(sk);
1994                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1995                 sock_update_classid(&sk->sk_cgrp_data);
1996                 sock_update_netprioidx(&sk->sk_cgrp_data);
1997                 sk_tx_queue_clear(sk);
1998         }
1999
2000         return sk;
2001 }
2002 EXPORT_SYMBOL(sk_alloc);
2003
2004 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2005  * grace period. This is the case for UDP sockets and TCP listeners.
2006  */
2007 static void __sk_destruct(struct rcu_head *head)
2008 {
2009         struct sock *sk = container_of(head, struct sock, sk_rcu);
2010         struct sk_filter *filter;
2011
2012         if (sk->sk_destruct)
2013                 sk->sk_destruct(sk);
2014
2015         filter = rcu_dereference_check(sk->sk_filter,
2016                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2017         if (filter) {
2018                 sk_filter_uncharge(sk, filter);
2019                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2020         }
2021
2022         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2023
2024 #ifdef CONFIG_BPF_SYSCALL
2025         bpf_sk_storage_free(sk);
2026 #endif
2027
2028         if (atomic_read(&sk->sk_omem_alloc))
2029                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2030                          __func__, atomic_read(&sk->sk_omem_alloc));
2031
2032         if (sk->sk_frag.page) {
2033                 put_page(sk->sk_frag.page);
2034                 sk->sk_frag.page = NULL;
2035         }
2036
2037         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2038         put_cred(sk->sk_peer_cred);
2039         put_pid(sk->sk_peer_pid);
2040
2041         if (likely(sk->sk_net_refcnt))
2042                 put_net_track(sock_net(sk), &sk->ns_tracker);
2043         sk_prot_free(sk->sk_prot_creator, sk);
2044 }
2045
2046 void sk_destruct(struct sock *sk)
2047 {
2048         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2049
2050         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2051                 reuseport_detach_sock(sk);
2052                 use_call_rcu = true;
2053         }
2054
2055         if (use_call_rcu)
2056                 call_rcu(&sk->sk_rcu, __sk_destruct);
2057         else
2058                 __sk_destruct(&sk->sk_rcu);
2059 }
2060
2061 static void __sk_free(struct sock *sk)
2062 {
2063         if (likely(sk->sk_net_refcnt))
2064                 sock_inuse_add(sock_net(sk), -1);
2065
2066         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2067                 sock_diag_broadcast_destroy(sk);
2068         else
2069                 sk_destruct(sk);
2070 }
2071
2072 void sk_free(struct sock *sk)
2073 {
2074         /*
2075          * We subtract one from sk_wmem_alloc and can know if
2076          * some packets are still in some tx queue.
2077          * If not null, sock_wfree() will call __sk_free(sk) later
2078          */
2079         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2080                 __sk_free(sk);
2081 }
2082 EXPORT_SYMBOL(sk_free);
2083
2084 static void sk_init_common(struct sock *sk)
2085 {
2086         skb_queue_head_init(&sk->sk_receive_queue);
2087         skb_queue_head_init(&sk->sk_write_queue);
2088         skb_queue_head_init(&sk->sk_error_queue);
2089
2090         rwlock_init(&sk->sk_callback_lock);
2091         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2092                         af_rlock_keys + sk->sk_family,
2093                         af_family_rlock_key_strings[sk->sk_family]);
2094         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2095                         af_wlock_keys + sk->sk_family,
2096                         af_family_wlock_key_strings[sk->sk_family]);
2097         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2098                         af_elock_keys + sk->sk_family,
2099                         af_family_elock_key_strings[sk->sk_family]);
2100         lockdep_set_class_and_name(&sk->sk_callback_lock,
2101                         af_callback_keys + sk->sk_family,
2102                         af_family_clock_key_strings[sk->sk_family]);
2103 }
2104
2105 /**
2106  *      sk_clone_lock - clone a socket, and lock its clone
2107  *      @sk: the socket to clone
2108  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2109  *
2110  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2111  */
2112 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2113 {
2114         struct proto *prot = READ_ONCE(sk->sk_prot);
2115         struct sk_filter *filter;
2116         bool is_charged = true;
2117         struct sock *newsk;
2118
2119         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2120         if (!newsk)
2121                 goto out;
2122
2123         sock_copy(newsk, sk);
2124
2125         newsk->sk_prot_creator = prot;
2126
2127         /* SANITY */
2128         if (likely(newsk->sk_net_refcnt)) {
2129                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2130                 sock_inuse_add(sock_net(newsk), 1);
2131         }
2132         sk_node_init(&newsk->sk_node);
2133         sock_lock_init(newsk);
2134         bh_lock_sock(newsk);
2135         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2136         newsk->sk_backlog.len = 0;
2137
2138         atomic_set(&newsk->sk_rmem_alloc, 0);
2139
2140         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2141         refcount_set(&newsk->sk_wmem_alloc, 1);
2142
2143         atomic_set(&newsk->sk_omem_alloc, 0);
2144         sk_init_common(newsk);
2145
2146         newsk->sk_dst_cache     = NULL;
2147         newsk->sk_dst_pending_confirm = 0;
2148         newsk->sk_wmem_queued   = 0;
2149         newsk->sk_forward_alloc = 0;
2150         newsk->sk_reserved_mem  = 0;
2151         atomic_set(&newsk->sk_drops, 0);
2152         newsk->sk_send_head     = NULL;
2153         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2154         atomic_set(&newsk->sk_zckey, 0);
2155
2156         sock_reset_flag(newsk, SOCK_DONE);
2157
2158         /* sk->sk_memcg will be populated at accept() time */
2159         newsk->sk_memcg = NULL;
2160
2161         cgroup_sk_clone(&newsk->sk_cgrp_data);
2162
2163         rcu_read_lock();
2164         filter = rcu_dereference(sk->sk_filter);
2165         if (filter != NULL)
2166                 /* though it's an empty new sock, the charging may fail
2167                  * if sysctl_optmem_max was changed between creation of
2168                  * original socket and cloning
2169                  */
2170                 is_charged = sk_filter_charge(newsk, filter);
2171         RCU_INIT_POINTER(newsk->sk_filter, filter);
2172         rcu_read_unlock();
2173
2174         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2175                 /* We need to make sure that we don't uncharge the new
2176                  * socket if we couldn't charge it in the first place
2177                  * as otherwise we uncharge the parent's filter.
2178                  */
2179                 if (!is_charged)
2180                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2181                 sk_free_unlock_clone(newsk);
2182                 newsk = NULL;
2183                 goto out;
2184         }
2185         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2186
2187         if (bpf_sk_storage_clone(sk, newsk)) {
2188                 sk_free_unlock_clone(newsk);
2189                 newsk = NULL;
2190                 goto out;
2191         }
2192
2193         /* Clear sk_user_data if parent had the pointer tagged
2194          * as not suitable for copying when cloning.
2195          */
2196         if (sk_user_data_is_nocopy(newsk))
2197                 newsk->sk_user_data = NULL;
2198
2199         newsk->sk_err      = 0;
2200         newsk->sk_err_soft = 0;
2201         newsk->sk_priority = 0;
2202         newsk->sk_incoming_cpu = raw_smp_processor_id();
2203
2204         /* Before updating sk_refcnt, we must commit prior changes to memory
2205          * (Documentation/RCU/rculist_nulls.rst for details)
2206          */
2207         smp_wmb();
2208         refcount_set(&newsk->sk_refcnt, 2);
2209
2210         /* Increment the counter in the same struct proto as the master
2211          * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
2212          * is the same as sk->sk_prot->socks, as this field was copied
2213          * with memcpy).
2214          *
2215          * This _changes_ the previous behaviour, where
2216          * tcp_create_openreq_child always was incrementing the
2217          * equivalent to tcp_prot->socks (inet_sock_nr), so this have
2218          * to be taken into account in all callers. -acme
2219          */
2220         sk_refcnt_debug_inc(newsk);
2221         sk_set_socket(newsk, NULL);
2222         sk_tx_queue_clear(newsk);
2223         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2224
2225         if (newsk->sk_prot->sockets_allocated)
2226                 sk_sockets_allocated_inc(newsk);
2227
2228         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2229                 net_enable_timestamp();
2230 out:
2231         return newsk;
2232 }
2233 EXPORT_SYMBOL_GPL(sk_clone_lock);
2234
2235 void sk_free_unlock_clone(struct sock *sk)
2236 {
2237         /* It is still raw copy of parent, so invalidate
2238          * destructor and make plain sk_free() */
2239         sk->sk_destruct = NULL;
2240         bh_unlock_sock(sk);
2241         sk_free(sk);
2242 }
2243 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2244
2245 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2246 {
2247         u32 max_segs = 1;
2248
2249         sk_dst_set(sk, dst);
2250         sk->sk_route_caps = dst->dev->features;
2251         if (sk_is_tcp(sk))
2252                 sk->sk_route_caps |= NETIF_F_GSO;
2253         if (sk->sk_route_caps & NETIF_F_GSO)
2254                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2255         if (unlikely(sk->sk_gso_disabled))
2256                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2257         if (sk_can_gso(sk)) {
2258                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2259                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2260                 } else {
2261                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2262                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_size() */
2263                         sk->sk_gso_max_size = READ_ONCE(dst->dev->gso_max_size);
2264                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2265                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2266                 }
2267         }
2268         sk->sk_gso_max_segs = max_segs;
2269 }
2270 EXPORT_SYMBOL_GPL(sk_setup_caps);
2271
2272 /*
2273  *      Simple resource managers for sockets.
2274  */
2275
2276
2277 /*
2278  * Write buffer destructor automatically called from kfree_skb.
2279  */
2280 void sock_wfree(struct sk_buff *skb)
2281 {
2282         struct sock *sk = skb->sk;
2283         unsigned int len = skb->truesize;
2284
2285         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2286                 /*
2287                  * Keep a reference on sk_wmem_alloc, this will be released
2288                  * after sk_write_space() call
2289                  */
2290                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2291                 sk->sk_write_space(sk);
2292                 len = 1;
2293         }
2294         /*
2295          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2296          * could not do because of in-flight packets
2297          */
2298         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2299                 __sk_free(sk);
2300 }
2301 EXPORT_SYMBOL(sock_wfree);
2302
2303 /* This variant of sock_wfree() is used by TCP,
2304  * since it sets SOCK_USE_WRITE_QUEUE.
2305  */
2306 void __sock_wfree(struct sk_buff *skb)
2307 {
2308         struct sock *sk = skb->sk;
2309
2310         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2311                 __sk_free(sk);
2312 }
2313
2314 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2315 {
2316         skb_orphan(skb);
2317         skb->sk = sk;
2318 #ifdef CONFIG_INET
2319         if (unlikely(!sk_fullsock(sk))) {
2320                 skb->destructor = sock_edemux;
2321                 sock_hold(sk);
2322                 return;
2323         }
2324 #endif
2325         skb->destructor = sock_wfree;
2326         skb_set_hash_from_sk(skb, sk);
2327         /*
2328          * We used to take a refcount on sk, but following operation
2329          * is enough to guarantee sk_free() wont free this sock until
2330          * all in-flight packets are completed
2331          */
2332         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2333 }
2334 EXPORT_SYMBOL(skb_set_owner_w);
2335
2336 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2337 {
2338 #ifdef CONFIG_TLS_DEVICE
2339         /* Drivers depend on in-order delivery for crypto offload,
2340          * partial orphan breaks out-of-order-OK logic.
2341          */
2342         if (skb->decrypted)
2343                 return false;
2344 #endif
2345         return (skb->destructor == sock_wfree ||
2346                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2347 }
2348
2349 /* This helper is used by netem, as it can hold packets in its
2350  * delay queue. We want to allow the owner socket to send more
2351  * packets, as if they were already TX completed by a typical driver.
2352  * But we also want to keep skb->sk set because some packet schedulers
2353  * rely on it (sch_fq for example).
2354  */
2355 void skb_orphan_partial(struct sk_buff *skb)
2356 {
2357         if (skb_is_tcp_pure_ack(skb))
2358                 return;
2359
2360         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2361                 return;
2362
2363         skb_orphan(skb);
2364 }
2365 EXPORT_SYMBOL(skb_orphan_partial);
2366
2367 /*
2368  * Read buffer destructor automatically called from kfree_skb.
2369  */
2370 void sock_rfree(struct sk_buff *skb)
2371 {
2372         struct sock *sk = skb->sk;
2373         unsigned int len = skb->truesize;
2374
2375         atomic_sub(len, &sk->sk_rmem_alloc);
2376         sk_mem_uncharge(sk, len);
2377 }
2378 EXPORT_SYMBOL(sock_rfree);
2379
2380 /*
2381  * Buffer destructor for skbs that are not used directly in read or write
2382  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2383  */
2384 void sock_efree(struct sk_buff *skb)
2385 {
2386         sock_put(skb->sk);
2387 }
2388 EXPORT_SYMBOL(sock_efree);
2389
2390 /* Buffer destructor for prefetch/receive path where reference count may
2391  * not be held, e.g. for listen sockets.
2392  */
2393 #ifdef CONFIG_INET
2394 void sock_pfree(struct sk_buff *skb)
2395 {
2396         if (sk_is_refcounted(skb->sk))
2397                 sock_gen_put(skb->sk);
2398 }
2399 EXPORT_SYMBOL(sock_pfree);
2400 #endif /* CONFIG_INET */
2401
2402 kuid_t sock_i_uid(struct sock *sk)
2403 {
2404         kuid_t uid;
2405
2406         read_lock_bh(&sk->sk_callback_lock);
2407         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2408         read_unlock_bh(&sk->sk_callback_lock);
2409         return uid;
2410 }
2411 EXPORT_SYMBOL(sock_i_uid);
2412
2413 unsigned long sock_i_ino(struct sock *sk)
2414 {
2415         unsigned long ino;
2416
2417         read_lock_bh(&sk->sk_callback_lock);
2418         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2419         read_unlock_bh(&sk->sk_callback_lock);
2420         return ino;
2421 }
2422 EXPORT_SYMBOL(sock_i_ino);
2423
2424 /*
2425  * Allocate a skb from the socket's send buffer.
2426  */
2427 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2428                              gfp_t priority)
2429 {
2430         if (force ||
2431             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2432                 struct sk_buff *skb = alloc_skb(size, priority);
2433
2434                 if (skb) {
2435                         skb_set_owner_w(skb, sk);
2436                         return skb;
2437                 }
2438         }
2439         return NULL;
2440 }
2441 EXPORT_SYMBOL(sock_wmalloc);
2442
2443 static void sock_ofree(struct sk_buff *skb)
2444 {
2445         struct sock *sk = skb->sk;
2446
2447         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2448 }
2449
2450 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2451                              gfp_t priority)
2452 {
2453         struct sk_buff *skb;
2454
2455         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2456         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2457             sysctl_optmem_max)
2458                 return NULL;
2459
2460         skb = alloc_skb(size, priority);
2461         if (!skb)
2462                 return NULL;
2463
2464         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2465         skb->sk = sk;
2466         skb->destructor = sock_ofree;
2467         return skb;
2468 }
2469
2470 /*
2471  * Allocate a memory block from the socket's option memory buffer.
2472  */
2473 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2474 {
2475         if ((unsigned int)size <= sysctl_optmem_max &&
2476             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2477                 void *mem;
2478                 /* First do the add, to avoid the race if kmalloc
2479                  * might sleep.
2480                  */
2481                 atomic_add(size, &sk->sk_omem_alloc);
2482                 mem = kmalloc(size, priority);
2483                 if (mem)
2484                         return mem;
2485                 atomic_sub(size, &sk->sk_omem_alloc);
2486         }
2487         return NULL;
2488 }
2489 EXPORT_SYMBOL(sock_kmalloc);
2490
2491 /* Free an option memory block. Note, we actually want the inline
2492  * here as this allows gcc to detect the nullify and fold away the
2493  * condition entirely.
2494  */
2495 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2496                                   const bool nullify)
2497 {
2498         if (WARN_ON_ONCE(!mem))
2499                 return;
2500         if (nullify)
2501                 kfree_sensitive(mem);
2502         else
2503                 kfree(mem);
2504         atomic_sub(size, &sk->sk_omem_alloc);
2505 }
2506
2507 void sock_kfree_s(struct sock *sk, void *mem, int size)
2508 {
2509         __sock_kfree_s(sk, mem, size, false);
2510 }
2511 EXPORT_SYMBOL(sock_kfree_s);
2512
2513 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2514 {
2515         __sock_kfree_s(sk, mem, size, true);
2516 }
2517 EXPORT_SYMBOL(sock_kzfree_s);
2518
2519 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2520    I think, these locks should be removed for datagram sockets.
2521  */
2522 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2523 {
2524         DEFINE_WAIT(wait);
2525
2526         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2527         for (;;) {
2528                 if (!timeo)
2529                         break;
2530                 if (signal_pending(current))
2531                         break;
2532                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2533                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2534                 if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
2535                         break;
2536                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2537                         break;
2538                 if (sk->sk_err)
2539                         break;
2540                 timeo = schedule_timeout(timeo);
2541         }
2542         finish_wait(sk_sleep(sk), &wait);
2543         return timeo;
2544 }
2545
2546
2547 /*
2548  *      Generic send/receive buffer handlers
2549  */
2550
2551 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2552                                      unsigned long data_len, int noblock,
2553                                      int *errcode, int max_page_order)
2554 {
2555         struct sk_buff *skb;
2556         long timeo;
2557         int err;
2558
2559         timeo = sock_sndtimeo(sk, noblock);
2560         for (;;) {
2561                 err = sock_error(sk);
2562                 if (err != 0)
2563                         goto failure;
2564
2565                 err = -EPIPE;
2566                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2567                         goto failure;
2568
2569                 if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
2570                         break;
2571
2572                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2573                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2574                 err = -EAGAIN;
2575                 if (!timeo)
2576                         goto failure;
2577                 if (signal_pending(current))
2578                         goto interrupted;
2579                 timeo = sock_wait_for_wmem(sk, timeo);
2580         }
2581         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2582                                    errcode, sk->sk_allocation);
2583         if (skb)
2584                 skb_set_owner_w(skb, sk);
2585         return skb;
2586
2587 interrupted:
2588         err = sock_intr_errno(timeo);
2589 failure:
2590         *errcode = err;
2591         return NULL;
2592 }
2593 EXPORT_SYMBOL(sock_alloc_send_pskb);
2594
2595 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
2596                                     int noblock, int *errcode)
2597 {
2598         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode, 0);
2599 }
2600 EXPORT_SYMBOL(sock_alloc_send_skb);
2601
2602 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2603                      struct sockcm_cookie *sockc)
2604 {
2605         u32 tsflags;
2606
2607         switch (cmsg->cmsg_type) {
2608         case SO_MARK:
2609                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2610                         return -EPERM;
2611                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2612                         return -EINVAL;
2613                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2614                 break;
2615         case SO_TIMESTAMPING_OLD:
2616                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2617                         return -EINVAL;
2618
2619                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2620                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2621                         return -EINVAL;
2622
2623                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2624                 sockc->tsflags |= tsflags;
2625                 break;
2626         case SCM_TXTIME:
2627                 if (!sock_flag(sk, SOCK_TXTIME))
2628                         return -EINVAL;
2629                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
2630                         return -EINVAL;
2631                 sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
2632                 break;
2633         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2634         case SCM_RIGHTS:
2635         case SCM_CREDENTIALS:
2636                 break;
2637         default:
2638                 return -EINVAL;
2639         }
2640         return 0;
2641 }
2642 EXPORT_SYMBOL(__sock_cmsg_send);
2643
2644 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2645                    struct sockcm_cookie *sockc)
2646 {
2647         struct cmsghdr *cmsg;
2648         int ret;
2649
2650         for_each_cmsghdr(cmsg, msg) {
2651                 if (!CMSG_OK(msg, cmsg))
2652                         return -EINVAL;
2653                 if (cmsg->cmsg_level != SOL_SOCKET)
2654                         continue;
2655                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2656                 if (ret)
2657                         return ret;
2658         }
2659         return 0;
2660 }
2661 EXPORT_SYMBOL(sock_cmsg_send);
2662
2663 static void sk_enter_memory_pressure(struct sock *sk)
2664 {
2665         if (!sk->sk_prot->enter_memory_pressure)
2666                 return;
2667
2668         sk->sk_prot->enter_memory_pressure(sk);
2669 }
2670
2671 static void sk_leave_memory_pressure(struct sock *sk)
2672 {
2673         if (sk->sk_prot->leave_memory_pressure) {
2674                 sk->sk_prot->leave_memory_pressure(sk);
2675         } else {
2676                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2677
2678                 if (memory_pressure && READ_ONCE(*memory_pressure))
2679                         WRITE_ONCE(*memory_pressure, 0);
2680         }
2681 }
2682
2683 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2684
2685 /**
2686  * skb_page_frag_refill - check that a page_frag contains enough room
2687  * @sz: minimum size of the fragment we want to get
2688  * @pfrag: pointer to page_frag
2689  * @gfp: priority for memory allocation
2690  *
2691  * Note: While this allocator tries to use high order pages, there is
2692  * no guarantee that allocations succeed. Therefore, @sz MUST be
2693  * less or equal than PAGE_SIZE.
2694  */
2695 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2696 {
2697         if (pfrag->page) {
2698                 if (page_ref_count(pfrag->page) == 1) {
2699                         pfrag->offset = 0;
2700                         return true;
2701                 }
2702                 if (pfrag->offset + sz <= pfrag->size)
2703                         return true;
2704                 put_page(pfrag->page);
2705         }
2706
2707         pfrag->offset = 0;
2708         if (SKB_FRAG_PAGE_ORDER &&
2709             !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
2710                 /* Avoid direct reclaim but allow kswapd to wake */
2711                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2712                                           __GFP_COMP | __GFP_NOWARN |
2713                                           __GFP_NORETRY,
2714                                           SKB_FRAG_PAGE_ORDER);
2715                 if (likely(pfrag->page)) {
2716                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2717                         return true;
2718                 }
2719         }
2720         pfrag->page = alloc_page(gfp);
2721         if (likely(pfrag->page)) {
2722                 pfrag->size = PAGE_SIZE;
2723                 return true;
2724         }
2725         return false;
2726 }
2727 EXPORT_SYMBOL(skb_page_frag_refill);
2728
2729 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2730 {
2731         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2732                 return true;
2733
2734         sk_enter_memory_pressure(sk);
2735         sk_stream_moderate_sndbuf(sk);
2736         return false;
2737 }
2738 EXPORT_SYMBOL(sk_page_frag_refill);
2739
2740 void __lock_sock(struct sock *sk)
2741         __releases(&sk->sk_lock.slock)
2742         __acquires(&sk->sk_lock.slock)
2743 {
2744         DEFINE_WAIT(wait);
2745
2746         for (;;) {
2747                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2748                                         TASK_UNINTERRUPTIBLE);
2749                 spin_unlock_bh(&sk->sk_lock.slock);
2750                 schedule();
2751                 spin_lock_bh(&sk->sk_lock.slock);
2752                 if (!sock_owned_by_user(sk))
2753                         break;
2754         }
2755         finish_wait(&sk->sk_lock.wq, &wait);
2756 }
2757
2758 void __release_sock(struct sock *sk)
2759         __releases(&sk->sk_lock.slock)
2760         __acquires(&sk->sk_lock.slock)
2761 {
2762         struct sk_buff *skb, *next;
2763
2764         while ((skb = sk->sk_backlog.head) != NULL) {
2765                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2766
2767                 spin_unlock_bh(&sk->sk_lock.slock);
2768
2769                 do {
2770                         next = skb->next;
2771                         prefetch(next);
2772                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2773                         skb_mark_not_on_list(skb);
2774                         sk_backlog_rcv(sk, skb);
2775
2776                         cond_resched();
2777
2778                         skb = next;
2779                 } while (skb != NULL);
2780
2781                 spin_lock_bh(&sk->sk_lock.slock);
2782         }
2783
2784         /*
2785          * Doing the zeroing here guarantee we can not loop forever
2786          * while a wild producer attempts to flood us.
2787          */
2788         sk->sk_backlog.len = 0;
2789 }
2790
2791 void __sk_flush_backlog(struct sock *sk)
2792 {
2793         spin_lock_bh(&sk->sk_lock.slock);
2794         __release_sock(sk);
2795         spin_unlock_bh(&sk->sk_lock.slock);
2796 }
2797
2798 /**
2799  * sk_wait_data - wait for data to arrive at sk_receive_queue
2800  * @sk:    sock to wait on
2801  * @timeo: for how long
2802  * @skb:   last skb seen on sk_receive_queue
2803  *
2804  * Now socket state including sk->sk_err is changed only under lock,
2805  * hence we may omit checks after joining wait queue.
2806  * We check receive queue before schedule() only as optimization;
2807  * it is very likely that release_sock() added new data.
2808  */
2809 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2810 {
2811         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2812         int rc;
2813
2814         add_wait_queue(sk_sleep(sk), &wait);
2815         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2816         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2817         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2818         remove_wait_queue(sk_sleep(sk), &wait);
2819         return rc;
2820 }
2821 EXPORT_SYMBOL(sk_wait_data);
2822
2823 /**
2824  *      __sk_mem_raise_allocated - increase memory_allocated
2825  *      @sk: socket
2826  *      @size: memory size to allocate
2827  *      @amt: pages to allocate
2828  *      @kind: allocation type
2829  *
2830  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
2831  */
2832 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2833 {
2834         struct proto *prot = sk->sk_prot;
2835         long allocated = sk_memory_allocated_add(sk, amt);
2836         bool memcg_charge = mem_cgroup_sockets_enabled && sk->sk_memcg;
2837         bool charged = true;
2838
2839         if (memcg_charge &&
2840             !(charged = mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2841                                                 gfp_memcg_charge())))
2842                 goto suppress_allocation;
2843
2844         /* Under limit. */
2845         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2846                 sk_leave_memory_pressure(sk);
2847                 return 1;
2848         }
2849
2850         /* Under pressure. */
2851         if (allocated > sk_prot_mem_limits(sk, 1))
2852                 sk_enter_memory_pressure(sk);
2853
2854         /* Over hard limit. */
2855         if (allocated > sk_prot_mem_limits(sk, 2))
2856                 goto suppress_allocation;
2857
2858         /* guarantee minimum buffer size under pressure */
2859         if (kind == SK_MEM_RECV) {
2860                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2861                         return 1;
2862
2863         } else { /* SK_MEM_SEND */
2864                 int wmem0 = sk_get_wmem0(sk, prot);
2865
2866                 if (sk->sk_type == SOCK_STREAM) {
2867                         if (sk->sk_wmem_queued < wmem0)
2868                                 return 1;
2869                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2870                                 return 1;
2871                 }
2872         }
2873
2874         if (sk_has_memory_pressure(sk)) {
2875                 u64 alloc;
2876
2877                 if (!sk_under_memory_pressure(sk))
2878                         return 1;
2879                 alloc = sk_sockets_allocated_read_positive(sk);
2880                 if (sk_prot_mem_limits(sk, 2) > alloc *
2881                     sk_mem_pages(sk->sk_wmem_queued +
2882                                  atomic_read(&sk->sk_rmem_alloc) +
2883                                  sk->sk_forward_alloc))
2884                         return 1;
2885         }
2886
2887 suppress_allocation:
2888
2889         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2890                 sk_stream_moderate_sndbuf(sk);
2891
2892                 /* Fail only if socket is _under_ its sndbuf.
2893                  * In this case we cannot block, so that we have to fail.
2894                  */
2895                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
2896                         /* Force charge with __GFP_NOFAIL */
2897                         if (memcg_charge && !charged) {
2898                                 mem_cgroup_charge_skmem(sk->sk_memcg, amt,
2899                                         gfp_memcg_charge() | __GFP_NOFAIL);
2900                         }
2901                         return 1;
2902                 }
2903         }
2904
2905         if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
2906                 trace_sock_exceed_buf_limit(sk, prot, allocated, kind);
2907
2908         sk_memory_allocated_sub(sk, amt);
2909
2910         if (memcg_charge && charged)
2911                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2912
2913         return 0;
2914 }
2915 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2916
2917 /**
2918  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2919  *      @sk: socket
2920  *      @size: memory size to allocate
2921  *      @kind: allocation type
2922  *
2923  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2924  *      rmem allocation. This function assumes that protocols which have
2925  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2926  */
2927 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2928 {
2929         int ret, amt = sk_mem_pages(size);
2930
2931         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2932         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2933         if (!ret)
2934                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2935         return ret;
2936 }
2937 EXPORT_SYMBOL(__sk_mem_schedule);
2938
2939 /**
2940  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2941  *      @sk: socket
2942  *      @amount: number of quanta
2943  *
2944  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2945  */
2946 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2947 {
2948         sk_memory_allocated_sub(sk, amount);
2949
2950         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2951                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2952
2953         if (sk_under_memory_pressure(sk) &&
2954             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2955                 sk_leave_memory_pressure(sk);
2956 }
2957 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2958
2959 /**
2960  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2961  *      @sk: socket
2962  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2963  */
2964 void __sk_mem_reclaim(struct sock *sk, int amount)
2965 {
2966         amount >>= SK_MEM_QUANTUM_SHIFT;
2967         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2968         __sk_mem_reduce_allocated(sk, amount);
2969 }
2970 EXPORT_SYMBOL(__sk_mem_reclaim);
2971
2972 int sk_set_peek_off(struct sock *sk, int val)
2973 {
2974         sk->sk_peek_off = val;
2975         return 0;
2976 }
2977 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2978
2979 /*
2980  * Set of default routines for initialising struct proto_ops when
2981  * the protocol does not support a particular function. In certain
2982  * cases where it makes no sense for a protocol to have a "do nothing"
2983  * function, some default processing is provided.
2984  */
2985
2986 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
2987 {
2988         return -EOPNOTSUPP;
2989 }
2990 EXPORT_SYMBOL(sock_no_bind);
2991
2992 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
2993                     int len, int flags)
2994 {
2995         return -EOPNOTSUPP;
2996 }
2997 EXPORT_SYMBOL(sock_no_connect);
2998
2999 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
3000 {
3001         return -EOPNOTSUPP;
3002 }
3003 EXPORT_SYMBOL(sock_no_socketpair);
3004
3005 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
3006                    bool kern)
3007 {
3008         return -EOPNOTSUPP;
3009 }
3010 EXPORT_SYMBOL(sock_no_accept);
3011
3012 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
3013                     int peer)
3014 {
3015         return -EOPNOTSUPP;
3016 }
3017 EXPORT_SYMBOL(sock_no_getname);
3018
3019 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
3020 {
3021         return -EOPNOTSUPP;
3022 }
3023 EXPORT_SYMBOL(sock_no_ioctl);
3024
3025 int sock_no_listen(struct socket *sock, int backlog)
3026 {
3027         return -EOPNOTSUPP;
3028 }
3029 EXPORT_SYMBOL(sock_no_listen);
3030
3031 int sock_no_shutdown(struct socket *sock, int how)
3032 {
3033         return -EOPNOTSUPP;
3034 }
3035 EXPORT_SYMBOL(sock_no_shutdown);
3036
3037 int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
3038 {
3039         return -EOPNOTSUPP;
3040 }
3041 EXPORT_SYMBOL(sock_no_sendmsg);
3042
3043 int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
3044 {
3045         return -EOPNOTSUPP;
3046 }
3047 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3048
3049 int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
3050                     int flags)
3051 {
3052         return -EOPNOTSUPP;
3053 }
3054 EXPORT_SYMBOL(sock_no_recvmsg);
3055
3056 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
3057 {
3058         /* Mirror missing mmap method error code */
3059         return -ENODEV;
3060 }
3061 EXPORT_SYMBOL(sock_no_mmap);
3062
3063 /*
3064  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3065  * various sock-based usage counts.
3066  */
3067 void __receive_sock(struct file *file)
3068 {
3069         struct socket *sock;
3070
3071         sock = sock_from_file(file);
3072         if (sock) {
3073                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3074                 sock_update_classid(&sock->sk->sk_cgrp_data);
3075         }
3076 }
3077
3078 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
3079 {
3080         ssize_t res;
3081         struct msghdr msg = {.msg_flags = flags};
3082         struct kvec iov;
3083         char *kaddr = kmap(page);
3084         iov.iov_base = kaddr + offset;
3085         iov.iov_len = size;
3086         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
3087         kunmap(page);
3088         return res;
3089 }
3090 EXPORT_SYMBOL(sock_no_sendpage);
3091
3092 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
3093                                 int offset, size_t size, int flags)
3094 {
3095         ssize_t res;
3096         struct msghdr msg = {.msg_flags = flags};
3097         struct kvec iov;
3098         char *kaddr = kmap(page);
3099
3100         iov.iov_base = kaddr + offset;
3101         iov.iov_len = size;
3102         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
3103         kunmap(page);
3104         return res;
3105 }
3106 EXPORT_SYMBOL(sock_no_sendpage_locked);
3107
3108 /*
3109  *      Default Socket Callbacks
3110  */
3111
3112 static void sock_def_wakeup(struct sock *sk)
3113 {
3114         struct socket_wq *wq;
3115
3116         rcu_read_lock();
3117         wq = rcu_dereference(sk->sk_wq);
3118         if (skwq_has_sleeper(wq))
3119                 wake_up_interruptible_all(&wq->wait);
3120         rcu_read_unlock();
3121 }
3122
3123 static void sock_def_error_report(struct sock *sk)
3124 {
3125         struct socket_wq *wq;
3126
3127         rcu_read_lock();
3128         wq = rcu_dereference(sk->sk_wq);
3129         if (skwq_has_sleeper(wq))
3130                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3131         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3132         rcu_read_unlock();
3133 }
3134
3135 void sock_def_readable(struct sock *sk)
3136 {
3137         struct socket_wq *wq;
3138
3139         rcu_read_lock();
3140         wq = rcu_dereference(sk->sk_wq);
3141         if (skwq_has_sleeper(wq))
3142                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3143                                                 EPOLLRDNORM | EPOLLRDBAND);
3144         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3145         rcu_read_unlock();
3146 }
3147
3148 static void sock_def_write_space(struct sock *sk)
3149 {
3150         struct socket_wq *wq;
3151
3152         rcu_read_lock();
3153
3154         /* Do not wake up a writer until he can make "significant"
3155          * progress.  --DaveM
3156          */
3157         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= READ_ONCE(sk->sk_sndbuf)) {
3158                 wq = rcu_dereference(sk->sk_wq);
3159                 if (skwq_has_sleeper(wq))
3160                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3161                                                 EPOLLWRNORM | EPOLLWRBAND);
3162
3163                 /* Should agree with poll, otherwise some programs break */
3164                 if (sock_writeable(sk))
3165                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3166         }
3167
3168         rcu_read_unlock();
3169 }
3170
3171 static void sock_def_destruct(struct sock *sk)
3172 {
3173 }
3174
3175 void sk_send_sigurg(struct sock *sk)
3176 {
3177         if (sk->sk_socket && sk->sk_socket->file)
3178                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3179                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3180 }
3181 EXPORT_SYMBOL(sk_send_sigurg);
3182
3183 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
3184                     unsigned long expires)
3185 {
3186         if (!mod_timer(timer, expires))
3187                 sock_hold(sk);
3188 }
3189 EXPORT_SYMBOL(sk_reset_timer);
3190
3191 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
3192 {
3193         if (del_timer(timer))
3194                 __sock_put(sk);
3195 }
3196 EXPORT_SYMBOL(sk_stop_timer);
3197
3198 void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
3199 {
3200         if (del_timer_sync(timer))
3201                 __sock_put(sk);
3202 }
3203 EXPORT_SYMBOL(sk_stop_timer_sync);
3204
3205 void sock_init_data(struct socket *sock, struct sock *sk)
3206 {
3207         sk_init_common(sk);
3208         sk->sk_send_head        =       NULL;
3209
3210         timer_setup(&sk->sk_timer, NULL, 0);
3211
3212         sk->sk_allocation       =       GFP_KERNEL;
3213         sk->sk_rcvbuf           =       sysctl_rmem_default;
3214         sk->sk_sndbuf           =       sysctl_wmem_default;
3215         sk->sk_state            =       TCP_CLOSE;
3216         sk_set_socket(sk, sock);
3217
3218         sock_set_flag(sk, SOCK_ZAPPED);
3219
3220         if (sock) {
3221                 sk->sk_type     =       sock->type;
3222                 RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
3223                 sock->sk        =       sk;
3224                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
3225         } else {
3226                 RCU_INIT_POINTER(sk->sk_wq, NULL);
3227                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
3228         }
3229
3230         rwlock_init(&sk->sk_callback_lock);
3231         if (sk->sk_kern_sock)
3232                 lockdep_set_class_and_name(
3233                         &sk->sk_callback_lock,
3234                         af_kern_callback_keys + sk->sk_family,
3235                         af_family_kern_clock_key_strings[sk->sk_family]);
3236         else
3237                 lockdep_set_class_and_name(
3238                         &sk->sk_callback_lock,
3239                         af_callback_keys + sk->sk_family,
3240                         af_family_clock_key_strings[sk->sk_family]);
3241
3242         sk->sk_state_change     =       sock_def_wakeup;
3243         sk->sk_data_ready       =       sock_def_readable;
3244         sk->sk_write_space      =       sock_def_write_space;
3245         sk->sk_error_report     =       sock_def_error_report;
3246         sk->sk_destruct         =       sock_def_destruct;
3247
3248         sk->sk_frag.page        =       NULL;
3249         sk->sk_frag.offset      =       0;
3250         sk->sk_peek_off         =       -1;
3251
3252         sk->sk_peer_pid         =       NULL;
3253         sk->sk_peer_cred        =       NULL;
3254         spin_lock_init(&sk->sk_peer_lock);
3255
3256         sk->sk_write_pending    =       0;
3257         sk->sk_rcvlowat         =       1;
3258         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
3259         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
3260
3261         sk->sk_stamp = SK_DEFAULT_STAMP;
3262 #if BITS_PER_LONG==32
3263         seqlock_init(&sk->sk_stamp_seq);
3264 #endif
3265         atomic_set(&sk->sk_zckey, 0);
3266
3267 #ifdef CONFIG_NET_RX_BUSY_POLL
3268         sk->sk_napi_id          =       0;
3269         sk->sk_ll_usec          =       sysctl_net_busy_read;
3270 #endif
3271
3272         sk->sk_max_pacing_rate = ~0UL;
3273         sk->sk_pacing_rate = ~0UL;
3274         WRITE_ONCE(sk->sk_pacing_shift, 10);
3275         sk->sk_incoming_cpu = -1;
3276
3277         sk_rx_queue_clear(sk);
3278         /*
3279          * Before updating sk_refcnt, we must commit prior changes to memory
3280          * (Documentation/RCU/rculist_nulls.rst for details)
3281          */
3282         smp_wmb();
3283         refcount_set(&sk->sk_refcnt, 1);
3284         atomic_set(&sk->sk_drops, 0);
3285 }
3286 EXPORT_SYMBOL(sock_init_data);
3287
3288 void lock_sock_nested(struct sock *sk, int subclass)
3289 {
3290         /* The sk_lock has mutex_lock() semantics here. */
3291         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
3292
3293         might_sleep();
3294         spin_lock_bh(&sk->sk_lock.slock);
3295         if (sock_owned_by_user_nocheck(sk))
3296                 __lock_sock(sk);
3297         sk->sk_lock.owned = 1;
3298         spin_unlock_bh(&sk->sk_lock.slock);
3299 }
3300 EXPORT_SYMBOL(lock_sock_nested);
3301
3302 void release_sock(struct sock *sk)
3303 {
3304         spin_lock_bh(&sk->sk_lock.slock);
3305         if (sk->sk_backlog.tail)
3306                 __release_sock(sk);
3307
3308         /* Warning : release_cb() might need to release sk ownership,
3309          * ie call sock_release_ownership(sk) before us.
3310          */
3311         if (sk->sk_prot->release_cb)
3312                 sk->sk_prot->release_cb(sk);
3313
3314         sock_release_ownership(sk);
3315         if (waitqueue_active(&sk->sk_lock.wq))
3316                 wake_up(&sk->sk_lock.wq);
3317         spin_unlock_bh(&sk->sk_lock.slock);
3318 }
3319 EXPORT_SYMBOL(release_sock);
3320
3321 bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
3322 {
3323         might_sleep();
3324         spin_lock_bh(&sk->sk_lock.slock);
3325
3326         if (!sock_owned_by_user_nocheck(sk)) {
3327                 /*
3328                  * Fast path return with bottom halves disabled and
3329                  * sock::sk_lock.slock held.
3330                  *
3331                  * The 'mutex' is not contended and holding
3332                  * sock::sk_lock.slock prevents all other lockers to
3333                  * proceed so the corresponding unlock_sock_fast() can
3334                  * avoid the slow path of release_sock() completely and
3335                  * just release slock.
3336                  *
3337                  * From a semantical POV this is equivalent to 'acquiring'
3338                  * the 'mutex', hence the corresponding lockdep
3339                  * mutex_release() has to happen in the fast path of
3340                  * unlock_sock_fast().
3341                  */
3342                 return false;
3343         }
3344
3345         __lock_sock(sk);
3346         sk->sk_lock.owned = 1;
3347         __acquire(&sk->sk_lock.slock);
3348         spin_unlock_bh(&sk->sk_lock.slock);
3349         return true;
3350 }
3351 EXPORT_SYMBOL(__lock_sock_fast);
3352
3353 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3354                    bool timeval, bool time32)
3355 {
3356         struct sock *sk = sock->sk;
3357         struct timespec64 ts;
3358
3359         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3360         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3361         if (ts.tv_sec == -1)
3362                 return -ENOENT;
3363         if (ts.tv_sec == 0) {
3364                 ktime_t kt = ktime_get_real();
3365                 sock_write_timestamp(sk, kt);
3366                 ts = ktime_to_timespec64(kt);
3367         }
3368
3369         if (timeval)
3370                 ts.tv_nsec /= 1000;
3371
3372 #ifdef CONFIG_COMPAT_32BIT_TIME
3373         if (time32)
3374                 return put_old_timespec32(&ts, userstamp);
3375 #endif
3376 #ifdef CONFIG_SPARC64
3377         /* beware of padding in sparc64 timeval */
3378         if (timeval && !in_compat_syscall()) {
3379                 struct __kernel_old_timeval __user tv = {
3380                         .tv_sec = ts.tv_sec,
3381                         .tv_usec = ts.tv_nsec,
3382                 };
3383                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3384                         return -EFAULT;
3385                 return 0;
3386         }
3387 #endif
3388         return put_timespec64(&ts, userstamp);
3389 }
3390 EXPORT_SYMBOL(sock_gettstamp);
3391
3392 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3393 {
3394         if (!sock_flag(sk, flag)) {
3395                 unsigned long previous_flags = sk->sk_flags;
3396
3397                 sock_set_flag(sk, flag);
3398                 /*
3399                  * we just set one of the two flags which require net
3400                  * time stamping, but time stamping might have been on
3401                  * already because of the other one
3402                  */
3403                 if (sock_needs_netstamp(sk) &&
3404                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3405                         net_enable_timestamp();
3406         }
3407 }
3408
3409 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3410                        int level, int type)
3411 {
3412         struct sock_exterr_skb *serr;
3413         struct sk_buff *skb;
3414         int copied, err;
3415
3416         err = -EAGAIN;
3417         skb = sock_dequeue_err_skb(sk);
3418         if (skb == NULL)
3419                 goto out;
3420
3421         copied = skb->len;
3422         if (copied > len) {
3423                 msg->msg_flags |= MSG_TRUNC;
3424                 copied = len;
3425         }
3426         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3427         if (err)
3428                 goto out_free_skb;
3429
3430         sock_recv_timestamp(msg, sk, skb);
3431
3432         serr = SKB_EXT_ERR(skb);
3433         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3434
3435         msg->msg_flags |= MSG_ERRQUEUE;
3436         err = copied;
3437
3438 out_free_skb:
3439         kfree_skb(skb);
3440 out:
3441         return err;
3442 }
3443 EXPORT_SYMBOL(sock_recv_errqueue);
3444
3445 /*
3446  *      Get a socket option on an socket.
3447  *
3448  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3449  *      asynchronous errors should be reported by getsockopt. We assume
3450  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3451  */
3452 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3453                            char __user *optval, int __user *optlen)
3454 {
3455         struct sock *sk = sock->sk;
3456
3457         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3458 }
3459 EXPORT_SYMBOL(sock_common_getsockopt);
3460
3461 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3462                         int flags)
3463 {
3464         struct sock *sk = sock->sk;
3465         int addr_len = 0;
3466         int err;
3467
3468         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3469                                    flags & ~MSG_DONTWAIT, &addr_len);
3470         if (err >= 0)
3471                 msg->msg_namelen = addr_len;
3472         return err;
3473 }
3474 EXPORT_SYMBOL(sock_common_recvmsg);
3475
3476 /*
3477  *      Set socket options on an inet socket.
3478  */
3479 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3480                            sockptr_t optval, unsigned int optlen)
3481 {
3482         struct sock *sk = sock->sk;
3483
3484         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3485 }
3486 EXPORT_SYMBOL(sock_common_setsockopt);
3487
3488 void sk_common_release(struct sock *sk)
3489 {
3490         if (sk->sk_prot->destroy)
3491                 sk->sk_prot->destroy(sk);
3492
3493         /*
3494          * Observation: when sk_common_release is called, processes have
3495          * no access to socket. But net still has.
3496          * Step one, detach it from networking:
3497          *
3498          * A. Remove from hash tables.
3499          */
3500
3501         sk->sk_prot->unhash(sk);
3502
3503         /*
3504          * In this point socket cannot receive new packets, but it is possible
3505          * that some packets are in flight because some CPU runs receiver and
3506          * did hash table lookup before we unhashed socket. They will achieve
3507          * receive queue and will be purged by socket destructor.
3508          *
3509          * Also we still have packets pending on receive queue and probably,
3510          * our own packets waiting in device queues. sock_destroy will drain
3511          * receive queue, but transmitted packets will delay socket destruction
3512          * until the last reference will be released.
3513          */
3514
3515         sock_orphan(sk);
3516
3517         xfrm_sk_free_policy(sk);
3518
3519         sk_refcnt_debug_release(sk);
3520
3521         sock_put(sk);
3522 }
3523 EXPORT_SYMBOL(sk_common_release);
3524
3525 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3526 {
3527         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3528
3529         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3530         mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
3531         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3532         mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
3533         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3534         mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
3535         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3536         mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
3537         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3538 }
3539
3540 #ifdef CONFIG_PROC_FS
3541 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3542
3543 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3544 {
3545         int cpu, idx = prot->inuse_idx;
3546         int res = 0;
3547
3548         for_each_possible_cpu(cpu)
3549                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3550
3551         return res >= 0 ? res : 0;
3552 }
3553 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3554
3555 int sock_inuse_get(struct net *net)
3556 {
3557         int cpu, res = 0;
3558
3559         for_each_possible_cpu(cpu)
3560                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3561
3562         return res;
3563 }
3564
3565 EXPORT_SYMBOL_GPL(sock_inuse_get);
3566
3567 static int __net_init sock_inuse_init_net(struct net *net)
3568 {
3569         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3570         if (net->core.prot_inuse == NULL)
3571                 return -ENOMEM;
3572         return 0;
3573 }
3574
3575 static void __net_exit sock_inuse_exit_net(struct net *net)
3576 {
3577         free_percpu(net->core.prot_inuse);
3578 }
3579
3580 static struct pernet_operations net_inuse_ops = {
3581         .init = sock_inuse_init_net,
3582         .exit = sock_inuse_exit_net,
3583 };
3584
3585 static __init int net_inuse_init(void)
3586 {
3587         if (register_pernet_subsys(&net_inuse_ops))
3588                 panic("Cannot initialize net inuse counters");
3589
3590         return 0;
3591 }
3592
3593 core_initcall(net_inuse_init);
3594
3595 static int assign_proto_idx(struct proto *prot)
3596 {
3597         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3598
3599         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3600                 pr_err("PROTO_INUSE_NR exhausted\n");
3601                 return -ENOSPC;
3602         }
3603
3604         set_bit(prot->inuse_idx, proto_inuse_idx);
3605         return 0;
3606 }
3607
3608 static void release_proto_idx(struct proto *prot)
3609 {
3610         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3611                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3612 }
3613 #else
3614 static inline int assign_proto_idx(struct proto *prot)
3615 {
3616         return 0;
3617 }
3618
3619 static inline void release_proto_idx(struct proto *prot)
3620 {
3621 }
3622
3623 #endif
3624
3625 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3626 {
3627         if (!twsk_prot)
3628                 return;
3629         kfree(twsk_prot->twsk_slab_name);
3630         twsk_prot->twsk_slab_name = NULL;
3631         kmem_cache_destroy(twsk_prot->twsk_slab);
3632         twsk_prot->twsk_slab = NULL;
3633 }
3634
3635 static int tw_prot_init(const struct proto *prot)
3636 {
3637         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3638
3639         if (!twsk_prot)
3640                 return 0;
3641
3642         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3643                                               prot->name);
3644         if (!twsk_prot->twsk_slab_name)
3645                 return -ENOMEM;
3646
3647         twsk_prot->twsk_slab =
3648                 kmem_cache_create(twsk_prot->twsk_slab_name,
3649                                   twsk_prot->twsk_obj_size, 0,
3650                                   SLAB_ACCOUNT | prot->slab_flags,
3651                                   NULL);
3652         if (!twsk_prot->twsk_slab) {
3653                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3654                         prot->name);
3655                 return -ENOMEM;
3656         }
3657
3658         return 0;
3659 }
3660
3661 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3662 {
3663         if (!rsk_prot)
3664                 return;
3665         kfree(rsk_prot->slab_name);
3666         rsk_prot->slab_name = NULL;
3667         kmem_cache_destroy(rsk_prot->slab);
3668         rsk_prot->slab = NULL;
3669 }
3670
3671 static int req_prot_init(const struct proto *prot)
3672 {
3673         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3674
3675         if (!rsk_prot)
3676                 return 0;
3677
3678         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3679                                         prot->name);
3680         if (!rsk_prot->slab_name)
3681                 return -ENOMEM;
3682
3683         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3684                                            rsk_prot->obj_size, 0,
3685                                            SLAB_ACCOUNT | prot->slab_flags,
3686                                            NULL);
3687
3688         if (!rsk_prot->slab) {
3689                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3690                         prot->name);
3691                 return -ENOMEM;
3692         }
3693         return 0;
3694 }
3695
3696 int proto_register(struct proto *prot, int alloc_slab)
3697 {
3698         int ret = -ENOBUFS;
3699
3700         if (alloc_slab) {
3701                 prot->slab = kmem_cache_create_usercopy(prot->name,
3702                                         prot->obj_size, 0,
3703                                         SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
3704                                         prot->slab_flags,
3705                                         prot->useroffset, prot->usersize,
3706                                         NULL);
3707
3708                 if (prot->slab == NULL) {
3709                         pr_crit("%s: Can't create sock SLAB cache!\n",
3710                                 prot->name);
3711                         goto out;
3712                 }
3713
3714                 if (req_prot_init(prot))
3715                         goto out_free_request_sock_slab;
3716
3717                 if (tw_prot_init(prot))
3718                         goto out_free_timewait_sock_slab;
3719         }
3720
3721         mutex_lock(&proto_list_mutex);
3722         ret = assign_proto_idx(prot);
3723         if (ret) {
3724                 mutex_unlock(&proto_list_mutex);
3725                 goto out_free_timewait_sock_slab;
3726         }
3727         list_add(&prot->node, &proto_list);
3728         mutex_unlock(&proto_list_mutex);
3729         return ret;
3730
3731 out_free_timewait_sock_slab:
3732         if (alloc_slab)
3733                 tw_prot_cleanup(prot->twsk_prot);
3734 out_free_request_sock_slab:
3735         if (alloc_slab) {
3736                 req_prot_cleanup(prot->rsk_prot);
3737
3738                 kmem_cache_destroy(prot->slab);
3739                 prot->slab = NULL;
3740         }
3741 out:
3742         return ret;
3743 }
3744 EXPORT_SYMBOL(proto_register);
3745
3746 void proto_unregister(struct proto *prot)
3747 {
3748         mutex_lock(&proto_list_mutex);
3749         release_proto_idx(prot);
3750         list_del(&prot->node);
3751         mutex_unlock(&proto_list_mutex);
3752
3753         kmem_cache_destroy(prot->slab);
3754         prot->slab = NULL;
3755
3756         req_prot_cleanup(prot->rsk_prot);
3757         tw_prot_cleanup(prot->twsk_prot);
3758 }
3759 EXPORT_SYMBOL(proto_unregister);
3760
3761 int sock_load_diag_module(int family, int protocol)
3762 {
3763         if (!protocol) {
3764                 if (!sock_is_registered(family))
3765                         return -ENOENT;
3766
3767                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3768                                       NETLINK_SOCK_DIAG, family);
3769         }
3770
3771 #ifdef CONFIG_INET
3772         if (family == AF_INET &&
3773             protocol != IPPROTO_RAW &&
3774             protocol < MAX_INET_PROTOS &&
3775             !rcu_access_pointer(inet_protos[protocol]))
3776                 return -ENOENT;
3777 #endif
3778
3779         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3780                               NETLINK_SOCK_DIAG, family, protocol);
3781 }
3782 EXPORT_SYMBOL(sock_load_diag_module);
3783
3784 #ifdef CONFIG_PROC_FS
3785 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3786         __acquires(proto_list_mutex)
3787 {
3788         mutex_lock(&proto_list_mutex);
3789         return seq_list_start_head(&proto_list, *pos);
3790 }
3791
3792 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3793 {
3794         return seq_list_next(v, &proto_list, pos);
3795 }
3796
3797 static void proto_seq_stop(struct seq_file *seq, void *v)
3798         __releases(proto_list_mutex)
3799 {
3800         mutex_unlock(&proto_list_mutex);
3801 }
3802
3803 static char proto_method_implemented(const void *method)
3804 {
3805         return method == NULL ? 'n' : 'y';
3806 }
3807 static long sock_prot_memory_allocated(struct proto *proto)
3808 {
3809         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3810 }
3811
3812 static const char *sock_prot_memory_pressure(struct proto *proto)
3813 {
3814         return proto->memory_pressure != NULL ?
3815         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3816 }
3817
3818 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3819 {
3820
3821         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3822                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3823                    proto->name,
3824                    proto->obj_size,
3825                    sock_prot_inuse_get(seq_file_net(seq), proto),
3826                    sock_prot_memory_allocated(proto),
3827                    sock_prot_memory_pressure(proto),
3828                    proto->max_header,
3829                    proto->slab == NULL ? "no" : "yes",
3830                    module_name(proto->owner),
3831                    proto_method_implemented(proto->close),
3832                    proto_method_implemented(proto->connect),
3833                    proto_method_implemented(proto->disconnect),
3834                    proto_method_implemented(proto->accept),
3835                    proto_method_implemented(proto->ioctl),
3836                    proto_method_implemented(proto->init),
3837                    proto_method_implemented(proto->destroy),
3838                    proto_method_implemented(proto->shutdown),
3839                    proto_method_implemented(proto->setsockopt),
3840                    proto_method_implemented(proto->getsockopt),
3841                    proto_method_implemented(proto->sendmsg),
3842                    proto_method_implemented(proto->recvmsg),
3843                    proto_method_implemented(proto->sendpage),
3844                    proto_method_implemented(proto->bind),
3845                    proto_method_implemented(proto->backlog_rcv),
3846                    proto_method_implemented(proto->hash),
3847                    proto_method_implemented(proto->unhash),
3848                    proto_method_implemented(proto->get_port),
3849                    proto_method_implemented(proto->enter_memory_pressure));
3850 }
3851
3852 static int proto_seq_show(struct seq_file *seq, void *v)
3853 {
3854         if (v == &proto_list)
3855                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3856                            "protocol",
3857                            "size",
3858                            "sockets",
3859                            "memory",
3860                            "press",
3861                            "maxhdr",
3862                            "slab",
3863                            "module",
3864                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3865         else
3866                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3867         return 0;
3868 }
3869
3870 static const struct seq_operations proto_seq_ops = {
3871         .start  = proto_seq_start,
3872         .next   = proto_seq_next,
3873         .stop   = proto_seq_stop,
3874         .show   = proto_seq_show,
3875 };
3876
3877 static __net_init int proto_init_net(struct net *net)
3878 {
3879         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
3880                         sizeof(struct seq_net_private)))
3881                 return -ENOMEM;
3882
3883         return 0;
3884 }
3885
3886 static __net_exit void proto_exit_net(struct net *net)
3887 {
3888         remove_proc_entry("protocols", net->proc_net);
3889 }
3890
3891
3892 static __net_initdata struct pernet_operations proto_net_ops = {
3893         .init = proto_init_net,
3894         .exit = proto_exit_net,
3895 };
3896
3897 static int __init proto_init(void)
3898 {
3899         return register_pernet_subsys(&proto_net_ops);
3900 }
3901
3902 subsys_initcall(proto_init);
3903
3904 #endif /* PROC_FS */
3905
3906 #ifdef CONFIG_NET_RX_BUSY_POLL
3907 bool sk_busy_loop_end(void *p, unsigned long start_time)
3908 {
3909         struct sock *sk = p;
3910
3911         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
3912                sk_busy_loop_timeout(sk, start_time);
3913 }
3914 EXPORT_SYMBOL(sk_busy_loop_end);
3915 #endif /* CONFIG_NET_RX_BUSY_POLL */
3916
3917 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
3918 {
3919         if (!sk->sk_prot->bind_add)
3920                 return -EOPNOTSUPP;
3921         return sk->sk_prot->bind_add(sk, addr, addr_len);
3922 }
3923 EXPORT_SYMBOL(sock_bind_add);