net/core/sock.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   4  *              operating system.  INET is implemented using the  BSD Socket
   5  *              interface as the means of communication with the user level.
   6  *
   7  *              Generic socket support routines. Memory allocators, socket lock/release
   8  *              handler for protocols to use and generic option handler.
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  */
  85
  86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  87
  88 #include <asm/unaligned.h>
  89 #include <linux/capability.h>
  90 #include <linux/errno.h>
  91 #include <linux/errqueue.h>
  92 #include <linux/types.h>
  93 #include <linux/socket.h>
  94 #include <linux/in.h>
  95 #include <linux/kernel.h>
  96 #include <linux/module.h>
  97 #include <linux/proc_fs.h>
  98 #include <linux/seq_file.h>
  99 #include <linux/sched.h>
 100 #include <linux/sched/mm.h>
 101 #include <linux/timer.h>
 102 #include <linux/string.h>
 103 #include <linux/sockios.h>
 104 #include <linux/net.h>
 105 #include <linux/mm.h>
 106 #include <linux/slab.h>
 107 #include <linux/interrupt.h>
 108 #include <linux/poll.h>
 109 #include <linux/tcp.h>
 110 #include <linux/init.h>
 111 #include <linux/highmem.h>
 112 #include <linux/user_namespace.h>
 113 #include <linux/static_key.h>
 114 #include <linux/memcontrol.h>
 115 #include <linux/prefetch.h>
 116 #include <linux/compat.h>
 117 #include <linux/mroute.h>
 118 #include <linux/mroute6.h>
 119 #include <linux/icmpv6.h>
 120
 121 #include <linux/uaccess.h>
 122
 123 #include <linux/netdevice.h>
 124 #include <net/protocol.h>
 125 #include <linux/skbuff.h>
 126 #include <net/net_namespace.h>
 127 #include <net/request_sock.h>
 128 #include <net/sock.h>
 129 #include <linux/net_tstamp.h>
 130 #include <net/xfrm.h>
 131 #include <linux/ipsec.h>
 132 #include <net/cls_cgroup.h>
 133 #include <net/netprio_cgroup.h>
 134 #include <linux/sock_diag.h>
 135
 136 #include <linux/filter.h>
 137 #include <net/sock_reuseport.h>
 138 #include <net/bpf_sk_storage.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144 #include <net/phonet/phonet.h>
 145
 146 #include <linux/ethtool.h>
 147
 148 #include "dev.h"
 149
 150 static DEFINE_MUTEX(proto_list_mutex);
 151 static LIST_HEAD(proto_list);
 152
 153 static void sock_def_write_space_wfree(struct sock *sk);
 154 static void sock_def_write_space(struct sock *sk);
 155
 156 /**
 157  * sk_ns_capable - General socket capability test
 158  * @sk: Socket to use a capability on or through
 159  * @user_ns: The user namespace of the capability to use
 160  * @cap: The capability to use
 161  *
 162  * Test to see if the opener of the socket had when the socket was
 163  * created and the current process has the capability @cap in the user
 164  * namespace @user_ns.
 165  */
 166 bool sk_ns_capable(const struct sock *sk,
 167                    struct user_namespace *user_ns, int cap)
 168 {
 169         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 170                 ns_capable(user_ns, cap);
 171 }
 172 EXPORT_SYMBOL(sk_ns_capable);
 173
 174 /**
 175  * sk_capable - Socket global capability test
 176  * @sk: Socket to use a capability on or through
 177  * @cap: The global capability to use
 178  *
 179  * Test to see if the opener of the socket had when the socket was
 180  * created and the current process has the capability @cap in all user
 181  * namespaces.
 182  */
 183 bool sk_capable(const struct sock *sk, int cap)
 184 {
 185         return sk_ns_capable(sk, &init_user_ns, cap);
 186 }
 187 EXPORT_SYMBOL(sk_capable);
 188
 189 /**
 190  * sk_net_capable - Network namespace socket capability test
 191  * @sk: Socket to use a capability on or through
 192  * @cap: The capability to use
 193  *
 194  * Test to see if the opener of the socket had when the socket was created
 195  * and the current process has the capability @cap over the network namespace
 196  * the socket is a member of.
 197  */
 198 bool sk_net_capable(const struct sock *sk, int cap)
 199 {
 200         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 201 }
 202 EXPORT_SYMBOL(sk_net_capable);
 203
 204 /*
 205  * Each address family might have different locking rules, so we have
 206  * one slock key per address family and separate keys for internal and
 207  * userspace sockets.
 208  */
 209 static struct lock_class_key af_family_keys[AF_MAX];
 210 static struct lock_class_key af_family_kern_keys[AF_MAX];
 211 static struct lock_class_key af_family_slock_keys[AF_MAX];
 212 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 213
 214 /*
 215  * Make lock validator output more readable. (we pre-construct these
 216  * strings build-time, so that runtime initialization of socket
 217  * locks is fast):
 218  */
 219
 220 #define _sock_locks(x)                                            \
 221   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 222   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 223   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 224   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 225   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 226   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 227   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 228   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 229   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 230   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 231   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 232   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 233   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 234   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 235   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 236   x "AF_MCTP"  , \
 237   x "AF_MAX"
 238
 239 static const char *const af_family_key_strings[AF_MAX+1] = {
 240         _sock_locks("sk_lock-")
 241 };
 242 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 243         _sock_locks("slock-")
 244 };
 245 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 246         _sock_locks("clock-")
 247 };
 248
 249 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 250         _sock_locks("k-sk_lock-")
 251 };
 252 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 253         _sock_locks("k-slock-")
 254 };
 255 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 256         _sock_locks("k-clock-")
 257 };
 258 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 259         _sock_locks("rlock-")
 260 };
 261 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 262         _sock_locks("wlock-")
 263 };
 264 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 265         _sock_locks("elock-")
 266 };
 267
 268 /*
 269  * sk_callback_lock and sk queues locking rules are per-address-family,
 270  * so split the lock classes by using a per-AF key:
 271  */
 272 static struct lock_class_key af_callback_keys[AF_MAX];
 273 static struct lock_class_key af_rlock_keys[AF_MAX];
 274 static struct lock_class_key af_wlock_keys[AF_MAX];
 275 static struct lock_class_key af_elock_keys[AF_MAX];
 276 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 277
 278 /* Run time adjustable parameters. */
 279 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 280 EXPORT_SYMBOL(sysctl_wmem_max);
 281 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 282 EXPORT_SYMBOL(sysctl_rmem_max);
 283 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 284 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 285
 286 /* Maximal space eaten by iovec or ancillary data plus some space */
 287 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 288 EXPORT_SYMBOL(sysctl_optmem_max);
 289
 290 int sysctl_tstamp_allow_data __read_mostly = 1;
 291
 292 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 293 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 294
 295 /**
 296  * sk_set_memalloc - sets %SOCK_MEMALLOC
 297  * @sk: socket to set it on
 298  *
 299  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 300  * It's the responsibility of the admin to adjust min_free_kbytes
 301  * to meet the requirements
 302  */
 303 void sk_set_memalloc(struct sock *sk)
 304 {
 305         sock_set_flag(sk, SOCK_MEMALLOC);
 306         sk->sk_allocation |= __GFP_MEMALLOC;
 307         static_branch_inc(&memalloc_socks_key);
 308 }
 309 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 310
 311 void sk_clear_memalloc(struct sock *sk)
 312 {
 313         sock_reset_flag(sk, SOCK_MEMALLOC);
 314         sk->sk_allocation &= ~__GFP_MEMALLOC;
 315         static_branch_dec(&memalloc_socks_key);
 316
 317         /*
 318          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 319          * progress of swapping. SOCK_MEMALLOC may be cleared while
 320          * it has rmem allocations due to the last swapfile being deactivated
 321          * but there is a risk that the socket is unusable due to exceeding
 322          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 323          */
 324         sk_mem_reclaim(sk);
 325 }
 326 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 327
 328 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 329 {
 330         int ret;
 331         unsigned int noreclaim_flag;
 332
 333         /* these should have been dropped before queueing */
 334         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 335
 336         noreclaim_flag = memalloc_noreclaim_save();
 337         ret = INDIRECT_CALL_INET(sk->sk_backlog_rcv,
 338                                  tcp_v6_do_rcv,
 339                                  tcp_v4_do_rcv,
 340                                  sk, skb);
 341         memalloc_noreclaim_restore(noreclaim_flag);
 342
 343         return ret;
 344 }
 345 EXPORT_SYMBOL(__sk_backlog_rcv);
 346
 347 void sk_error_report(struct sock *sk)
 348 {
 349         sk->sk_error_report(sk);
 350
 351         switch (sk->sk_family) {
 352         case AF_INET:
 353                 fallthrough;
 354         case AF_INET6:
 355                 trace_inet_sk_error_report(sk);
 356                 break;
 357         default:
 358                 break;
 359         }
 360 }
 361 EXPORT_SYMBOL(sk_error_report);
 362
 363 int sock_get_timeout(long timeo, void *optval, bool old_timeval)
 364 {
 365         struct __kernel_sock_timeval tv;
 366
 367         if (timeo == MAX_SCHEDULE_TIMEOUT) {
 368                 tv.tv_sec = 0;
 369                 tv.tv_usec = 0;
 370         } else {
 371                 tv.tv_sec = timeo / HZ;
 372                 tv.tv_usec = ((timeo % HZ) * USEC_PER_SEC) / HZ;
 373         }
 374
 375         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 376                 struct old_timeval32 tv32 = { tv.tv_sec, tv.tv_usec };
 377                 *(struct old_timeval32 *)optval = tv32;
 378                 return sizeof(tv32);
 379         }
 380
 381         if (old_timeval) {
 382                 struct __kernel_old_timeval old_tv;
 383                 old_tv.tv_sec = tv.tv_sec;
 384                 old_tv.tv_usec = tv.tv_usec;
 385                 *(struct __kernel_old_timeval *)optval = old_tv;
 386                 return sizeof(old_tv);
 387         }
 388
 389         *(struct __kernel_sock_timeval *)optval = tv;
 390         return sizeof(tv);
 391 }
 392 EXPORT_SYMBOL(sock_get_timeout);
 393
 394 int sock_copy_user_timeval(struct __kernel_sock_timeval *tv,
 395                            sockptr_t optval, int optlen, bool old_timeval)
 396 {
 397         if (old_timeval && in_compat_syscall() && !COMPAT_USE_64BIT_TIME) {
 398                 struct old_timeval32 tv32;
 399
 400                 if (optlen < sizeof(tv32))
 401                         return -EINVAL;
 402
 403                 if (copy_from_sockptr(&tv32, optval, sizeof(tv32)))
 404                         return -EFAULT;
 405                 tv->tv_sec = tv32.tv_sec;
 406                 tv->tv_usec = tv32.tv_usec;
 407         } else if (old_timeval) {
 408                 struct __kernel_old_timeval old_tv;
 409
 410                 if (optlen < sizeof(old_tv))
 411                         return -EINVAL;
 412                 if (copy_from_sockptr(&old_tv, optval, sizeof(old_tv)))
 413                         return -EFAULT;
 414                 tv->tv_sec = old_tv.tv_sec;
 415                 tv->tv_usec = old_tv.tv_usec;
 416         } else {
 417                 if (optlen < sizeof(*tv))
 418                         return -EINVAL;
 419                 if (copy_from_sockptr(tv, optval, sizeof(*tv)))
 420                         return -EFAULT;
 421         }
 422
 423         return 0;
 424 }
 425 EXPORT_SYMBOL(sock_copy_user_timeval);
 426
 427 static int sock_set_timeout(long *timeo_p, sockptr_t optval, int optlen,
 428                             bool old_timeval)
 429 {
 430         struct __kernel_sock_timeval tv;
 431         int err = sock_copy_user_timeval(&tv, optval, optlen, old_timeval);
 432         long val;
 433
 434         if (err)
 435                 return err;
 436
 437         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 438                 return -EDOM;
 439
 440         if (tv.tv_sec < 0) {
 441                 static int warned __read_mostly;
 442
 443                 WRITE_ONCE(*timeo_p, 0);
 444                 if (warned < 10 && net_ratelimit()) {
 445                         warned++;
 446                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 447                                 __func__, current->comm, task_pid_nr(current));
 448                 }
 449                 return 0;
 450         }
 451         val = MAX_SCHEDULE_TIMEOUT;
 452         if ((tv.tv_sec || tv.tv_usec) &&
 453             (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT / HZ - 1)))
 454                 val = tv.tv_sec * HZ + DIV_ROUND_UP((unsigned long)tv.tv_usec,
 455                                                     USEC_PER_SEC / HZ);
 456         WRITE_ONCE(*timeo_p, val);
 457         return 0;
 458 }
 459
 460 static bool sock_needs_netstamp(const struct sock *sk)
 461 {
 462         switch (sk->sk_family) {
 463         case AF_UNSPEC:
 464         case AF_UNIX:
 465                 return false;
 466         default:
 467                 return true;
 468         }
 469 }
 470
 471 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 472 {
 473         if (sk->sk_flags & flags) {
 474                 sk->sk_flags &= ~flags;
 475                 if (sock_needs_netstamp(sk) &&
 476                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 477                         net_disable_timestamp();
 478         }
 479 }
 480
 481
 482 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 483 {
 484         unsigned long flags;
 485         struct sk_buff_head *list = &sk->sk_receive_queue;
 486
 487         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 488                 atomic_inc(&sk->sk_drops);
 489                 trace_sock_rcvqueue_full(sk, skb);
 490                 return -ENOMEM;
 491         }
 492
 493         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 494                 atomic_inc(&sk->sk_drops);
 495                 return -ENOBUFS;
 496         }
 497
 498         skb->dev = NULL;
 499         skb_set_owner_r(skb, sk);
 500
 501         /* we escape from rcu protected region, make sure we dont leak
 502          * a norefcounted dst
 503          */
 504         skb_dst_force(skb);
 505
 506         spin_lock_irqsave(&list->lock, flags);
 507         sock_skb_set_dropcount(sk, skb);
 508         __skb_queue_tail(list, skb);
 509         spin_unlock_irqrestore(&list->lock, flags);
 510
 511         if (!sock_flag(sk, SOCK_DEAD))
 512                 sk->sk_data_ready(sk);
 513         return 0;
 514 }
 515 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 516
 517 int sock_queue_rcv_skb_reason(struct sock *sk, struct sk_buff *skb,
 518                               enum skb_drop_reason *reason)
 519 {
 520         enum skb_drop_reason drop_reason;
 521         int err;
 522
 523         err = sk_filter(sk, skb);
 524         if (err) {
 525                 drop_reason = SKB_DROP_REASON_SOCKET_FILTER;
 526                 goto out;
 527         }
 528         err = __sock_queue_rcv_skb(sk, skb);
 529         switch (err) {
 530         case -ENOMEM:
 531                 drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF;
 532                 break;
 533         case -ENOBUFS:
 534                 drop_reason = SKB_DROP_REASON_PROTO_MEM;
 535                 break;
 536         default:
 537                 drop_reason = SKB_NOT_DROPPED_YET;
 538                 break;
 539         }
 540 out:
 541         if (reason)
 542                 *reason = drop_reason;
 543         return err;
 544 }
 545 EXPORT_SYMBOL(sock_queue_rcv_skb_reason);
 546
 547 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 548                      const int nested, unsigned int trim_cap, bool refcounted)
 549 {
 550         int rc = NET_RX_SUCCESS;
 551
 552         if (sk_filter_trim_cap(sk, skb, trim_cap))
 553                 goto discard_and_relse;
 554
 555         skb->dev = NULL;
 556
 557         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 558                 atomic_inc(&sk->sk_drops);
 559                 goto discard_and_relse;
 560         }
 561         if (nested)
 562                 bh_lock_sock_nested(sk);
 563         else
 564                 bh_lock_sock(sk);
 565         if (!sock_owned_by_user(sk)) {
 566                 /*
 567                  * trylock + unlock semantics:
 568                  */
 569                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 570
 571                 rc = sk_backlog_rcv(sk, skb);
 572
 573                 mutex_release(&sk->sk_lock.dep_map, _RET_IP_);
 574         } else if (sk_add_backlog(sk, skb, READ_ONCE(sk->sk_rcvbuf))) {
 575                 bh_unlock_sock(sk);
 576                 atomic_inc(&sk->sk_drops);
 577                 goto discard_and_relse;
 578         }
 579
 580         bh_unlock_sock(sk);
 581 out:
 582         if (refcounted)
 583                 sock_put(sk);
 584         return rc;
 585 discard_and_relse:
 586         kfree_skb(skb);
 587         goto out;
 588 }
 589 EXPORT_SYMBOL(__sk_receive_skb);
 590
 591 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ip6_dst_check(struct dst_entry *,
 592                                                           u32));
 593 INDIRECT_CALLABLE_DECLARE(struct dst_entry *ipv4_dst_check(struct dst_entry *,
 594                                                            u32));
 595 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 596 {
 597         struct dst_entry *dst = __sk_dst_get(sk);
 598
 599         if (dst && dst->obsolete &&
 600             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 601                                dst, cookie) == NULL) {
 602                 sk_tx_queue_clear(sk);
 603                 WRITE_ONCE(sk->sk_dst_pending_confirm, 0);
 604                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 605                 dst_release(dst);
 606                 return NULL;
 607         }
 608
 609         return dst;
 610 }
 611 EXPORT_SYMBOL(__sk_dst_check);
 612
 613 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 614 {
 615         struct dst_entry *dst = sk_dst_get(sk);
 616
 617         if (dst && dst->obsolete &&
 618             INDIRECT_CALL_INET(dst->ops->check, ip6_dst_check, ipv4_dst_check,
 619                                dst, cookie) == NULL) {
 620                 sk_dst_reset(sk);
 621                 dst_release(dst);
 622                 return NULL;
 623         }
 624
 625         return dst;
 626 }
 627 EXPORT_SYMBOL(sk_dst_check);
 628
 629 static int sock_bindtoindex_locked(struct sock *sk, int ifindex)
 630 {
 631         int ret = -ENOPROTOOPT;
 632 #ifdef CONFIG_NETDEVICES
 633         struct net *net = sock_net(sk);
 634
 635         /* Sorry... */
 636         ret = -EPERM;
 637         if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
 638                 goto out;
 639
 640         ret = -EINVAL;
 641         if (ifindex < 0)
 642                 goto out;
 643
 644         /* Paired with all READ_ONCE() done locklessly. */
 645         WRITE_ONCE(sk->sk_bound_dev_if, ifindex);
 646
 647         if (sk->sk_prot->rehash)
 648                 sk->sk_prot->rehash(sk);
 649         sk_dst_reset(sk);
 650
 651         ret = 0;
 652
 653 out:
 654 #endif
 655
 656         return ret;
 657 }
 658
 659 int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk)
 660 {
 661         int ret;
 662
 663         if (lock_sk)
 664                 lock_sock(sk);
 665         ret = sock_bindtoindex_locked(sk, ifindex);
 666         if (lock_sk)
 667                 release_sock(sk);
 668
 669         return ret;
 670 }
 671 EXPORT_SYMBOL(sock_bindtoindex);
 672
 673 static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen)
 674 {
 675         int ret = -ENOPROTOOPT;
 676 #ifdef CONFIG_NETDEVICES
 677         struct net *net = sock_net(sk);
 678         char devname[IFNAMSIZ];
 679         int index;
 680
 681         ret = -EINVAL;
 682         if (optlen < 0)
 683                 goto out;
 684
 685         /* Bind this socket to a particular device like "eth0",
 686          * as specified in the passed interface name. If the
 687          * name is "" or the option length is zero the socket
 688          * is not bound.
 689          */
 690         if (optlen > IFNAMSIZ - 1)
 691                 optlen = IFNAMSIZ - 1;
 692         memset(devname, 0, sizeof(devname));
 693
 694         ret = -EFAULT;
 695         if (copy_from_sockptr(devname, optval, optlen))
 696                 goto out;
 697
 698         index = 0;
 699         if (devname[0] != '\0') {
 700                 struct net_device *dev;
 701
 702                 rcu_read_lock();
 703                 dev = dev_get_by_name_rcu(net, devname);
 704                 if (dev)
 705                         index = dev->ifindex;
 706                 rcu_read_unlock();
 707                 ret = -ENODEV;
 708                 if (!dev)
 709                         goto out;
 710         }
 711
 712         sockopt_lock_sock(sk);
 713         ret = sock_bindtoindex_locked(sk, index);
 714         sockopt_release_sock(sk);
 715 out:
 716 #endif
 717
 718         return ret;
 719 }
 720
 721 static int sock_getbindtodevice(struct sock *sk, sockptr_t optval,
 722                                 sockptr_t optlen, int len)
 723 {
 724         int ret = -ENOPROTOOPT;
 725 #ifdef CONFIG_NETDEVICES
 726         int bound_dev_if = READ_ONCE(sk->sk_bound_dev_if);
 727         struct net *net = sock_net(sk);
 728         char devname[IFNAMSIZ];
 729
 730         if (bound_dev_if == 0) {
 731                 len = 0;
 732                 goto zero;
 733         }
 734
 735         ret = -EINVAL;
 736         if (len < IFNAMSIZ)
 737                 goto out;
 738
 739         ret = netdev_get_name(net, devname, bound_dev_if);
 740         if (ret)
 741                 goto out;
 742
 743         len = strlen(devname) + 1;
 744
 745         ret = -EFAULT;
 746         if (copy_to_sockptr(optval, devname, len))
 747                 goto out;
 748
 749 zero:
 750         ret = -EFAULT;
 751         if (copy_to_sockptr(optlen, &len, sizeof(int)))
 752                 goto out;
 753
 754         ret = 0;
 755
 756 out:
 757 #endif
 758
 759         return ret;
 760 }
 761
 762 bool sk_mc_loop(const struct sock *sk)
 763 {
 764         if (dev_recursion_level())
 765                 return false;
 766         if (!sk)
 767                 return true;
 768         /* IPV6_ADDRFORM can change sk->sk_family under us. */
 769         switch (READ_ONCE(sk->sk_family)) {
 770         case AF_INET:
 771                 return inet_test_bit(MC_LOOP, sk);
 772 #if IS_ENABLED(CONFIG_IPV6)
 773         case AF_INET6:
 774                 return inet6_test_bit(MC6_LOOP, sk);
 775 #endif
 776         }
 777         WARN_ON_ONCE(1);
 778         return true;
 779 }
 780 EXPORT_SYMBOL(sk_mc_loop);
 781
 782 void sock_set_reuseaddr(struct sock *sk)
 783 {
 784         lock_sock(sk);
 785         sk->sk_reuse = SK_CAN_REUSE;
 786         release_sock(sk);
 787 }
 788 EXPORT_SYMBOL(sock_set_reuseaddr);
 789
 790 void sock_set_reuseport(struct sock *sk)
 791 {
 792         lock_sock(sk);
 793         sk->sk_reuseport = true;
 794         release_sock(sk);
 795 }
 796 EXPORT_SYMBOL(sock_set_reuseport);
 797
 798 void sock_no_linger(struct sock *sk)
 799 {
 800         lock_sock(sk);
 801         WRITE_ONCE(sk->sk_lingertime, 0);
 802         sock_set_flag(sk, SOCK_LINGER);
 803         release_sock(sk);
 804 }
 805 EXPORT_SYMBOL(sock_no_linger);
 806
 807 void sock_set_priority(struct sock *sk, u32 priority)
 808 {
 809         WRITE_ONCE(sk->sk_priority, priority);
 810 }
 811 EXPORT_SYMBOL(sock_set_priority);
 812
 813 void sock_set_sndtimeo(struct sock *sk, s64 secs)
 814 {
 815         lock_sock(sk);
 816         if (secs && secs < MAX_SCHEDULE_TIMEOUT / HZ - 1)
 817                 WRITE_ONCE(sk->sk_sndtimeo, secs * HZ);
 818         else
 819                 WRITE_ONCE(sk->sk_sndtimeo, MAX_SCHEDULE_TIMEOUT);
 820         release_sock(sk);
 821 }
 822 EXPORT_SYMBOL(sock_set_sndtimeo);
 823
 824 static void __sock_set_timestamps(struct sock *sk, bool val, bool new, bool ns)
 825 {
 826         if (val)  {
 827                 sock_valbool_flag(sk, SOCK_TSTAMP_NEW, new);
 828                 sock_valbool_flag(sk, SOCK_RCVTSTAMPNS, ns);
 829                 sock_set_flag(sk, SOCK_RCVTSTAMP);
 830                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 831         } else {
 832                 sock_reset_flag(sk, SOCK_RCVTSTAMP);
 833                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 834         }
 835 }
 836
 837 void sock_enable_timestamps(struct sock *sk)
 838 {
 839         lock_sock(sk);
 840         __sock_set_timestamps(sk, true, false, true);
 841         release_sock(sk);
 842 }
 843 EXPORT_SYMBOL(sock_enable_timestamps);
 844
 845 void sock_set_timestamp(struct sock *sk, int optname, bool valbool)
 846 {
 847         switch (optname) {
 848         case SO_TIMESTAMP_OLD:
 849                 __sock_set_timestamps(sk, valbool, false, false);
 850                 break;
 851         case SO_TIMESTAMP_NEW:
 852                 __sock_set_timestamps(sk, valbool, true, false);
 853                 break;
 854         case SO_TIMESTAMPNS_OLD:
 855                 __sock_set_timestamps(sk, valbool, false, true);
 856                 break;
 857         case SO_TIMESTAMPNS_NEW:
 858                 __sock_set_timestamps(sk, valbool, true, true);
 859                 break;
 860         }
 861 }
 862
 863 static int sock_timestamping_bind_phc(struct sock *sk, int phc_index)
 864 {
 865         struct net *net = sock_net(sk);
 866         struct net_device *dev = NULL;
 867         bool match = false;
 868         int *vclock_index;
 869         int i, num;
 870
 871         if (sk->sk_bound_dev_if)
 872                 dev = dev_get_by_index(net, sk->sk_bound_dev_if);
 873
 874         if (!dev) {
 875                 pr_err("%s: sock not bind to device\n", __func__);
 876                 return -EOPNOTSUPP;
 877         }
 878
 879         num = ethtool_get_phc_vclocks(dev, &vclock_index);
 880         dev_put(dev);
 881
 882         for (i = 0; i < num; i++) {
 883                 if (*(vclock_index + i) == phc_index) {
 884                         match = true;
 885                         break;
 886                 }
 887         }
 888
 889         if (num > 0)
 890                 kfree(vclock_index);
 891
 892         if (!match)
 893                 return -EINVAL;
 894
 895         WRITE_ONCE(sk->sk_bind_phc, phc_index);
 896
 897         return 0;
 898 }
 899
 900 int sock_set_timestamping(struct sock *sk, int optname,
 901                           struct so_timestamping timestamping)
 902 {
 903         int val = timestamping.flags;
 904         int ret;
 905
 906         if (val & ~SOF_TIMESTAMPING_MASK)
 907                 return -EINVAL;
 908
 909         if (val & SOF_TIMESTAMPING_OPT_ID_TCP &&
 910             !(val & SOF_TIMESTAMPING_OPT_ID))
 911                 return -EINVAL;
 912
 913         if (val & SOF_TIMESTAMPING_OPT_ID &&
 914             !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 915                 if (sk_is_tcp(sk)) {
 916                         if ((1 << sk->sk_state) &
 917                             (TCPF_CLOSE | TCPF_LISTEN))
 918                                 return -EINVAL;
 919                         if (val & SOF_TIMESTAMPING_OPT_ID_TCP)
 920                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->write_seq);
 921                         else
 922                                 atomic_set(&sk->sk_tskey, tcp_sk(sk)->snd_una);
 923                 } else {
 924                         atomic_set(&sk->sk_tskey, 0);
 925                 }
 926         }
 927
 928         if (val & SOF_TIMESTAMPING_OPT_STATS &&
 929             !(val & SOF_TIMESTAMPING_OPT_TSONLY))
 930                 return -EINVAL;
 931
 932         if (val & SOF_TIMESTAMPING_BIND_PHC) {
 933                 ret = sock_timestamping_bind_phc(sk, timestamping.bind_phc);
 934                 if (ret)
 935                         return ret;
 936         }
 937
 938         WRITE_ONCE(sk->sk_tsflags, val);
 939         sock_valbool_flag(sk, SOCK_TSTAMP_NEW, optname == SO_TIMESTAMPING_NEW);
 940
 941         if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 942                 sock_enable_timestamp(sk,
 943                                       SOCK_TIMESTAMPING_RX_SOFTWARE);
 944         else
 945                 sock_disable_timestamp(sk,
 946                                        (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 947         return 0;
 948 }
 949
 950 void sock_set_keepalive(struct sock *sk)
 951 {
 952         lock_sock(sk);
 953         if (sk->sk_prot->keepalive)
 954                 sk->sk_prot->keepalive(sk, true);
 955         sock_valbool_flag(sk, SOCK_KEEPOPEN, true);
 956         release_sock(sk);
 957 }
 958 EXPORT_SYMBOL(sock_set_keepalive);
 959
 960 static void __sock_set_rcvbuf(struct sock *sk, int val)
 961 {
 962         /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
 963          * as a negative value.
 964          */
 965         val = min_t(int, val, INT_MAX / 2);
 966         sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 967
 968         /* We double it on the way in to account for "struct sk_buff" etc.
 969          * overhead.   Applications assume that the SO_RCVBUF setting they make
 970          * will allow that much actual data to be received on that socket.
 971          *
 972          * Applications are unaware that "struct sk_buff" and other overheads
 973          * allocate from the receive buffer during socket buffer allocation.
 974          *
 975          * And after considering the possible alternatives, returning the value
 976          * we actually used in getsockopt is the most desirable behavior.
 977          */
 978         WRITE_ONCE(sk->sk_rcvbuf, max_t(int, val * 2, SOCK_MIN_RCVBUF));
 979 }
 980
 981 void sock_set_rcvbuf(struct sock *sk, int val)
 982 {
 983         lock_sock(sk);
 984         __sock_set_rcvbuf(sk, val);
 985         release_sock(sk);
 986 }
 987 EXPORT_SYMBOL(sock_set_rcvbuf);
 988
 989 static void __sock_set_mark(struct sock *sk, u32 val)
 990 {
 991         if (val != sk->sk_mark) {
 992                 WRITE_ONCE(sk->sk_mark, val);
 993                 sk_dst_reset(sk);
 994         }
 995 }
 996
 997 void sock_set_mark(struct sock *sk, u32 val)
 998 {
 999         lock_sock(sk);
1000         __sock_set_mark(sk, val);
1001         release_sock(sk);
1002 }
1003 EXPORT_SYMBOL(sock_set_mark);
1004
1005 static void sock_release_reserved_memory(struct sock *sk, int bytes)
1006 {
1007         /* Round down bytes to multiple of pages */
1008         bytes = round_down(bytes, PAGE_SIZE);
1009
1010         WARN_ON(bytes > sk->sk_reserved_mem);
1011         WRITE_ONCE(sk->sk_reserved_mem, sk->sk_reserved_mem - bytes);
1012         sk_mem_reclaim(sk);
1013 }
1014
1015 static int sock_reserve_memory(struct sock *sk, int bytes)
1016 {
1017         long allocated;
1018         bool charged;
1019         int pages;
1020
1021         if (!mem_cgroup_sockets_enabled || !sk->sk_memcg || !sk_has_account(sk))
1022                 return -EOPNOTSUPP;
1023
1024         if (!bytes)
1025                 return 0;
1026
1027         pages = sk_mem_pages(bytes);
1028
1029         /* pre-charge to memcg */
1030         charged = mem_cgroup_charge_skmem(sk->sk_memcg, pages,
1031                                           GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1032         if (!charged)
1033                 return -ENOMEM;
1034
1035         /* pre-charge to forward_alloc */
1036         sk_memory_allocated_add(sk, pages);
1037         allocated = sk_memory_allocated(sk);
1038         /* If the system goes into memory pressure with this
1039          * precharge, give up and return error.
1040          */
1041         if (allocated > sk_prot_mem_limits(sk, 1)) {
1042                 sk_memory_allocated_sub(sk, pages);
1043                 mem_cgroup_uncharge_skmem(sk->sk_memcg, pages);
1044                 return -ENOMEM;
1045         }
1046         sk_forward_alloc_add(sk, pages << PAGE_SHIFT);
1047
1048         WRITE_ONCE(sk->sk_reserved_mem,
1049                    sk->sk_reserved_mem + (pages << PAGE_SHIFT));
1050
1051         return 0;
1052 }
1053
/**
 * sockopt_lock_sock - take the socket lock unless called from a bpf prog
 * @sk: socket to lock
 *
 * When current->bpf_ctx is set, the setsockopt is called from a bpf prog
 * and bpf has ensured the sk lock was acquired before calling setsockopt(),
 * so nothing is done in that case.
 */
void sockopt_lock_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                lock_sock(sk);
}
EXPORT_SYMBOL(sockopt_lock_sock);
1066
/**
 * sockopt_release_sock - counterpart of sockopt_lock_sock()
 * @sk: socket to release
 *
 * Calls release_sock() unless has_current_bpf_ctx() is true.
 */
void sockopt_release_sock(struct sock *sk)
{
        if (!has_current_bpf_ctx())
                release_sock(sk);
}
EXPORT_SYMBOL(sockopt_release_sock);
1075
1076 bool sockopt_ns_capable(struct user_namespace *ns, int cap)
1077 {
1078         return has_current_bpf_ctx() || ns_capable(ns, cap);
1079 }
1080 EXPORT_SYMBOL(sockopt_ns_capable);
1081
1082 bool sockopt_capable(int cap)
1083 {
1084         return has_current_bpf_ctx() || capable(cap);
1085 }
1086 EXPORT_SYMBOL(sockopt_capable);
1087
1088 /*
1089  *      This is meant for all protocols to use and covers goings on
1090  *      at the socket level. Everything here is generic.
1091  */
1092
/**
 * sk_setsockopt - set a socket-level (SO_*) option on a struct sock
 * @sk: socket to modify
 * @level: option level; not examined by this function
 * @optname: SO_* option to set
 * @optval: pointer to the option value
 * @optlen: number of bytes available at @optval
 *
 * SO_BINDTODEVICE is dispatched first. Every other option needs at least
 * sizeof(int) bytes, which are copied into a local int; options that take a
 * larger argument copy it again from @optval. A first switch handles the
 * options that return without taking the socket lock; all remaining options
 * run between sockopt_lock_sock() and sockopt_release_sock().
 *
 * Return: 0 on success or a negative errno. Options not recognised by either
 * switch yield -ENOPROTOOPT.
 */
int sk_setsockopt(struct sock *sk, int level, int optname,
                  sockptr_t optval, unsigned int optlen)
{
        struct so_timestamping timestamping;
        struct socket *sock = sk->sk_socket;
        struct sock_txtime sk_txtime;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;

        /*
         *      SO_BINDTODEVICE is handed off with the raw optval/optlen,
         *      before the int-sized argument below is required and copied.
         */

        if (optname == SO_BINDTODEVICE)
                return sock_setbindtodevice(sk, optval, optlen);

        if (optlen < sizeof(int))
                return -EINVAL;

        if (copy_from_sockptr(&val, optval, sizeof(val)))
                return -EFAULT;

        /* Boolean view of the argument: any non-zero value counts as 1. */
        valbool = val ? 1 : 0;

        /* handle options which do not require locking the socket. */
        switch (optname) {
        case SO_PRIORITY:
                /* Values 0..6 need no capability; anything else requires
                 * CAP_NET_RAW or CAP_NET_ADMIN in the netns' user namespace.
                 */
                if ((val >= 0 && val <= 6) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) ||
                    sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        sock_set_priority(sk, val);
                        return 0;
                }
                return -EPERM;
        case SO_PASSSEC:
                assign_bit(SOCK_PASSSEC, &sock->flags, valbool);
                return 0;
        case SO_PASSCRED:
                assign_bit(SOCK_PASSCRED, &sock->flags, valbool);
                return 0;
        case SO_PASSPIDFD:
                assign_bit(SOCK_PASSPIDFD, &sock->flags, valbool);
                return 0;
        /* These options cannot be set here. */
        case SO_TYPE:
        case SO_PROTOCOL:
        case SO_DOMAIN:
        case SO_ERROR:
                return -ENOPROTOOPT;
#ifdef CONFIG_NET_RX_BUSY_POLL
        case SO_BUSY_POLL:
                if (val < 0)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_ll_usec, val);
                return 0;
        case SO_PREFER_BUSY_POLL:
                /* Only enabling requires CAP_NET_ADMIN. */
                if (valbool && !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
                return 0;
        case SO_BUSY_POLL_BUDGET:
                /* Raising the budget above its current value requires
                 * CAP_NET_ADMIN; the range check comes afterwards.
                 */
                if (val > READ_ONCE(sk->sk_busy_poll_budget) &&
                    !sockopt_capable(CAP_NET_ADMIN))
                        return -EPERM;
                if (val < 0 || val > U16_MAX)
                        return -EINVAL;
                WRITE_ONCE(sk->sk_busy_poll_budget, val);
                return 0;
#endif
        case SO_MAX_PACING_RATE:
                {
                /* An int argument of ~0U is widened to ~0UL. */
                unsigned long ulval = (val == ~0U) ? ~0UL : (unsigned int)val;
                unsigned long pacing_rate;

                /* Where unsigned long is wider than int and the caller
                 * supplied at least that many bytes, re-read the argument
                 * at full width.
                 */
                if (sizeof(ulval) != sizeof(val) &&
                    optlen >= sizeof(ulval) &&
                    copy_from_sockptr(&ulval, optval, sizeof(ulval))) {
                        return -EFAULT;
                }
                if (ulval != ~0UL)
                        cmpxchg(&sk->sk_pacing_status,
                                SK_PACING_NONE,
                                SK_PACING_NEEDED);
                /* Pairs with READ_ONCE() from sk_getsockopt() */
                WRITE_ONCE(sk->sk_max_pacing_rate, ulval);
                /* Lower the current pacing rate if it exceeds the new cap. */
                pacing_rate = READ_ONCE(sk->sk_pacing_rate);
                if (ulval < pacing_rate)
                        WRITE_ONCE(sk->sk_pacing_rate, ulval);
                return 0;
                }
        case SO_TXREHASH:
                if (val < -1 || val > 1)
                        return -EINVAL;
                /* SOCK_TXREHASH_DEFAULT is replaced by the netns sysctl. */
                if ((u8)val == SOCK_TXREHASH_DEFAULT)
                        val = READ_ONCE(sock_net(sk)->core.sysctl_txrehash);
                /* Paired with READ_ONCE() in tcp_rtx_synack()
                 * and sk_getsockopt().
                 */
                WRITE_ONCE(sk->sk_txrehash, (u8)val);
                return 0;
        }

        /* Everything below runs between sockopt_lock_sock() and
         * sockopt_release_sock(): cases report errors through 'ret' and
         * 'break' rather than returning directly.
         */
        sockopt_lock_sock(sk);

        switch (optname) {
        case SO_DEBUG:
                if (val && !sockopt_capable(CAP_NET_ADMIN))
                        ret = -EACCES;
                else
                        sock_valbool_flag(sk, SOCK_DBG, valbool);
                break;
        case SO_REUSEADDR:
                sk->sk_reuse = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
                break;
        case SO_REUSEPORT:
                sk->sk_reuseport = valbool;
                break;
        case SO_DONTROUTE:
                sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
                sk_dst_reset(sk);
                break;
        case SO_BROADCAST:
                sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
                break;
        case SO_SNDBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                val = min_t(u32, val, READ_ONCE(sysctl_wmem_max));
                /* SO_SNDBUFFORCE jumps here, skipping the sysctl_wmem_max
                 * clamp above.
                 */
set_sndbuf:
                /* Ensure val * 2 fits into an int, to prevent max_t()
                 * from treating it as a negative value.
                 */
                val = min_t(int, val, INT_MAX / 2);
                sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
                WRITE_ONCE(sk->sk_sndbuf,
                           max_t(int, val * 2, SOCK_MIN_SNDBUF));
                /* Wake up sending tasks if we upped the value. */
                sk->sk_write_space(sk);
                break;

        case SO_SNDBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                if (val < 0)
                        val = 0;
                goto set_sndbuf;

        case SO_RCVBUF:
                /* Don't error on this BSD doesn't and if you think
                 * about it this is right. Otherwise apps have to
                 * play 'guess the biggest size' games. RCVBUF/SNDBUF
                 * are treated in BSD as hints
                 */
                __sock_set_rcvbuf(sk, min_t(u32, val, READ_ONCE(sysctl_rmem_max)));
                break;

        case SO_RCVBUFFORCE:
                if (!sockopt_capable(CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                /* No negative values (to prevent underflow, as val will be
                 * multiplied by 2).
                 */
                __sock_set_rcvbuf(sk, max(val, 0));
                break;

        case SO_KEEPALIVE:
                if (sk->sk_prot->keepalive)
                        sk->sk_prot->keepalive(sk, valbool);
                sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
                break;

        case SO_OOBINLINE:
                sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
                break;

        case SO_NO_CHECK:
                sk->sk_no_check_tx = valbool;
                break;

        case SO_LINGER:
                if (optlen < sizeof(ling)) {
                        ret = -EINVAL;  /* 1003.1g */
                        break;
                }
                if (copy_from_sockptr(&ling, optval, sizeof(ling))) {
                        ret = -EFAULT;
                        break;
                }
                if (!ling.l_onoff) {
                        sock_reset_flag(sk, SOCK_LINGER);
                } else {
                        unsigned long t_sec = ling.l_linger;

                        /* Saturate instead of overflowing the conversion
                         * from seconds to jiffies.
                         */
                        if (t_sec >= MAX_SCHEDULE_TIMEOUT / HZ)
                                WRITE_ONCE(sk->sk_lingertime, MAX_SCHEDULE_TIMEOUT);
                        else
                                WRITE_ONCE(sk->sk_lingertime, t_sec * HZ);
                        sock_set_flag(sk, SOCK_LINGER);
                }
                break;

        /* Accepted, but nothing is changed. */
        case SO_BSDCOMPAT:
                break;

        case SO_TIMESTAMP_OLD:
        case SO_TIMESTAMP_NEW:
        case SO_TIMESTAMPNS_OLD:
        case SO_TIMESTAMPNS_NEW:
                sock_set_timestamp(sk, optname, valbool);
                break;

        case SO_TIMESTAMPING_NEW:
        case SO_TIMESTAMPING_OLD:
                /* A full struct so_timestamping is used only when optlen
                 * matches its size exactly; any other length is treated as
                 * a plain int of flags with the rest of the struct zeroed.
                 */
                if (optlen == sizeof(timestamping)) {
                        if (copy_from_sockptr(&timestamping, optval,
                                              sizeof(timestamping))) {
                                ret = -EFAULT;
                                break;
                        }
                } else {
                        memset(&timestamping, 0, sizeof(timestamping));
                        timestamping.flags = val;
                }
                ret = sock_set_timestamping(sk, optname, timestamping);
                break;

        case SO_RCVLOWAT:
                {
                int (*set_rcvlowat)(struct sock *sk, int val) = NULL;

                /* A negative request is treated as INT_MAX. */
                if (val < 0)
                        val = INT_MAX;
                if (sock)
                        set_rcvlowat = READ_ONCE(sock->ops)->set_rcvlowat;
                if (set_rcvlowat)
                        ret = set_rcvlowat(sk, val);
                else
                        /* Without a hook, a value of 0 is stored as 1. */
                        WRITE_ONCE(sk->sk_rcvlowat, val ? : 1);
                break;
                }
        case SO_RCVTIMEO_OLD:
        case SO_RCVTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_rcvtimeo, optval,
                                       optlen, optname == SO_RCVTIMEO_OLD);
                break;

        case SO_SNDTIMEO_OLD:
        case SO_SNDTIMEO_NEW:
                ret = sock_set_timeout(&sk->sk_sndtimeo, optval,
                                       optlen, optname == SO_SNDTIMEO_OLD);
                break;

        case SO_ATTACH_FILTER: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_BPF:
                /* The argument must be exactly a u32; it is passed to
                 * sk_attach_bpf() as 'ufd'.
                 */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_attach_bpf(ufd, sk);
                }
                break;

        case SO_ATTACH_REUSEPORT_CBPF: {
                struct sock_fprog fprog;

                ret = copy_bpf_fprog_from_user(&fprog, optval, optlen);
                if (!ret)
                        ret = sk_reuseport_attach_filter(&fprog, sk);
                break;
        }
        case SO_ATTACH_REUSEPORT_EBPF:
                /* Same argument handling as SO_ATTACH_BPF. */
                ret = -EINVAL;
                if (optlen == sizeof(u32)) {
                        u32 ufd;

                        ret = -EFAULT;
                        if (copy_from_sockptr(&ufd, optval, sizeof(ufd)))
                                break;

                        ret = sk_reuseport_attach_bpf(ufd, sk);
                }
                break;

        case SO_DETACH_REUSEPORT_BPF:
                ret = reuseport_detach_prog(sk);
                break;

        case SO_DETACH_FILTER:
                ret = sk_detach_filter(sk);
                break;

        case SO_LOCK_FILTER:
                /* Once SOCK_FILTER_LOCKED is set it cannot be cleared. */
                if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
                        ret = -EPERM;
                else
                        sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
                break;

        case SO_MARK:
                if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }

                __sock_set_mark(sk, val);
                break;
        case SO_RCVMARK:
                sock_valbool_flag(sk, SOCK_RCVMARK, valbool);
                break;

        case SO_RXQ_OVFL:
                sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
                break;

        case SO_WIFI_STATUS:
                sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
                break;

        case SO_PEEK_OFF:
                {
                int (*set_peek_off)(struct sock *sk, int val);

                set_peek_off = READ_ONCE(sock->ops)->set_peek_off;
                if (set_peek_off)
                        ret = set_peek_off(sk, val);
                else
                        ret = -EOPNOTSUPP;
                break;
                }

        case SO_NOFCS:
                sock_valbool_flag(sk, SOCK_NOFCS, valbool);
                break;

        case SO_SELECT_ERR_QUEUE:
                sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
                break;


        case SO_INCOMING_CPU:
                reuseport_update_incoming_cpu(sk, val);
                break;

        case SO_CNX_ADVICE:
                /* Only the value 1 has an effect. */
                if (val == 1)
                        dst_negative_advice(sk);
                break;

        case SO_ZEROCOPY:
                /* Accepted for PF_INET/PF_INET6 sockets that are TCP or
                 * UDP datagram sockets, and for PF_RDS; everything else
                 * gets -EOPNOTSUPP.
                 */
                if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
                        if (!(sk_is_tcp(sk) ||
                              (sk->sk_type == SOCK_DGRAM &&
                               sk->sk_protocol == IPPROTO_UDP)))
                                ret = -EOPNOTSUPP;
                } else if (sk->sk_family != PF_RDS) {
                        ret = -EOPNOTSUPP;
                }
                if (!ret) {
                        if (val < 0 || val > 1)
                                ret = -EINVAL;
                        else
                                sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
                }
                break;

        case SO_TXTIME:
                /* The argument must be exactly a struct sock_txtime with
                 * no flag bits outside SOF_TXTIME_FLAGS_MASK.
                 */
                if (optlen != sizeof(struct sock_txtime)) {
                        ret = -EINVAL;
                        break;
                } else if (copy_from_sockptr(&sk_txtime, optval,
                           sizeof(struct sock_txtime))) {
                        ret = -EFAULT;
                        break;
                } else if (sk_txtime.flags & ~SOF_TXTIME_FLAGS_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
                 * scheduler has enough safe guards.
                 */
                if (sk_txtime.clockid != CLOCK_MONOTONIC &&
                    !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) {
                        ret = -EPERM;
                        break;
                }
                sock_valbool_flag(sk, SOCK_TXTIME, true);
                sk->sk_clockid = sk_txtime.clockid;
                sk->sk_txtime_deadline_mode =
                        !!(sk_txtime.flags & SOF_TXTIME_DEADLINE_MODE);
                sk->sk_txtime_report_errors =
                        !!(sk_txtime.flags & SOF_TXTIME_REPORT_ERRORS);
                break;

        case SO_BINDTOIFINDEX:
                ret = sock_bindtoindex_locked(sk, val);
                break;

        case SO_BUF_LOCK:
                if (val & ~SOCK_BUF_LOCK_MASK) {
                        ret = -EINVAL;
                        break;
                }
                /* Replace only the bits covered by SOCK_BUF_LOCK_MASK. */
                sk->sk_userlocks = val | (sk->sk_userlocks &
                                          ~SOCK_BUF_LOCK_MASK);
                break;

        case SO_RESERVE_MEM:
        {
                int delta;

                if (val < 0) {
                        ret = -EINVAL;
                        break;
                }

                /* val is the new target; release or reserve the difference
                 * from the current reservation.
                 */
                delta = val - sk->sk_reserved_mem;
                if (delta < 0)
                        sock_release_reserved_memory(sk, -delta);
                else
                        ret = sock_reserve_memory(sk, delta);
                break;
        }

        default:
                ret = -ENOPROTOOPT;
                break;
        }
        sockopt_release_sock(sk);
        return ret;
}
1548
1549 int sock_setsockopt(struct socket *sock, int level, int optname,
1550                     sockptr_t optval, unsigned int optlen)
1551 {
1552         return sk_setsockopt(sock->sk, level, optname,
1553                              optval, optlen);
1554 }
1555 EXPORT_SYMBOL(sock_setsockopt);
1556
1557 static const struct cred *sk_get_peer_cred(struct sock *sk)
1558 {
1559         const struct cred *cred;
1560
1561         spin_lock(&sk->sk_peer_lock);
1562         cred = get_cred(sk->sk_peer_cred);
1563         spin_unlock(&sk->sk_peer_lock);
1564
1565         return cred;
1566 }
1567
1568 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1569                           struct ucred *ucred)
1570 {
1571         ucred->pid = pid_vnr(pid);
1572         ucred->uid = ucred->gid = -1;
1573         if (cred) {
1574                 struct user_namespace *current_ns = current_user_ns();
1575
1576                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1577                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1578         }
1579 }
1580
1581 static int groups_to_user(sockptr_t dst, const struct group_info *src)
1582 {
1583         struct user_namespace *user_ns = current_user_ns();
1584         int i;
1585
1586         for (i = 0; i < src->ngroups; i++) {
1587                 gid_t gid = from_kgid_munged(user_ns, src->gid[i]);
1588
1589                 if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid)))
1590                         return -EFAULT;
1591         }
1592
1593         return 0;
1594 }
1595
1596 int sk_getsockopt(struct sock *sk, int level, int optname,
1597                   sockptr_t optval, sockptr_t optlen)
1598 {
1599         struct socket *sock = sk->sk_socket;
1600
1601         union {
1602                 int val;
1603                 u64 val64;
1604                 unsigned long ulval;
1605                 struct linger ling;
1606                 struct old_timeval32 tm32;
1607                 struct __kernel_old_timeval tm;
1608                 struct  __kernel_sock_timeval stm;
1609                 struct sock_txtime txtime;
1610                 struct so_timestamping timestamping;
1611         } v;
1612
1613         int lv = sizeof(int);
1614         int len;
1615
1616         if (copy_from_sockptr(&len, optlen, sizeof(int)))
1617                 return -EFAULT;
1618         if (len < 0)
1619                 return -EINVAL;
1620
1621         memset(&v, 0, sizeof(v));
1622
1623         switch (optname) {
1624         case SO_DEBUG:
1625                 v.val = sock_flag(sk, SOCK_DBG);
1626                 break;
1627
1628         case SO_DONTROUTE:
1629                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1630                 break;
1631
1632         case SO_BROADCAST:
1633                 v.val = sock_flag(sk, SOCK_BROADCAST);
1634                 break;
1635
1636         case SO_SNDBUF:
1637                 v.val = READ_ONCE(sk->sk_sndbuf);
1638                 break;
1639
1640         case SO_RCVBUF:
1641                 v.val = READ_ONCE(sk->sk_rcvbuf);
1642                 break;
1643
1644         case SO_REUSEADDR:
1645                 v.val = sk->sk_reuse;
1646                 break;
1647
1648         case SO_REUSEPORT:
1649                 v.val = sk->sk_reuseport;
1650                 break;
1651
1652         case SO_KEEPALIVE:
1653                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1654                 break;
1655
1656         case SO_TYPE:
1657                 v.val = sk->sk_type;
1658                 break;
1659
1660         case SO_PROTOCOL:
1661                 v.val = sk->sk_protocol;
1662                 break;
1663
1664         case SO_DOMAIN:
1665                 v.val = sk->sk_family;
1666                 break;
1667
1668         case SO_ERROR:
1669                 v.val = -sock_error(sk);
1670                 if (v.val == 0)
1671                         v.val = xchg(&sk->sk_err_soft, 0);
1672                 break;
1673
1674         case SO_OOBINLINE:
1675                 v.val = sock_flag(sk, SOCK_URGINLINE);
1676                 break;
1677
1678         case SO_NO_CHECK:
1679                 v.val = sk->sk_no_check_tx;
1680                 break;
1681
1682         case SO_PRIORITY:
1683                 v.val = READ_ONCE(sk->sk_priority);
1684                 break;
1685
1686         case SO_LINGER:
1687                 lv              = sizeof(v.ling);
1688                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1689                 v.ling.l_linger = READ_ONCE(sk->sk_lingertime) / HZ;
1690                 break;
1691
1692         case SO_BSDCOMPAT:
1693                 break;
1694
1695         case SO_TIMESTAMP_OLD:
1696                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1697                                 !sock_flag(sk, SOCK_TSTAMP_NEW) &&
1698                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1699                 break;
1700
1701         case SO_TIMESTAMPNS_OLD:
1702                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && !sock_flag(sk, SOCK_TSTAMP_NEW);
1703                 break;
1704
1705         case SO_TIMESTAMP_NEW:
1706                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) && sock_flag(sk, SOCK_TSTAMP_NEW);
1707                 break;
1708
1709         case SO_TIMESTAMPNS_NEW:
1710                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS) && sock_flag(sk, SOCK_TSTAMP_NEW);
1711                 break;
1712
1713         case SO_TIMESTAMPING_OLD:
1714         case SO_TIMESTAMPING_NEW:
1715                 lv = sizeof(v.timestamping);
1716                 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1717                  * returning the flags when they were set through the same option.
1718                  * Don't change the behaviour for the old case SO_TIMESTAMPING_OLD.
1719                  */
1720                 if (optname == SO_TIMESTAMPING_OLD || sock_flag(sk, SOCK_TSTAMP_NEW)) {
1721                         v.timestamping.flags = READ_ONCE(sk->sk_tsflags);
1722                         v.timestamping.bind_phc = READ_ONCE(sk->sk_bind_phc);
1723                 }
1724                 break;
1725
1726         case SO_RCVTIMEO_OLD:
1727         case SO_RCVTIMEO_NEW:
1728                 lv = sock_get_timeout(READ_ONCE(sk->sk_rcvtimeo), &v,
1729                                       SO_RCVTIMEO_OLD == optname);
1730                 break;
1731
1732         case SO_SNDTIMEO_OLD:
1733         case SO_SNDTIMEO_NEW:
1734                 lv = sock_get_timeout(READ_ONCE(sk->sk_sndtimeo), &v,
1735                                       SO_SNDTIMEO_OLD == optname);
1736                 break;
1737
1738         case SO_RCVLOWAT:
1739                 v.val = READ_ONCE(sk->sk_rcvlowat);
1740                 break;
1741
1742         case SO_SNDLOWAT:
1743                 v.val = 1;
1744                 break;
1745
1746         case SO_PASSCRED:
1747                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1748                 break;
1749
1750         case SO_PASSPIDFD:
1751                 v.val = !!test_bit(SOCK_PASSPIDFD, &sock->flags);
1752                 break;
1753
1754         case SO_PEERCRED:
1755         {
1756                 struct ucred peercred;
1757                 if (len > sizeof(peercred))
1758                         len = sizeof(peercred);
1759
1760                 spin_lock(&sk->sk_peer_lock);
1761                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1762                 spin_unlock(&sk->sk_peer_lock);
1763
1764                 if (copy_to_sockptr(optval, &peercred, len))
1765                         return -EFAULT;
1766                 goto lenout;
1767         }
1768
1769         case SO_PEERPIDFD:
1770         {
1771                 struct pid *peer_pid;
1772                 struct file *pidfd_file = NULL;
1773                 int pidfd;
1774
1775                 if (len > sizeof(pidfd))
1776                         len = sizeof(pidfd);
1777
1778                 spin_lock(&sk->sk_peer_lock);
1779                 peer_pid = get_pid(sk->sk_peer_pid);
1780                 spin_unlock(&sk->sk_peer_lock);
1781
1782                 if (!peer_pid)
1783                         return -ENODATA;
1784
1785                 pidfd = pidfd_prepare(peer_pid, 0, &pidfd_file);
1786                 put_pid(peer_pid);
1787                 if (pidfd < 0)
1788                         return pidfd;
1789
1790                 if (copy_to_sockptr(optval, &pidfd, len) ||
1791                     copy_to_sockptr(optlen, &len, sizeof(int))) {
1792                         put_unused_fd(pidfd);
1793                         fput(pidfd_file);
1794
1795                         return -EFAULT;
1796                 }
1797
1798                 fd_install(pidfd, pidfd_file);
1799                 return 0;
1800         }
1801
1802         case SO_PEERGROUPS:
1803         {
1804                 const struct cred *cred;
1805                 int ret, n;
1806
1807                 cred = sk_get_peer_cred(sk);
1808                 if (!cred)
1809                         return -ENODATA;
1810
1811                 n = cred->group_info->ngroups;
1812                 if (len < n * sizeof(gid_t)) {
1813                         len = n * sizeof(gid_t);
1814                         put_cred(cred);
1815                         return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE;
1816                 }
1817                 len = n * sizeof(gid_t);
1818
1819                 ret = groups_to_user(optval, cred->group_info);
1820                 put_cred(cred);
1821                 if (ret)
1822                         return ret;
1823                 goto lenout;
1824         }
1825
1826         case SO_PEERNAME:
1827         {
1828                 struct sockaddr_storage address;
1829
1830                 lv = READ_ONCE(sock->ops)->getname(sock, (struct sockaddr *)&address, 2);
1831                 if (lv < 0)
1832                         return -ENOTCONN;
1833                 if (lv < len)
1834                         return -EINVAL;
1835                 if (copy_to_sockptr(optval, &address, len))
1836                         return -EFAULT;
1837                 goto lenout;
1838         }
1839
1840         /* Dubious BSD thing... Probably nobody even uses it, but
1841          * the UNIX standard wants it for whatever reason... -DaveM
1842          */
1843         case SO_ACCEPTCONN:
1844                 v.val = sk->sk_state == TCP_LISTEN;
1845                 break;
1846
1847         case SO_PASSSEC:
1848                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1849                 break;
1850
1851         case SO_PEERSEC:
1852                 return security_socket_getpeersec_stream(sock,
1853                                                          optval, optlen, len);
1854
1855         case SO_MARK:
1856                 v.val = READ_ONCE(sk->sk_mark);
1857                 break;
1858
1859         case SO_RCVMARK:
1860                 v.val = sock_flag(sk, SOCK_RCVMARK);
1861                 break;
1862
1863         case SO_RXQ_OVFL:
1864                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1865                 break;
1866
1867         case SO_WIFI_STATUS:
1868                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1869                 break;
1870
1871         case SO_PEEK_OFF:
1872                 if (!READ_ONCE(sock->ops)->set_peek_off)
1873                         return -EOPNOTSUPP;
1874
1875                 v.val = READ_ONCE(sk->sk_peek_off);
1876                 break;
1877         case SO_NOFCS:
1878                 v.val = sock_flag(sk, SOCK_NOFCS);
1879                 break;
1880
1881         case SO_BINDTODEVICE:
1882                 return sock_getbindtodevice(sk, optval, optlen, len);
1883
1884         case SO_GET_FILTER:
1885                 len = sk_get_filter(sk, optval, len);
1886                 if (len < 0)
1887                         return len;
1888
1889                 goto lenout;
1890
1891         case SO_LOCK_FILTER:
1892                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1893                 break;
1894
1895         case SO_BPF_EXTENSIONS:
1896                 v.val = bpf_tell_extensions();
1897                 break;
1898
1899         case SO_SELECT_ERR_QUEUE:
1900                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1901                 break;
1902
1903 #ifdef CONFIG_NET_RX_BUSY_POLL
1904         case SO_BUSY_POLL:
1905                 v.val = READ_ONCE(sk->sk_ll_usec);
1906                 break;
1907         case SO_PREFER_BUSY_POLL:
1908                 v.val = READ_ONCE(sk->sk_prefer_busy_poll);
1909                 break;
1910 #endif
1911
1912         case SO_MAX_PACING_RATE:
1913                 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1914                 if (sizeof(v.ulval) != sizeof(v.val) && len >= sizeof(v.ulval)) {
1915                         lv = sizeof(v.ulval);
1916                         v.ulval = READ_ONCE(sk->sk_max_pacing_rate);
1917                 } else {
1918                         /* 32bit version */
1919                         v.val = min_t(unsigned long, ~0U,
1920                                       READ_ONCE(sk->sk_max_pacing_rate));
1921                 }
1922                 break;
1923
1924         case SO_INCOMING_CPU:
1925                 v.val = READ_ONCE(sk->sk_incoming_cpu);
1926                 break;
1927
1928         case SO_MEMINFO:
1929         {
1930                 u32 meminfo[SK_MEMINFO_VARS];
1931
1932                 sk_get_meminfo(sk, meminfo);
1933
1934                 len = min_t(unsigned int, len, sizeof(meminfo));
1935                 if (copy_to_sockptr(optval, &meminfo, len))
1936                         return -EFAULT;
1937
1938                 goto lenout;
1939         }
1940
1941 #ifdef CONFIG_NET_RX_BUSY_POLL
1942         case SO_INCOMING_NAPI_ID:
1943                 v.val = READ_ONCE(sk->sk_napi_id);
1944
1945                 /* aggregate non-NAPI IDs down to 0 */
1946                 if (v.val < MIN_NAPI_ID)
1947                         v.val = 0;
1948
1949                 break;
1950 #endif
1951
1952         case SO_COOKIE:
1953                 lv = sizeof(u64);
1954                 if (len < lv)
1955                         return -EINVAL;
1956                 v.val64 = sock_gen_cookie(sk);
1957                 break;
1958
1959         case SO_ZEROCOPY:
1960                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1961                 break;
1962
1963         case SO_TXTIME:
1964                 lv = sizeof(v.txtime);
1965                 v.txtime.clockid = sk->sk_clockid;
1966                 v.txtime.flags |= sk->sk_txtime_deadline_mode ?
1967                                   SOF_TXTIME_DEADLINE_MODE : 0;
1968                 v.txtime.flags |= sk->sk_txtime_report_errors ?
1969                                   SOF_TXTIME_REPORT_ERRORS : 0;
1970                 break;
1971
1972         case SO_BINDTOIFINDEX:
1973                 v.val = READ_ONCE(sk->sk_bound_dev_if);
1974                 break;
1975
1976         case SO_NETNS_COOKIE:
1977                 lv = sizeof(u64);
1978                 if (len != lv)
1979                         return -EINVAL;
1980                 v.val64 = sock_net(sk)->net_cookie;
1981                 break;
1982
1983         case SO_BUF_LOCK:
1984                 v.val = sk->sk_userlocks & SOCK_BUF_LOCK_MASK;
1985                 break;
1986
1987         case SO_RESERVE_MEM:
1988                 v.val = READ_ONCE(sk->sk_reserved_mem);
1989                 break;
1990
1991         case SO_TXREHASH:
1992                 /* Paired with WRITE_ONCE() in sk_setsockopt() */
1993                 v.val = READ_ONCE(sk->sk_txrehash);
1994                 break;
1995
1996         default:
1997                 /* We implement the SO_SNDLOWAT etc to not be settable
1998                  * (1003.1g 7).
1999                  */
2000                 return -ENOPROTOOPT;
2001         }
2002
2003         if (len > lv)
2004                 len = lv;
2005         if (copy_to_sockptr(optval, &v, len))
2006                 return -EFAULT;
2007 lenout:
2008         if (copy_to_sockptr(optlen, &len, sizeof(int)))
2009                 return -EFAULT;
2010         return 0;
2011 }
2012
2013 /*
2014  * Initialize an sk_lock.
2015  *
2016  * (We also register the sk_lock with the lock validator.)
2017  */
2018 static inline void sock_lock_init(struct sock *sk)
2019 {
2020         if (sk->sk_kern_sock)
2021                 sock_lock_init_class_and_name(
2022                         sk,
2023                         af_family_kern_slock_key_strings[sk->sk_family],
2024                         af_family_kern_slock_keys + sk->sk_family,
2025                         af_family_kern_key_strings[sk->sk_family],
2026                         af_family_kern_keys + sk->sk_family);
2027         else
2028                 sock_lock_init_class_and_name(
2029                         sk,
2030                         af_family_slock_key_strings[sk->sk_family],
2031                         af_family_slock_keys + sk->sk_family,
2032                         af_family_key_strings[sk->sk_family],
2033                         af_family_keys + sk->sk_family);
2034 }
2035
2036 /*
2037  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2038  * even temporarly, because of RCU lookups. sk_node should also be left as is.
2039  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2040  */
2041 static void sock_copy(struct sock *nsk, const struct sock *osk)
2042 {
2043         const struct proto *prot = READ_ONCE(osk->sk_prot);
2044 #ifdef CONFIG_SECURITY_NETWORK
2045         void *sptr = nsk->sk_security;
2046 #endif
2047
2048         /* If we move sk_tx_queue_mapping out of the private section,
2049          * we must check if sk_tx_queue_clear() is called after
2050          * sock_copy() in sk_clone_lock().
2051          */
2052         BUILD_BUG_ON(offsetof(struct sock, sk_tx_queue_mapping) <
2053                      offsetof(struct sock, sk_dontcopy_begin) ||
2054                      offsetof(struct sock, sk_tx_queue_mapping) >=
2055                      offsetof(struct sock, sk_dontcopy_end));
2056
2057         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
2058
2059         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
2060                prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
2061
2062 #ifdef CONFIG_SECURITY_NETWORK
2063         nsk->sk_security = sptr;
2064         security_sk_clone(osk, nsk);
2065 #endif
2066 }
2067
2068 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
2069                 int family)
2070 {
2071         struct sock *sk;
2072         struct kmem_cache *slab;
2073
2074         slab = prot->slab;
2075         if (slab != NULL) {
2076                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
2077                 if (!sk)
2078                         return sk;
2079                 if (want_init_on_alloc(priority))
2080                         sk_prot_clear_nulls(sk, prot->obj_size);
2081         } else
2082                 sk = kmalloc(prot->obj_size, priority);
2083
2084         if (sk != NULL) {
2085                 if (security_sk_alloc(sk, family, priority))
2086                         goto out_free;
2087
2088                 if (!try_module_get(prot->owner))
2089                         goto out_free_sec;
2090         }
2091
2092         return sk;
2093
2094 out_free_sec:
2095         security_sk_free(sk);
2096 out_free:
2097         if (slab != NULL)
2098                 kmem_cache_free(slab, sk);
2099         else
2100                 kfree(sk);
2101         return NULL;
2102 }
2103
2104 static void sk_prot_free(struct proto *prot, struct sock *sk)
2105 {
2106         struct kmem_cache *slab;
2107         struct module *owner;
2108
2109         owner = prot->owner;
2110         slab = prot->slab;
2111
2112         cgroup_sk_free(&sk->sk_cgrp_data);
2113         mem_cgroup_sk_free(sk);
2114         security_sk_free(sk);
2115         if (slab != NULL)
2116                 kmem_cache_free(slab, sk);
2117         else
2118                 kfree(sk);
2119         module_put(owner);
2120 }
2121
2122 /**
2123  *      sk_alloc - All socket objects are allocated here
2124  *      @net: the applicable net namespace
2125  *      @family: protocol family
2126  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2127  *      @prot: struct proto associated with this new sock instance
2128  *      @kern: is this to be a kernel socket?
2129  */
2130 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
2131                       struct proto *prot, int kern)
2132 {
2133         struct sock *sk;
2134
2135         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
2136         if (sk) {
2137                 sk->sk_family = family;
2138                 /*
2139                  * See comment in struct sock definition to understand
2140                  * why we need sk_prot_creator -acme
2141                  */
2142                 sk->sk_prot = sk->sk_prot_creator = prot;
2143                 sk->sk_kern_sock = kern;
2144                 sock_lock_init(sk);
2145                 sk->sk_net_refcnt = kern ? 0 : 1;
2146                 if (likely(sk->sk_net_refcnt)) {
2147                         get_net_track(net, &sk->ns_tracker, priority);
2148                         sock_inuse_add(net, 1);
2149                 } else {
2150                         __netns_tracker_alloc(net, &sk->ns_tracker,
2151                                               false, priority);
2152                 }
2153
2154                 sock_net_set(sk, net);
2155                 refcount_set(&sk->sk_wmem_alloc, 1);
2156
2157                 mem_cgroup_sk_alloc(sk);
2158                 cgroup_sk_alloc(&sk->sk_cgrp_data);
2159                 sock_update_classid(&sk->sk_cgrp_data);
2160                 sock_update_netprioidx(&sk->sk_cgrp_data);
2161                 sk_tx_queue_clear(sk);
2162         }
2163
2164         return sk;
2165 }
2166 EXPORT_SYMBOL(sk_alloc);
2167
2168 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2169  * grace period. This is the case for UDP sockets and TCP listeners.
2170  */
2171 static void __sk_destruct(struct rcu_head *head)
2172 {
2173         struct sock *sk = container_of(head, struct sock, sk_rcu);
2174         struct sk_filter *filter;
2175
2176         if (sk->sk_destruct)
2177                 sk->sk_destruct(sk);
2178
2179         filter = rcu_dereference_check(sk->sk_filter,
2180                                        refcount_read(&sk->sk_wmem_alloc) == 0);
2181         if (filter) {
2182                 sk_filter_uncharge(sk, filter);
2183                 RCU_INIT_POINTER(sk->sk_filter, NULL);
2184         }
2185
2186         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
2187
2188 #ifdef CONFIG_BPF_SYSCALL
2189         bpf_sk_storage_free(sk);
2190 #endif
2191
2192         if (atomic_read(&sk->sk_omem_alloc))
2193                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2194                          __func__, atomic_read(&sk->sk_omem_alloc));
2195
2196         if (sk->sk_frag.page) {
2197                 put_page(sk->sk_frag.page);
2198                 sk->sk_frag.page = NULL;
2199         }
2200
2201         /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2202         put_cred(sk->sk_peer_cred);
2203         put_pid(sk->sk_peer_pid);
2204
2205         if (likely(sk->sk_net_refcnt))
2206                 put_net_track(sock_net(sk), &sk->ns_tracker);
2207         else
2208                 __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false);
2209
2210         sk_prot_free(sk->sk_prot_creator, sk);
2211 }
2212
2213 void sk_destruct(struct sock *sk)
2214 {
2215         bool use_call_rcu = sock_flag(sk, SOCK_RCU_FREE);
2216
2217         if (rcu_access_pointer(sk->sk_reuseport_cb)) {
2218                 reuseport_detach_sock(sk);
2219                 use_call_rcu = true;
2220         }
2221
2222         if (use_call_rcu)
2223                 call_rcu(&sk->sk_rcu, __sk_destruct);
2224         else
2225                 __sk_destruct(&sk->sk_rcu);
2226 }
2227
2228 static void __sk_free(struct sock *sk)
2229 {
2230         if (likely(sk->sk_net_refcnt))
2231                 sock_inuse_add(sock_net(sk), -1);
2232
2233         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
2234                 sock_diag_broadcast_destroy(sk);
2235         else
2236                 sk_destruct(sk);
2237 }
2238
2239 void sk_free(struct sock *sk)
2240 {
2241         /*
2242          * We subtract one from sk_wmem_alloc and can know if
2243          * some packets are still in some tx queue.
2244          * If not null, sock_wfree() will call __sk_free(sk) later
2245          */
2246         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
2247                 __sk_free(sk);
2248 }
2249 EXPORT_SYMBOL(sk_free);
2250
2251 static void sk_init_common(struct sock *sk)
2252 {
2253         skb_queue_head_init(&sk->sk_receive_queue);
2254         skb_queue_head_init(&sk->sk_write_queue);
2255         skb_queue_head_init(&sk->sk_error_queue);
2256
2257         rwlock_init(&sk->sk_callback_lock);
2258         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
2259                         af_rlock_keys + sk->sk_family,
2260                         af_family_rlock_key_strings[sk->sk_family]);
2261         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
2262                         af_wlock_keys + sk->sk_family,
2263                         af_family_wlock_key_strings[sk->sk_family]);
2264         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
2265                         af_elock_keys + sk->sk_family,
2266                         af_family_elock_key_strings[sk->sk_family]);
2267         lockdep_set_class_and_name(&sk->sk_callback_lock,
2268                         af_callback_keys + sk->sk_family,
2269                         af_family_clock_key_strings[sk->sk_family]);
2270 }
2271
2272 /**
2273  *      sk_clone_lock - clone a socket, and lock its clone
2274  *      @sk: the socket to clone
2275  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2276  *
2277  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2278  */
2279 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
2280 {
2281         struct proto *prot = READ_ONCE(sk->sk_prot);
2282         struct sk_filter *filter;
2283         bool is_charged = true;
2284         struct sock *newsk;
2285
2286         newsk = sk_prot_alloc(prot, priority, sk->sk_family);
2287         if (!newsk)
2288                 goto out;
2289
2290         sock_copy(newsk, sk);
2291
2292         newsk->sk_prot_creator = prot;
2293
2294         /* SANITY */
2295         if (likely(newsk->sk_net_refcnt)) {
2296                 get_net_track(sock_net(newsk), &newsk->ns_tracker, priority);
2297                 sock_inuse_add(sock_net(newsk), 1);
2298         } else {
2299                 /* Kernel sockets are not elevating the struct net refcount.
2300                  * Instead, use a tracker to more easily detect if a layer
2301                  * is not properly dismantling its kernel sockets at netns
2302                  * destroy time.
2303                  */
2304                 __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker,
2305                                       false, priority);
2306         }
2307         sk_node_init(&newsk->sk_node);
2308         sock_lock_init(newsk);
2309         bh_lock_sock(newsk);
2310         newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
2311         newsk->sk_backlog.len = 0;
2312
2313         atomic_set(&newsk->sk_rmem_alloc, 0);
2314
2315         /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2316         refcount_set(&newsk->sk_wmem_alloc, 1);
2317
2318         atomic_set(&newsk->sk_omem_alloc, 0);
2319         sk_init_common(newsk);
2320
2321         newsk->sk_dst_cache     = NULL;
2322         newsk->sk_dst_pending_confirm = 0;
2323         newsk->sk_wmem_queued   = 0;
2324         newsk->sk_forward_alloc = 0;
2325         newsk->sk_reserved_mem  = 0;
2326         atomic_set(&newsk->sk_drops, 0);
2327         newsk->sk_send_head     = NULL;
2328         newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
2329         atomic_set(&newsk->sk_zckey, 0);
2330
2331         sock_reset_flag(newsk, SOCK_DONE);
2332
2333         /* sk->sk_memcg will be populated at accept() time */
2334         newsk->sk_memcg = NULL;
2335
2336         cgroup_sk_clone(&newsk->sk_cgrp_data);
2337
2338         rcu_read_lock();
2339         filter = rcu_dereference(sk->sk_filter);
2340         if (filter != NULL)
2341                 /* though it's an empty new sock, the charging may fail
2342                  * if sysctl_optmem_max was changed between creation of
2343                  * original socket and cloning
2344                  */
2345                 is_charged = sk_filter_charge(newsk, filter);
2346         RCU_INIT_POINTER(newsk->sk_filter, filter);
2347         rcu_read_unlock();
2348
2349         if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
2350                 /* We need to make sure that we don't uncharge the new
2351                  * socket if we couldn't charge it in the first place
2352                  * as otherwise we uncharge the parent's filter.
2353                  */
2354                 if (!is_charged)
2355                         RCU_INIT_POINTER(newsk->sk_filter, NULL);
2356                 sk_free_unlock_clone(newsk);
2357                 newsk = NULL;
2358                 goto out;
2359         }
2360         RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
2361
2362         if (bpf_sk_storage_clone(sk, newsk)) {
2363                 sk_free_unlock_clone(newsk);
2364                 newsk = NULL;
2365                 goto out;
2366         }
2367
2368         /* Clear sk_user_data if parent had the pointer tagged
2369          * as not suitable for copying when cloning.
2370          */
2371         if (sk_user_data_is_nocopy(newsk))
2372                 newsk->sk_user_data = NULL;
2373
2374         newsk->sk_err      = 0;
2375         newsk->sk_err_soft = 0;
2376         newsk->sk_priority = 0;
2377         newsk->sk_incoming_cpu = raw_smp_processor_id();
2378
2379         /* Before updating sk_refcnt, we must commit prior changes to memory
2380          * (Documentation/RCU/rculist_nulls.rst for details)
2381          */
2382         smp_wmb();
2383         refcount_set(&newsk->sk_refcnt, 2);
2384
2385         sk_set_socket(newsk, NULL);
2386         sk_tx_queue_clear(newsk);
2387         RCU_INIT_POINTER(newsk->sk_wq, NULL);
2388
2389         if (newsk->sk_prot->sockets_allocated)
2390                 sk_sockets_allocated_inc(newsk);
2391
2392         if (sock_needs_netstamp(sk) && newsk->sk_flags & SK_FLAGS_TIMESTAMP)
2393                 net_enable_timestamp();
2394 out:
2395         return newsk;
2396 }
2397 EXPORT_SYMBOL_GPL(sk_clone_lock);
2398
2399 void sk_free_unlock_clone(struct sock *sk)
2400 {
2401         /* It is still raw copy of parent, so invalidate
2402          * destructor and make plain sk_free() */
2403         sk->sk_destruct = NULL;
2404         bh_unlock_sock(sk);
2405         sk_free(sk);
2406 }
2407 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
2408
2409 static u32 sk_dst_gso_max_size(struct sock *sk, struct dst_entry *dst)
2410 {
2411         bool is_ipv6 = false;
2412         u32 max_size;
2413
2414 #if IS_ENABLED(CONFIG_IPV6)
2415         is_ipv6 = (sk->sk_family == AF_INET6 &&
2416                    !ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr));
2417 #endif
2418         /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2419         max_size = is_ipv6 ? READ_ONCE(dst->dev->gso_max_size) :
2420                         READ_ONCE(dst->dev->gso_ipv4_max_size);
2421         if (max_size > GSO_LEGACY_MAX_SIZE && !sk_is_tcp(sk))
2422                 max_size = GSO_LEGACY_MAX_SIZE;
2423
2424         return max_size - (MAX_TCP_HEADER + 1);
2425 }
2426
2427 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
2428 {
2429         u32 max_segs = 1;
2430
2431         sk->sk_route_caps = dst->dev->features;
2432         if (sk_is_tcp(sk))
2433                 sk->sk_route_caps |= NETIF_F_GSO;
2434         if (sk->sk_route_caps & NETIF_F_GSO)
2435                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
2436         if (unlikely(sk->sk_gso_disabled))
2437                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2438         if (sk_can_gso(sk)) {
2439                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
2440                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
2441                 } else {
2442                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
2443                         sk->sk_gso_max_size = sk_dst_gso_max_size(sk, dst);
2444                         /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2445                         max_segs = max_t(u32, READ_ONCE(dst->dev->gso_max_segs), 1);
2446                 }
2447         }
2448         sk->sk_gso_max_segs = max_segs;
2449         sk_dst_set(sk, dst);
2450 }
2451 EXPORT_SYMBOL_GPL(sk_setup_caps);
2452
2453 /*
2454  *      Simple resource managers for sockets.
2455  */
2456
2457
2458 /*
2459  * Write buffer destructor automatically called from kfree_skb.
2460  */
2461 void sock_wfree(struct sk_buff *skb)
2462 {
2463         struct sock *sk = skb->sk;
2464         unsigned int len = skb->truesize;
2465         bool free;
2466
2467         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
2468                 if (sock_flag(sk, SOCK_RCU_FREE) &&
2469                     sk->sk_write_space == sock_def_write_space) {
2470                         rcu_read_lock();
2471                         free = refcount_sub_and_test(len, &sk->sk_wmem_alloc);
2472                         sock_def_write_space_wfree(sk);
2473                         rcu_read_unlock();
2474                         if (unlikely(free))
2475                                 __sk_free(sk);
2476                         return;
2477                 }
2478
2479                 /*
2480                  * Keep a reference on sk_wmem_alloc, this will be released
2481                  * after sk_write_space() call
2482                  */
2483                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
2484                 sk->sk_write_space(sk);
2485                 len = 1;
2486         }
2487         /*
2488          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2489          * could not do because of in-flight packets
2490          */
2491         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
2492                 __sk_free(sk);
2493 }
2494 EXPORT_SYMBOL(sock_wfree);
2495
2496 /* This variant of sock_wfree() is used by TCP,
2497  * since it sets SOCK_USE_WRITE_QUEUE.
2498  */
2499 void __sock_wfree(struct sk_buff *skb)
2500 {
2501         struct sock *sk = skb->sk;
2502
2503         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
2504                 __sk_free(sk);
2505 }
2506
2507 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
2508 {
2509         skb_orphan(skb);
2510         skb->sk = sk;
2511 #ifdef CONFIG_INET
2512         if (unlikely(!sk_fullsock(sk))) {
2513                 skb->destructor = sock_edemux;
2514                 sock_hold(sk);
2515                 return;
2516         }
2517 #endif
2518         skb->destructor = sock_wfree;
2519         skb_set_hash_from_sk(skb, sk);
2520         /*
2521          * We used to take a refcount on sk, but following operation
2522          * is enough to guarantee sk_free() wont free this sock until
2523          * all in-flight packets are completed
2524          */
2525         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
2526 }
2527 EXPORT_SYMBOL(skb_set_owner_w);
2528
2529 static bool can_skb_orphan_partial(const struct sk_buff *skb)
2530 {
2531 #ifdef CONFIG_TLS_DEVICE
2532         /* Drivers depend on in-order delivery for crypto offload,
2533          * partial orphan breaks out-of-order-OK logic.
2534          */
2535         if (skb->decrypted)
2536                 return false;
2537 #endif
2538         return (skb->destructor == sock_wfree ||
2539                 (IS_ENABLED(CONFIG_INET) && skb->destructor == tcp_wfree));
2540 }
2541
2542 /* This helper is used by netem, as it can hold packets in its
2543  * delay queue. We want to allow the owner socket to send more
2544  * packets, as if they were already TX completed by a typical driver.
2545  * But we also want to keep skb->sk set because some packet schedulers
2546  * rely on it (sch_fq for example).
2547  */
2548 void skb_orphan_partial(struct sk_buff *skb)
2549 {
2550         if (skb_is_tcp_pure_ack(skb))
2551                 return;
2552
2553         if (can_skb_orphan_partial(skb) && skb_set_owner_sk_safe(skb, skb->sk))
2554                 return;
2555
2556         skb_orphan(skb);
2557 }
2558 EXPORT_SYMBOL(skb_orphan_partial);
2559
2560 /*
2561  * Read buffer destructor automatically called from kfree_skb.
2562  */
2563 void sock_rfree(struct sk_buff *skb)
2564 {
2565         struct sock *sk = skb->sk;
2566         unsigned int len = skb->truesize;
2567
2568         atomic_sub(len, &sk->sk_rmem_alloc);
2569         sk_mem_uncharge(sk, len);
2570 }
2571 EXPORT_SYMBOL(sock_rfree);
2572
2573 /*
2574  * Buffer destructor for skbs that are not used directly in read or write
2575  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2576  */
2577 void sock_efree(struct sk_buff *skb)
2578 {
2579         sock_put(skb->sk);
2580 }
2581 EXPORT_SYMBOL(sock_efree);
2582
2583 /* Buffer destructor for prefetch/receive path where reference count may
2584  * not be held, e.g. for listen sockets.
2585  */
2586 #ifdef CONFIG_INET
2587 void sock_pfree(struct sk_buff *skb)
2588 {
2589         if (sk_is_refcounted(skb->sk))
2590                 sock_gen_put(skb->sk);
2591 }
2592 EXPORT_SYMBOL(sock_pfree);
2593 #endif /* CONFIG_INET */
2594
2595 kuid_t sock_i_uid(struct sock *sk)
2596 {
2597         kuid_t uid;
2598
2599         read_lock_bh(&sk->sk_callback_lock);
2600         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
2601         read_unlock_bh(&sk->sk_callback_lock);
2602         return uid;
2603 }
2604 EXPORT_SYMBOL(sock_i_uid);
2605
2606 unsigned long __sock_i_ino(struct sock *sk)
2607 {
2608         unsigned long ino;
2609
2610         read_lock(&sk->sk_callback_lock);
2611         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
2612         read_unlock(&sk->sk_callback_lock);
2613         return ino;
2614 }
2615 EXPORT_SYMBOL(__sock_i_ino);
2616
/* Wrapper around __sock_i_ino() that disables bottom halves for the
 * duration of the lookup.
 */
unsigned long sock_i_ino(struct sock *sk)
{
        unsigned long inode_nr;

        local_bh_disable();
        inode_nr = __sock_i_ino(sk);
        local_bh_enable();

        return inode_nr;
}
2626 EXPORT_SYMBOL(sock_i_ino);
2627
2628 /*
2629  * Allocate a skb from the socket's send buffer.
2630  */
2631 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
2632                              gfp_t priority)
2633 {
2634         if (force ||
2635             refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf)) {
2636                 struct sk_buff *skb = alloc_skb(size, priority);
2637
2638                 if (skb) {
2639                         skb_set_owner_w(skb, sk);
2640                         return skb;
2641                 }
2642         }
2643         return NULL;
2644 }
2645 EXPORT_SYMBOL(sock_wmalloc);
2646
2647 static void sock_ofree(struct sk_buff *skb)
2648 {
2649         struct sock *sk = skb->sk;
2650
2651         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
2652 }
2653
2654 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
2655                              gfp_t priority)
2656 {
2657         struct sk_buff *skb;
2658
2659         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2660         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
2661             READ_ONCE(sysctl_optmem_max))
2662                 return NULL;
2663
2664         skb = alloc_skb(size, priority);
2665         if (!skb)
2666                 return NULL;
2667
2668         atomic_add(skb->truesize, &sk->sk_omem_alloc);
2669         skb->sk = sk;
2670         skb->destructor = sock_ofree;
2671         return skb;
2672 }
2673
2674 /*
2675  * Allocate a memory block from the socket's option memory buffer.
2676  */
2677 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2678 {
2679         int optmem_max = READ_ONCE(sysctl_optmem_max);
2680
2681         if ((unsigned int)size <= optmem_max &&
2682             atomic_read(&sk->sk_omem_alloc) + size < optmem_max) {
2683                 void *mem;
2684                 /* First do the add, to avoid the race if kmalloc
2685                  * might sleep.
2686                  */
2687                 atomic_add(size, &sk->sk_omem_alloc);
2688                 mem = kmalloc(size, priority);
2689                 if (mem)
2690                         return mem;
2691                 atomic_sub(size, &sk->sk_omem_alloc);
2692         }
2693         return NULL;
2694 }
2695 EXPORT_SYMBOL(sock_kmalloc);
2696
2697 /* Free an option memory block. Note, we actually want the inline
2698  * here as this allows gcc to detect the nullify and fold away the
2699  * condition entirely.
2700  */
2701 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2702                                   const bool nullify)
2703 {
2704         if (WARN_ON_ONCE(!mem))
2705                 return;
2706         if (nullify)
2707                 kfree_sensitive(mem);
2708         else
2709                 kfree(mem);
2710         atomic_sub(size, &sk->sk_omem_alloc);
2711 }
2712
/* Free an option memory block of @size bytes with kfree() and uncharge it
 * from @sk's sk_omem_alloc.
 */
void sock_kfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, false);
}
2717 EXPORT_SYMBOL(sock_kfree_s);
2718
/* Same as sock_kfree_s(), but the block is released with
 * kfree_sensitive() instead of kfree().
 */
void sock_kzfree_s(struct sock *sk, void *mem, int size)
{
        __sock_kfree_s(sk, mem, size, true);
}
2723 EXPORT_SYMBOL(sock_kzfree_s);
2724
/* This is almost wait_for_tcp_memory minus release_sock/lock_sock.
 * I think these locks should be removed for datagram sockets.
 *
 * Sleep interruptibly until one of the following holds: the timeout is
 * used up, a signal is pending, sk_wmem_alloc drops below sk_sndbuf,
 * SEND_SHUTDOWN is set, or sk_err is set.
 *
 * Returns the remaining timeout.
 */
static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
        DEFINE_WAIT(wait);

        sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
        for (;;) {
                if (!timeo)
                        break;
                if (signal_pending(current))
                        break;
                /* Flag the lack of space before queueing on the wait queue. */
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
                /* Re-check every wake-up condition after being queued. */
                if (refcount_read(&sk->sk_wmem_alloc) < READ_ONCE(sk->sk_sndbuf))
                        break;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        break;
                if (READ_ONCE(sk->sk_err))
                        break;
                timeo = schedule_timeout(timeo);
        }
        finish_wait(sk_sleep(sk), &wait);
        return timeo;
}
2751
2752
2753 /*
2754  *      Generic send/receive buffer handlers
2755  */
2756
/*
 * Allocate an skb charged to @sk's send buffer, waiting for room if needed.
 *
 * @header_len, @data_len and @max_page_order are handed unchanged to
 * alloc_skb_with_frags(). The wait budget comes from
 * sock_sndtimeo(sk, noblock).
 *
 * Returns the new skb, owned by @sk through skb_set_owner_w(), or NULL.
 * On the failure paths taken before allocation, *@errcode is set to the
 * pending socket error, -EPIPE after SEND_SHUTDOWN, -EAGAIN when the wait
 * budget is zero, or sock_intr_errno() when a signal is pending. @errcode
 * is also passed to alloc_skb_with_frags(), which presumably fills it in
 * when the allocation itself fails.
 */
struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
                                     unsigned long data_len, int noblock,
                                     int *errcode, int max_page_order)
{
        struct sk_buff *skb;
        long timeo;
        int err;

        timeo = sock_sndtimeo(sk, noblock);
        for (;;) {
                err = sock_error(sk);
                if (err != 0)
                        goto failure;

                err = -EPIPE;
                if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN)
                        goto failure;

                /* Room in the send buffer: go allocate. */
                if (sk_wmem_alloc_get(sk) < READ_ONCE(sk->sk_sndbuf))
                        break;

                sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
                set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
                err = -EAGAIN;
                if (!timeo)
                        goto failure;
                if (signal_pending(current))
                        goto interrupted;
                timeo = sock_wait_for_wmem(sk, timeo);
        }
        skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
                                   errcode, sk->sk_allocation);
        if (skb)
                skb_set_owner_w(skb, sk);
        return skb;

interrupted:
        err = sock_intr_errno(timeo);
failure:
        *errcode = err;
        return NULL;
}
2799 EXPORT_SYMBOL(sock_alloc_send_pskb);
2800
/*
 * Apply one control message to the send cookie @sockc.
 *
 * Handled types: SO_MARK, SO_TIMESTAMPING_OLD/NEW and SCM_TXTIME.
 * SCM_RIGHTS and SCM_CREDENTIALS are accepted and ignored here.
 *
 * Returns 0 on success, -EPERM when SO_MARK is used without CAP_NET_RAW or
 * CAP_NET_ADMIN in the socket's user namespace, and -EINVAL for a wrong
 * payload length, timestamping flags outside
 * SOF_TIMESTAMPING_TX_RECORD_MASK, SCM_TXTIME on a socket without
 * SOCK_TXTIME, or an unknown type.
 */
int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg,
                     struct sockcm_cookie *sockc)
{
        u32 tsflags;

        switch (cmsg->cmsg_type) {
        case SO_MARK:
                if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) &&
                    !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
                        return -EPERM;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;
                sockc->mark = *(u32 *)CMSG_DATA(cmsg);
                break;
        case SO_TIMESTAMPING_OLD:
        case SO_TIMESTAMPING_NEW:
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
                        return -EINVAL;

                tsflags = *(u32 *)CMSG_DATA(cmsg);
                if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
                        return -EINVAL;

                /* Replace only the TX record bits, keep the other flags. */
                sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
                sockc->tsflags |= tsflags;
                break;
        case SCM_TXTIME:
                if (!sock_flag(sk, SOCK_TXTIME))
                        return -EINVAL;
                if (cmsg->cmsg_len != CMSG_LEN(sizeof(u64)))
                        return -EINVAL;
                sockc->transmit_time = get_unaligned((u64 *)CMSG_DATA(cmsg));
                break;
        /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
        case SCM_RIGHTS:
        case SCM_CREDENTIALS:
                break;
        default:
                return -EINVAL;
        }
        return 0;
}
2843 EXPORT_SYMBOL(__sock_cmsg_send);
2844
2845 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2846                    struct sockcm_cookie *sockc)
2847 {
2848         struct cmsghdr *cmsg;
2849         int ret;
2850
2851         for_each_cmsghdr(cmsg, msg) {
2852                 if (!CMSG_OK(msg, cmsg))
2853                         return -EINVAL;
2854                 if (cmsg->cmsg_level != SOL_SOCKET)
2855                         continue;
2856                 ret = __sock_cmsg_send(sk, cmsg, sockc);
2857                 if (ret)
2858                         return ret;
2859         }
2860         return 0;
2861 }
2862 EXPORT_SYMBOL(sock_cmsg_send);
2863
2864 static void sk_enter_memory_pressure(struct sock *sk)
2865 {
2866         if (!sk->sk_prot->enter_memory_pressure)
2867                 return;
2868
2869         sk->sk_prot->enter_memory_pressure(sk);
2870 }
2871
2872 static void sk_leave_memory_pressure(struct sock *sk)
2873 {
2874         if (sk->sk_prot->leave_memory_pressure) {
2875                 INDIRECT_CALL_INET_1(sk->sk_prot->leave_memory_pressure,
2876                                      tcp_leave_memory_pressure, sk);
2877         } else {
2878                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2879
2880                 if (memory_pressure && READ_ONCE(*memory_pressure))
2881                         WRITE_ONCE(*memory_pressure, 0);
2882         }
2883 }
2884
2885 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2886
/**
 * skb_page_frag_refill - check that a page_frag contains enough room
 * @sz: minimum size of the fragment we want to get
 * @pfrag: pointer to page_frag
 * @gfp: priority for memory allocation
 *
 * Note: While this allocator tries to use high order pages, there is
 * no guarantee that allocations succeed. Therefore, @sz MUST be
 * less than or equal to PAGE_SIZE.
 *
 * Return: true if @pfrag holds a usable page on return (the current one
 * or a freshly allocated one), false if no page could be allocated.
 */
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
        if (pfrag->page) {
                /* We hold the only reference: reuse the page from the start. */
                if (page_ref_count(pfrag->page) == 1) {
                        pfrag->offset = 0;
                        return true;
                }
                /* Still enough room behind the current offset. */
                if (pfrag->offset + sz <= pfrag->size)
                        return true;
                put_page(pfrag->page);
        }

        pfrag->offset = 0;
        if (SKB_FRAG_PAGE_ORDER &&
            !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
                /* Avoid direct reclaim but allow kswapd to wake */
                pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
                                          __GFP_COMP | __GFP_NOWARN |
                                          __GFP_NORETRY,
                                          SKB_FRAG_PAGE_ORDER);
                if (likely(pfrag->page)) {
                        pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
                        return true;
                }
        }
        /* Fall back to a single page with the caller's gfp flags. */
        pfrag->page = alloc_page(gfp);
        if (likely(pfrag->page)) {
                pfrag->size = PAGE_SIZE;
                return true;
        }
        return false;
}
2929 EXPORT_SYMBOL(skb_page_frag_refill);
2930
2931 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2932 {
2933         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2934                 return true;
2935
2936         sk_enter_memory_pressure(sk);
2937         sk_stream_moderate_sndbuf(sk);
2938         return false;
2939 }
2940 EXPORT_SYMBOL(sk_page_frag_refill);
2941
2942 void __lock_sock(struct sock *sk)
2943         __releases(&sk->sk_lock.slock)
2944         __acquires(&sk->sk_lock.slock)
2945 {
2946         DEFINE_WAIT(wait);
2947
2948         for (;;) {
2949                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2950                                         TASK_UNINTERRUPTIBLE);
2951                 spin_unlock_bh(&sk->sk_lock.slock);
2952                 schedule();
2953                 spin_lock_bh(&sk->sk_lock.slock);
2954                 if (!sock_owned_by_user(sk))
2955                         break;
2956         }
2957         finish_wait(&sk->sk_lock.wq, &wait);
2958 }
2959
/*
 * Process every skb queued on the socket backlog.
 *
 * The backlog list is detached while sk_lock.slock is held, then the lock
 * is dropped while each skb is passed to sk_backlog_rcv(). The outer loop
 * repeats because new skbs may have been queued in the meantime.
 */
void __release_sock(struct sock *sk)
        __releases(&sk->sk_lock.slock)
        __acquires(&sk->sk_lock.slock)
{
        struct sk_buff *skb, *next;

        while ((skb = sk->sk_backlog.head) != NULL) {
                /* Detach the whole list so it can be walked unlocked. */
                sk->sk_backlog.head = sk->sk_backlog.tail = NULL;

                spin_unlock_bh(&sk->sk_lock.slock);

                do {
                        next = skb->next;
                        prefetch(next);
                        DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb));
                        skb_mark_not_on_list(skb);
                        sk_backlog_rcv(sk, skb);

                        cond_resched();

                        skb = next;
                } while (skb != NULL);

                spin_lock_bh(&sk->sk_lock.slock);
        }

        /*
         * Doing the zeroing here guarantees we cannot loop forever
         * while a wild producer attempts to flood us.
         */
        sk->sk_backlog.len = 0;
}
2992
/* Drain the socket backlog under sk_lock.slock and then run the
 * protocol's release_cb hook, if it has one.
 */
void __sk_flush_backlog(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        spin_unlock_bh(&sk->sk_lock.slock);
}
3004 EXPORT_SYMBOL_GPL(__sk_flush_backlog);
3005
/**
 * sk_wait_data - wait for data to arrive at sk_receive_queue
 * @sk:    sock to wait on
 * @timeo: for how long
 * @skb:   last skb seen on sk_receive_queue
 *
 * Now socket state including sk->sk_err is changed only under lock,
 * hence we may omit checks after joining wait queue.
 * We check receive queue before schedule() only as optimization;
 * it is very likely that release_sock() added new data.
 *
 * The wait condition is that the tail of sk_receive_queue differs from
 * @skb. SOCKWQ_ASYNC_WAITDATA is set for the duration of the wait.
 *
 * Return: the result of sk_wait_event().
 */
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
{
        DEFINE_WAIT_FUNC(wait, woken_wake_function);
        int rc;

        add_wait_queue(sk_sleep(sk), &wait);
        sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
        sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
        remove_wait_queue(sk_sleep(sk), &wait);
        return rc;
}
3029 EXPORT_SYMBOL(sk_wait_data);
3030
/**
 *      __sk_mem_raise_allocated - increase memory_allocated
 *      @sk: socket
 *      @size: memory size to allocate
 *      @amt: pages to allocate
 *      @kind: allocation type
 *
 *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
 *
 *      Unlike the globally shared limits among the sockets under same protocol,
 *      consuming the budget of a memcg won't have direct effect on other ones.
 *      So be optimistic about memcg's tolerance, and leave the callers to decide
 *      whether or not to raise allocated through sk_under_memory_pressure() or
 *      its variants.
 *
 *      Return: 1 if the allocation is accepted, 0 if it is suppressed. On
 *      suppression the @amt pages added to memory_allocated, and any memcg
 *      charge taken here, are rolled back before returning.
 */
int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
{
        struct mem_cgroup *memcg = mem_cgroup_sockets_enabled ? sk->sk_memcg : NULL;
        struct proto *prot = sk->sk_prot;
        bool charged = false;
        long allocated;

        /* Account optimistically; undone at the end if suppressed. */
        sk_memory_allocated_add(sk, amt);
        allocated = sk_memory_allocated(sk);

        if (memcg) {
                if (!mem_cgroup_charge_skmem(memcg, amt, gfp_memcg_charge()))
                        goto suppress_allocation;
                charged = true;
        }

        /* Under limit. */
        if (allocated <= sk_prot_mem_limits(sk, 0)) {
                sk_leave_memory_pressure(sk);
                return 1;
        }

        /* Under pressure. */
        if (allocated > sk_prot_mem_limits(sk, 1))
                sk_enter_memory_pressure(sk);

        /* Over hard limit. */
        if (allocated > sk_prot_mem_limits(sk, 2))
                goto suppress_allocation;

        /* Guarantee minimum buffer size under pressure (either global
         * or memcg) to make sure features described in RFC 7323 (TCP
         * Extensions for High Performance) work properly.
         *
         * This rule does NOT stand when exceeds global or memcg's hard
         * limit, or else a DoS attack can be taken place by spawning
         * lots of sockets whose usage are under minimum buffer size.
         */
        if (kind == SK_MEM_RECV) {
                if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
                        return 1;

        } else { /* SK_MEM_SEND */
                int wmem0 = sk_get_wmem0(sk, prot);

                if (sk->sk_type == SOCK_STREAM) {
                        if (sk->sk_wmem_queued < wmem0)
                                return 1;
                } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
                                return 1;
                }
        }

        if (sk_has_memory_pressure(sk)) {
                u64 alloc;

                /* The following 'average' heuristic is within the
                 * scope of global accounting, so it only makes
                 * sense for global memory pressure.
                 */
                if (!sk_under_global_memory_pressure(sk))
                        return 1;

                /* Try to be fair among all the sockets under global
                 * pressure by allowing the ones that below average
                 * usage to raise.
                 */
                alloc = sk_sockets_allocated_read_positive(sk);
                if (sk_prot_mem_limits(sk, 2) > alloc *
                    sk_mem_pages(sk->sk_wmem_queued +
                                 atomic_read(&sk->sk_rmem_alloc) +
                                 sk->sk_forward_alloc))
                        return 1;
        }

suppress_allocation:

        if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
                sk_stream_moderate_sndbuf(sk);

                /* Fail only if socket is _under_ its sndbuf.
                 * In this case we cannot block, so that we have to fail.
                 */
                if (sk->sk_wmem_queued + size >= sk->sk_sndbuf) {
                        /* Force charge with __GFP_NOFAIL */
                        if (memcg && !charged) {
                                mem_cgroup_charge_skmem(memcg, amt,
                                        gfp_memcg_charge() | __GFP_NOFAIL);
                        }
                        return 1;
                }
        }

        if (kind == SK_MEM_SEND || (kind == SK_MEM_RECV && charged))
                trace_sock_exceed_buf_limit(sk, prot, allocated, kind);

        /* Roll back the optimistic accounting done on entry. */
        sk_memory_allocated_sub(sk, amt);

        if (charged)
                mem_cgroup_uncharge_skmem(memcg, amt);

        return 0;
}
3149
3150 /**
3151  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3152  *      @sk: socket
3153  *      @size: memory size to allocate
3154  *      @kind: allocation type
3155  *
3156  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3157  *      rmem allocation. This function assumes that protocols which have
3158  *      memory_pressure use sk_wmem_queued as write buffer accounting.
3159  */
3160 int __sk_mem_schedule(struct sock *sk, int size, int kind)
3161 {
3162         int ret, amt = sk_mem_pages(size);
3163
3164         sk_forward_alloc_add(sk, amt << PAGE_SHIFT);
3165         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
3166         if (!ret)
3167                 sk_forward_alloc_add(sk, -(amt << PAGE_SHIFT));
3168         return ret;
3169 }
3170 EXPORT_SYMBOL(__sk_mem_schedule);
3171
3172 /**
3173  *      __sk_mem_reduce_allocated - reclaim memory_allocated
3174  *      @sk: socket
3175  *      @amount: number of quanta
3176  *
3177  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3178  */
3179 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
3180 {
3181         sk_memory_allocated_sub(sk, amount);
3182
3183         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
3184                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
3185
3186         if (sk_under_global_memory_pressure(sk) &&
3187             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
3188                 sk_leave_memory_pressure(sk);
3189 }
3190
3191 /**
3192  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3193  *      @sk: socket
3194  *      @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3195  */
3196 void __sk_mem_reclaim(struct sock *sk, int amount)
3197 {
3198         amount >>= PAGE_SHIFT;
3199         sk_forward_alloc_add(sk, -(amount << PAGE_SHIFT));
3200         __sk_mem_reduce_allocated(sk, amount);
3201 }
3202 EXPORT_SYMBOL(__sk_mem_reclaim);
3203
/* Store @val in sk->sk_peek_off with WRITE_ONCE(). Always returns 0. */
int sk_set_peek_off(struct sock *sk, int val)
{
        WRITE_ONCE(sk->sk_peek_off, val);
        return 0;
}
3209 EXPORT_SYMBOL_GPL(sk_set_peek_off);
3210
3211 /*
3212  * Set of default routines for initialising struct proto_ops when
3213  * the protocol does not support a particular function. In certain
3214  * cases where it makes no sense for a protocol to have a "do nothing"
3215  * function, some default processing is provided.
3216  */
3217
/* bind() stub for protocols without bind support: always -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
        return -EOPNOTSUPP;
}
3222 EXPORT_SYMBOL(sock_no_bind);
3223
/* connect() stub for protocols without connect support: always
 * -EOPNOTSUPP.
 */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
{
        return -EOPNOTSUPP;
}
3229 EXPORT_SYMBOL(sock_no_connect);
3230
/* socketpair() stub: always -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
        return -EOPNOTSUPP;
}
3235 EXPORT_SYMBOL(sock_no_socketpair);
3236
/* accept() stub: always -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags,
                   bool kern)
{
        return -EOPNOTSUPP;
}
3242 EXPORT_SYMBOL(sock_no_accept);
3243
/* getname stub: always -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int peer)
{
        return -EOPNOTSUPP;
}
3249 EXPORT_SYMBOL(sock_no_getname);
3250
/* ioctl() stub: always -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
        return -EOPNOTSUPP;
}
3255 EXPORT_SYMBOL(sock_no_ioctl);
3256
/* listen() stub: always -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
        return -EOPNOTSUPP;
}
3261 EXPORT_SYMBOL(sock_no_listen);
3262
/* shutdown() stub: always -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
        return -EOPNOTSUPP;
}
3267 EXPORT_SYMBOL(sock_no_shutdown);
3268
/* sendmsg() stub: always -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3273 EXPORT_SYMBOL(sock_no_sendmsg);
3274
/* sendmsg_locked stub, taking a struct sock rather than a struct socket:
 * always -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
        return -EOPNOTSUPP;
}
3279 EXPORT_SYMBOL(sock_no_sendmsg_locked);
3280
/* recvmsg() stub: always -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m, size_t len,
                    int flags)
{
        return -EOPNOTSUPP;
}
3286 EXPORT_SYMBOL(sock_no_recvmsg);
3287
/* mmap() stub. Unlike the other sock_no_*() stubs this returns -ENODEV,
 * not -EOPNOTSUPP.
 */
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
        /* Mirror missing mmap method error code */
        return -ENODEV;
}
3293 EXPORT_SYMBOL(sock_no_mmap);
3294
3295 /*
3296  * When a file is received (via SCM_RIGHTS, etc), we must bump the
3297  * various sock-based usage counts.
3298  */
3299 void __receive_sock(struct file *file)
3300 {
3301         struct socket *sock;
3302
3303         sock = sock_from_file(file);
3304         if (sock) {
3305                 sock_update_netprioidx(&sock->sk->sk_cgrp_data);
3306                 sock_update_classid(&sock->sk->sk_cgrp_data);
3307         }
3308 }
3309
3310 /*
3311  *      Default Socket Callbacks
3312  */
3313
3314 static void sock_def_wakeup(struct sock *sk)
3315 {
3316         struct socket_wq *wq;
3317
3318         rcu_read_lock();
3319         wq = rcu_dereference(sk->sk_wq);
3320         if (skwq_has_sleeper(wq))
3321                 wake_up_interruptible_all(&wq->wait);
3322         rcu_read_unlock();
3323 }
3324
3325 static void sock_def_error_report(struct sock *sk)
3326 {
3327         struct socket_wq *wq;
3328
3329         rcu_read_lock();
3330         wq = rcu_dereference(sk->sk_wq);
3331         if (skwq_has_sleeper(wq))
3332                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
3333         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
3334         rcu_read_unlock();
3335 }
3336
3337 void sock_def_readable(struct sock *sk)
3338 {
3339         struct socket_wq *wq;
3340
3341         trace_sk_data_ready(sk);
3342
3343         rcu_read_lock();
3344         wq = rcu_dereference(sk->sk_wq);
3345         if (skwq_has_sleeper(wq))
3346                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
3347                                                 EPOLLRDNORM | EPOLLRDBAND);
3348         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
3349         rcu_read_unlock();
3350 }
3351
3352 static void sock_def_write_space(struct sock *sk)
3353 {
3354         struct socket_wq *wq;
3355
3356         rcu_read_lock();
3357
3358         /* Do not wake up a writer until he can make "significant"
3359          * progress.  --DaveM
3360          */
3361         if (sock_writeable(sk)) {
3362                 wq = rcu_dereference(sk->sk_wq);
3363                 if (skwq_has_sleeper(wq))
3364                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
3365                                                 EPOLLWRNORM | EPOLLWRBAND);
3366
3367                 /* Should agree with poll, otherwise some programs break */
3368                 sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
3369         }
3370
3371         rcu_read_unlock();
3372 }
3373
/* An optimised version of sock_def_write_space(), should only be called
 * for SOCK_RCU_FREE sockets under RCU read section and after putting
 * ->sk_wmem_alloc.
 *
 * Unlike sock_def_write_space() it takes no RCU read lock of its own and
 * tests the wait queue with waitqueue_active() behind an explicit
 * smp_mb__after_atomic().
 */
static void sock_def_write_space_wfree(struct sock *sk)
{
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
        if (sock_writeable(sk)) {
                struct socket_wq *wq = rcu_dereference(sk->sk_wq);

                /* rely on refcount_sub from sock_wfree() */
                smp_mb__after_atomic();
                if (wq && waitqueue_active(&wq->wait))
                        wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
                                                EPOLLWRNORM | EPOLLWRBAND);

                /* Should agree with poll, otherwise some programs break */
                sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
}
3396
/* Default sk_destruct callback: intentionally does nothing. */
static void sock_def_destruct(struct sock *sk)
{
}
3400
3401 void sk_send_sigurg(struct sock *sk)
3402 {
3403         if (sk->sk_socket && sk->sk_socket->file)
3404                 if (send_sigurg(&sk->sk_socket->file->f_owner))
3405                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
3406 }
3407 EXPORT_SYMBOL(sk_send_sigurg);
3408
/* (Re)arm @timer to fire at @expires. A socket reference is taken when
 * mod_timer() returns zero.
 */
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
                    unsigned long expires)
{
        int was_pending = mod_timer(timer, expires);

        if (!was_pending)
                sock_hold(sk);
}
3415 EXPORT_SYMBOL(sk_reset_timer);
3416
/* Cancel @timer with del_timer() and drop a socket reference when
 * del_timer() returns non-zero.
 */
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
{
        int was_pending = del_timer(timer);

        if (was_pending)
                __sock_put(sk);
}
3422 EXPORT_SYMBOL(sk_stop_timer);
3423
/* Same as sk_stop_timer(), but cancels through del_timer_sync(). */
void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer)
{
        int was_pending = del_timer_sync(timer);

        if (was_pending)
                __sock_put(sk);
}
3429 EXPORT_SYMBOL(sk_stop_timer_sync);
3430
/*
 * sock_init_data_uid - set the generic fields of a sock to their defaults
 * @sock: socket to attach @sk to, or NULL for a sock without a socket
 * @sk:   sock being initialised
 * @uid:  value stored in sk->sk_uid
 *
 * Fills in buffer sizes, timeouts, default callbacks, lockdep classes and
 * miscellaneous bookkeeping, links @sk and @sock together when @sock is
 * non-NULL, and finally publishes the sock by setting sk_refcnt to 1.
 */
void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid)
{
        sk_init_common(sk);
        sk->sk_send_head        =       NULL;

        timer_setup(&sk->sk_timer, NULL, 0);

        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       READ_ONCE(sysctl_rmem_default);
        sk->sk_sndbuf           =       READ_ONCE(sysctl_wmem_default);
        sk->sk_state            =       TCP_CLOSE;
        sk->sk_use_task_frag    =       true;
        sk_set_socket(sk, sock);

        sock_set_flag(sk, SOCK_ZAPPED);

        /* With a socket: inherit its type and wait queue, and link back. */
        if (sock) {
                sk->sk_type     =       sock->type;
                RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
                sock->sk        =       sk;
        } else {
                RCU_INIT_POINTER(sk->sk_wq, NULL);
        }
        sk->sk_uid      =       uid;

        /* Kernel and user socks get separate per-family lockdep classes. */
        rwlock_init(&sk->sk_callback_lock);
        if (sk->sk_kern_sock)
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_kern_callback_keys + sk->sk_family,
                        af_family_kern_clock_key_strings[sk->sk_family]);
        else
                lockdep_set_class_and_name(
                        &sk->sk_callback_lock,
                        af_callback_keys + sk->sk_family,
                        af_family_clock_key_strings[sk->sk_family]);

        /* Default callbacks. */
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
        sk->sk_write_space      =       sock_def_write_space;
        sk->sk_error_report     =       sock_def_error_report;
        sk->sk_destruct         =       sock_def_destruct;

        sk->sk_frag.page        =       NULL;
        sk->sk_frag.offset      =       0;
        sk->sk_peek_off         =       -1;

        sk->sk_peer_pid         =       NULL;
        sk->sk_peer_cred        =       NULL;
        spin_lock_init(&sk->sk_peer_lock);

        sk->sk_write_pending    =       0;
        sk->sk_rcvlowat         =       1;
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;

        sk->sk_stamp = SK_DEFAULT_STAMP;
#if BITS_PER_LONG==32
        seqlock_init(&sk->sk_stamp_seq);
#endif
        atomic_set(&sk->sk_zckey, 0);

#ifdef CONFIG_NET_RX_BUSY_POLL
        sk->sk_napi_id          =       0;
        sk->sk_ll_usec          =       READ_ONCE(sysctl_net_busy_read);
#endif

        sk->sk_max_pacing_rate = ~0UL;
        sk->sk_pacing_rate = ~0UL;
        WRITE_ONCE(sk->sk_pacing_shift, 10);
        sk->sk_incoming_cpu = -1;

        sk_rx_queue_clear(sk);
        /*
         * Before updating sk_refcnt, we must commit prior changes to memory
         * (Documentation/RCU/rculist_nulls.rst for details)
         */
        smp_wmb();
        refcount_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
}
EXPORT_SYMBOL(sock_init_data_uid);
3513
3514 void sock_init_data(struct socket *sock, struct sock *sk)
3515 {
3516         kuid_t uid = sock ?
3517                 SOCK_INODE(sock)->i_uid :
3518                 make_kuid(sock_net(sk)->user_ns, 0);
3519
3520         sock_init_data_uid(sock, sk, uid);
3521 }
3522 EXPORT_SYMBOL(sock_init_data);
3523
/*
 * lock_sock_nested - take ownership of @sk, possibly sleeping
 * @sk:       sock to lock
 * @subclass: lockdep subclass passed to mutex_acquire()
 *
 * Under sk_lock.slock (bottom halves disabled): if the sock is already
 * owned, wait in __lock_sock(); then mark it owned and drop the spinlock.
 */
void lock_sock_nested(struct sock *sk, int subclass)
{
        /* The sk_lock has mutex_lock() semantics here. */
        mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);

        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
        if (sock_owned_by_user_nocheck(sk))
                __lock_sock(sk);
        sk->sk_lock.owned = 1;
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(lock_sock_nested);
3537
/*
 * release_sock - give up ownership of @sk
 *
 * With sk_lock.slock held (bottom halves disabled), in this order:
 *  - run __release_sock() if the backlog has a tail,
 *  - invoke the protocol's release_cb, if it provides one,
 *  - release ownership and wake any waiter on sk_lock.wq.
 */
void release_sock(struct sock *sk)
{
        spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_backlog.tail)
                __release_sock(sk);

        if (sk->sk_prot->release_cb)
                INDIRECT_CALL_INET_1(sk->sk_prot->release_cb,
                                     tcp_release_cb, sk);

        sock_release_ownership(sk);
        /* Only call wake_up() when somebody is actually queued. */
        if (waitqueue_active(&sk->sk_lock.wq))
                wake_up(&sk->sk_lock.wq);
        spin_unlock_bh(&sk->sk_lock.slock);
}
EXPORT_SYMBOL(release_sock);
3554
/*
 * __lock_sock_fast - lock @sk, avoiding the owner hand-off when uncontended
 *
 * Returns false on the fast path: the sock was not owned, and the function
 * returns with sk_lock.slock still held and bottom halves disabled.
 *
 * Returns true on the slow path: the sock was owned, so the caller waited in
 * __lock_sock(), now owns the sock, and sk_lock.slock has been released.
 */
bool __lock_sock_fast(struct sock *sk) __acquires(&sk->sk_lock.slock)
{
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);

        if (!sock_owned_by_user_nocheck(sk)) {
                /*
                 * Fast path return with bottom halves disabled and
                 * sock::sk_lock.slock held.
                 *
                 * The 'mutex' is not contended and holding
                 * sock::sk_lock.slock prevents all other lockers to
                 * proceed so the corresponding unlock_sock_fast() can
                 * avoid the slow path of release_sock() completely and
                 * just release slock.
                 *
                 * From a semantical POV this is equivalent to 'acquiring'
                 * the 'mutex', hence the corresponding lockdep
                 * mutex_release() has to happen in the fast path of
                 * unlock_sock_fast().
                 */
                return false;
        }

        __lock_sock(sk);
        sk->sk_lock.owned = 1;
        /* Annotation only: balances the __acquires() on the slow path. */
        __acquire(&sk->sk_lock.slock);
        spin_unlock_bh(&sk->sk_lock.slock);
        return true;
}
EXPORT_SYMBOL(__lock_sock_fast);
3586
3587 int sock_gettstamp(struct socket *sock, void __user *userstamp,
3588                    bool timeval, bool time32)
3589 {
3590         struct sock *sk = sock->sk;
3591         struct timespec64 ts;
3592
3593         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
3594         ts = ktime_to_timespec64(sock_read_timestamp(sk));
3595         if (ts.tv_sec == -1)
3596                 return -ENOENT;
3597         if (ts.tv_sec == 0) {
3598                 ktime_t kt = ktime_get_real();
3599                 sock_write_timestamp(sk, kt);
3600                 ts = ktime_to_timespec64(kt);
3601         }
3602
3603         if (timeval)
3604                 ts.tv_nsec /= 1000;
3605
3606 #ifdef CONFIG_COMPAT_32BIT_TIME
3607         if (time32)
3608                 return put_old_timespec32(&ts, userstamp);
3609 #endif
3610 #ifdef CONFIG_SPARC64
3611         /* beware of padding in sparc64 timeval */
3612         if (timeval && !in_compat_syscall()) {
3613                 struct __kernel_old_timeval __user tv = {
3614                         .tv_sec = ts.tv_sec,
3615                         .tv_usec = ts.tv_nsec,
3616                 };
3617                 if (copy_to_user(userstamp, &tv, sizeof(tv)))
3618                         return -EFAULT;
3619                 return 0;
3620         }
3621 #endif
3622         return put_timespec64(&ts, userstamp);
3623 }
3624 EXPORT_SYMBOL(sock_gettstamp);
3625
3626 void sock_enable_timestamp(struct sock *sk, enum sock_flags flag)
3627 {
3628         if (!sock_flag(sk, flag)) {
3629                 unsigned long previous_flags = sk->sk_flags;
3630
3631                 sock_set_flag(sk, flag);
3632                 /*
3633                  * we just set one of the two flags which require net
3634                  * time stamping, but time stamping might have been on
3635                  * already because of the other one
3636                  */
3637                 if (sock_needs_netstamp(sk) &&
3638                     !(previous_flags & SK_FLAGS_TIMESTAMP))
3639                         net_enable_timestamp();
3640         }
3641 }
3642
3643 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
3644                        int level, int type)
3645 {
3646         struct sock_exterr_skb *serr;
3647         struct sk_buff *skb;
3648         int copied, err;
3649
3650         err = -EAGAIN;
3651         skb = sock_dequeue_err_skb(sk);
3652         if (skb == NULL)
3653                 goto out;
3654
3655         copied = skb->len;
3656         if (copied > len) {
3657                 msg->msg_flags |= MSG_TRUNC;
3658                 copied = len;
3659         }
3660         err = skb_copy_datagram_msg(skb, 0, msg, copied);
3661         if (err)
3662                 goto out_free_skb;
3663
3664         sock_recv_timestamp(msg, sk, skb);
3665
3666         serr = SKB_EXT_ERR(skb);
3667         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
3668
3669         msg->msg_flags |= MSG_ERRQUEUE;
3670         err = copied;
3671
3672 out_free_skb:
3673         kfree_skb(skb);
3674 out:
3675         return err;
3676 }
3677 EXPORT_SYMBOL(sock_recv_errqueue);
3678
3679 /*
3680  *      Get a socket option on an socket.
3681  *
3682  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3683  *      asynchronous errors should be reported by getsockopt. We assume
3684  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3685  */
3686 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3687                            char __user *optval, int __user *optlen)
3688 {
3689         struct sock *sk = sock->sk;
3690
3691         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3692         return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen);
3693 }
3694 EXPORT_SYMBOL(sock_common_getsockopt);
3695
3696 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3697                         int flags)
3698 {
3699         struct sock *sk = sock->sk;
3700         int addr_len = 0;
3701         int err;
3702
3703         err = sk->sk_prot->recvmsg(sk, msg, size, flags, &addr_len);
3704         if (err >= 0)
3705                 msg->msg_namelen = addr_len;
3706         return err;
3707 }
3708 EXPORT_SYMBOL(sock_common_recvmsg);
3709
3710 /*
3711  *      Set socket options on an inet socket.
3712  */
3713 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3714                            sockptr_t optval, unsigned int optlen)
3715 {
3716         struct sock *sk = sock->sk;
3717
3718         /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3719         return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen);
3720 }
3721 EXPORT_SYMBOL(sock_common_setsockopt);
3722
/*
 * sk_common_release - generic teardown of a sock
 *
 * Sequence: optional protocol destroy hook, unhash, orphan, free the xfrm
 * policy, then drop a reference with sock_put().
 */
void sk_common_release(struct sock *sk)
{
        /* The destroy hook is optional. */
        if (sk->sk_prot->destroy)
                sk->sk_prot->destroy(sk);

        /*
         * Observation: when sk_common_release is called, processes have
         * no access to socket. But net still has.
         * Step one, detach it from networking:
         *
         * A. Remove from hash tables.
         */

        sk->sk_prot->unhash(sk);

        /*
         * In this point socket cannot receive new packets, but it is possible
         * that some packets are in flight because some CPU runs receiver and
         * did hash table lookup before we unhashed socket. They will achieve
         * receive queue and will be purged by socket destructor.
         *
         * Also we still have packets pending on receive queue and probably,
         * our own packets waiting in device queues. sock_destroy will drain
         * receive queue, but transmitted packets will delay socket destruction
         * until the last reference will be released.
         */

        sock_orphan(sk);

        xfrm_sk_free_policy(sk);

        sock_put(sk);
}
EXPORT_SYMBOL(sk_common_release);
3757
/*
 * sk_get_meminfo - snapshot the memory accounting of @sk
 * @sk:  sock to inspect
 * @mem: output array; SK_MEMINFO_VARS entries are zeroed and then filled,
 *       so it must have room for at least that many u32 values
 */
void sk_get_meminfo(const struct sock *sk, u32 *mem)
{
        /* Clear every slot first so any entry not set below reads as 0. */
        memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);

        mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
        mem[SK_MEMINFO_RCVBUF] = READ_ONCE(sk->sk_rcvbuf);
        mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
        mem[SK_MEMINFO_SNDBUF] = READ_ONCE(sk->sk_sndbuf);
        mem[SK_MEMINFO_FWD_ALLOC] = sk_forward_alloc_get(sk);
        mem[SK_MEMINFO_WMEM_QUEUED] = READ_ONCE(sk->sk_wmem_queued);
        mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
        mem[SK_MEMINFO_BACKLOG] = READ_ONCE(sk->sk_backlog.len);
        mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
}
3772
3773 #ifdef CONFIG_PROC_FS
/* Bitmap of protocol in-use counter slots; a set bit marks an assigned slot. */
static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3775
3776 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3777 {
3778         int cpu, idx = prot->inuse_idx;
3779         int res = 0;
3780
3781         for_each_possible_cpu(cpu)
3782                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3783
3784         return res >= 0 ? res : 0;
3785 }
3786 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3787
3788 int sock_inuse_get(struct net *net)
3789 {
3790         int cpu, res = 0;
3791
3792         for_each_possible_cpu(cpu)
3793                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->all;
3794
3795         return res;
3796 }
3797
3798 EXPORT_SYMBOL_GPL(sock_inuse_get);
3799
3800 static int __net_init sock_inuse_init_net(struct net *net)
3801 {
3802         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3803         if (net->core.prot_inuse == NULL)
3804                 return -ENOMEM;
3805         return 0;
3806 }
3807
/* Per-netns exit: free the per-CPU counters set up by sock_inuse_init_net(). */
static void __net_exit sock_inuse_exit_net(struct net *net)
{
        free_percpu(net->core.prot_inuse);
}
3812
/* Per-netns hooks managing the lifetime of net->core.prot_inuse. */
static struct pernet_operations net_inuse_ops = {
        .init = sock_inuse_init_net,
        .exit = sock_inuse_exit_net,
};
3817
3818 static __init int net_inuse_init(void)
3819 {
3820         if (register_pernet_subsys(&net_inuse_ops))
3821                 panic("Cannot initialize net inuse counters");
3822
3823         return 0;
3824 }
3825
3826 core_initcall(net_inuse_init);
3827
/*
 * Pick the first free slot in proto_inuse_idx for @prot and mark it used.
 *
 * The last slot (PROTO_INUSE_NR - 1) is never handed out: landing on it is
 * treated as exhaustion and reported as -ENOSPC, with prot->inuse_idx left
 * at that value (release_proto_idx() recognises it).  Returns 0 on success.
 */
static int assign_proto_idx(struct proto *prot)
{
        prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);

        if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
                pr_err("PROTO_INUSE_NR exhausted\n");
                return -ENOSPC;
        }

        set_bit(prot->inuse_idx, proto_inuse_idx);
        return 0;
}
3840
3841 static void release_proto_idx(struct proto *prot)
3842 {
3843         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3844                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3845 }
3846 #else
/* !CONFIG_PROC_FS stub: no slot bookkeeping, always succeeds. */
static inline int assign_proto_idx(struct proto *prot)
{
        return 0;
}
3851
/* !CONFIG_PROC_FS stub: nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3855
3856 #endif
3857
3858 static void tw_prot_cleanup(struct timewait_sock_ops *twsk_prot)
3859 {
3860         if (!twsk_prot)
3861                 return;
3862         kfree(twsk_prot->twsk_slab_name);
3863         twsk_prot->twsk_slab_name = NULL;
3864         kmem_cache_destroy(twsk_prot->twsk_slab);
3865         twsk_prot->twsk_slab = NULL;
3866 }
3867
3868 static int tw_prot_init(const struct proto *prot)
3869 {
3870         struct timewait_sock_ops *twsk_prot = prot->twsk_prot;
3871
3872         if (!twsk_prot)
3873                 return 0;
3874
3875         twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s",
3876                                               prot->name);
3877         if (!twsk_prot->twsk_slab_name)
3878                 return -ENOMEM;
3879
3880         twsk_prot->twsk_slab =
3881                 kmem_cache_create(twsk_prot->twsk_slab_name,
3882                                   twsk_prot->twsk_obj_size, 0,
3883                                   SLAB_ACCOUNT | prot->slab_flags,
3884                                   NULL);
3885         if (!twsk_prot->twsk_slab) {
3886                 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3887                         prot->name);
3888                 return -ENOMEM;
3889         }
3890
3891         return 0;
3892 }
3893
3894 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3895 {
3896         if (!rsk_prot)
3897                 return;
3898         kfree(rsk_prot->slab_name);
3899         rsk_prot->slab_name = NULL;
3900         kmem_cache_destroy(rsk_prot->slab);
3901         rsk_prot->slab = NULL;
3902 }
3903
3904 static int req_prot_init(const struct proto *prot)
3905 {
3906         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3907
3908         if (!rsk_prot)
3909                 return 0;
3910
3911         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3912                                         prot->name);
3913         if (!rsk_prot->slab_name)
3914                 return -ENOMEM;
3915
3916         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3917                                            rsk_prot->obj_size, 0,
3918                                            SLAB_ACCOUNT | prot->slab_flags,
3919                                            NULL);
3920
3921         if (!rsk_prot->slab) {
3922                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3923                         prot->name);
3924                 return -ENOMEM;
3925         }
3926         return 0;
3927 }
3928
/*
 * proto_register - register a protocol with the socket layer
 * @prot:       protocol descriptor
 * @alloc_slab: non-zero to create the sock slab cache plus the request-sock
 *              and timewait-sock caches
 *
 * Returns 0 on success; -EINVAL when @prot uses memory accounting but lacks
 * sysctl_mem or per_cpu_fw_alloc; -ENOBUFS when a slab cache cannot be
 * created; or the error from assign_proto_idx().  On failure every cache
 * created here is torn down again.
 */
int proto_register(struct proto *prot, int alloc_slab)
{
        int ret = -ENOBUFS;

        if (prot->memory_allocated && !prot->sysctl_mem) {
                pr_err("%s: missing sysctl_mem\n", prot->name);
                return -EINVAL;
        }
        if (prot->memory_allocated && !prot->per_cpu_fw_alloc) {
                pr_err("%s: missing per_cpu_fw_alloc\n", prot->name);
                return -EINVAL;
        }
        if (alloc_slab) {
                prot->slab = kmem_cache_create_usercopy(prot->name,
                                        prot->obj_size, 0,
                                        SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
                                        prot->slab_flags,
                                        prot->useroffset, prot->usersize,
                                        NULL);

                if (prot->slab == NULL) {
                        pr_crit("%s: Can't create sock SLAB cache!\n",
                                prot->name);
                        goto out;
                }

                if (req_prot_init(prot))
                        goto out_free_request_sock_slab;

                if (tw_prot_init(prot))
                        goto out_free_timewait_sock_slab;
        }

        /* Slot assignment and list insertion are done under the list mutex. */
        mutex_lock(&proto_list_mutex);
        ret = assign_proto_idx(prot);
        if (ret) {
                mutex_unlock(&proto_list_mutex);
                goto out_free_timewait_sock_slab;
        }
        list_add(&prot->node, &proto_list);
        mutex_unlock(&proto_list_mutex);
        return ret;

        /* Unwind in reverse order of creation; labels fall through. */
out_free_timewait_sock_slab:
        if (alloc_slab)
                tw_prot_cleanup(prot->twsk_prot);
out_free_request_sock_slab:
        if (alloc_slab) {
                req_prot_cleanup(prot->rsk_prot);

                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
out:
        return ret;
}
EXPORT_SYMBOL(proto_register);
3986
/*
 * proto_unregister - undo proto_register()
 *
 * Under proto_list_mutex, releases @prot's in-use slot and unlinks it from
 * proto_list; afterwards destroys the sock slab cache and the request-sock
 * and timewait-sock caches.
 */
void proto_unregister(struct proto *prot)
{
        mutex_lock(&proto_list_mutex);
        release_proto_idx(prot);
        list_del(&prot->node);
        mutex_unlock(&proto_list_mutex);

        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;

        req_prot_cleanup(prot->rsk_prot);
        tw_prot_cleanup(prot->twsk_prot);
}
EXPORT_SYMBOL(proto_unregister);
4001
4002 int sock_load_diag_module(int family, int protocol)
4003 {
4004         if (!protocol) {
4005                 if (!sock_is_registered(family))
4006                         return -ENOENT;
4007
4008                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
4009                                       NETLINK_SOCK_DIAG, family);
4010         }
4011
4012 #ifdef CONFIG_INET
4013         if (family == AF_INET &&
4014             protocol != IPPROTO_RAW &&
4015             protocol < MAX_INET_PROTOS &&
4016             !rcu_access_pointer(inet_protos[protocol]))
4017                 return -ENOENT;
4018 #endif
4019
4020         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
4021                               NETLINK_SOCK_DIAG, family, protocol);
4022 }
4023 EXPORT_SYMBOL(sock_load_diag_module);
4024
4025 #ifdef CONFIG_PROC_FS
/*
 * seq_file ->start: take proto_list_mutex (released in proto_seq_stop())
 * and position the iterator at *pos, counting the list head as an entry.
 */
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(proto_list_mutex)
{
        mutex_lock(&proto_list_mutex);
        return seq_list_start_head(&proto_list, *pos);
}
4032
/* seq_file ->next: advance to the entry after @v in proto_list. */
static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        return seq_list_next(v, &proto_list, pos);
}
4037
/* seq_file ->stop: drop the mutex taken in proto_seq_start(). */
static void proto_seq_stop(struct seq_file *seq, void *v)
        __releases(proto_list_mutex)
{
        mutex_unlock(&proto_list_mutex);
}
4043
/* Map a method pointer to 'y' (non-NULL, implemented) or 'n' (NULL). */
static char proto_method_implemented(const void *method)
{
        if (!method)
                return 'n';

        return 'y';
}
4048 static long sock_prot_memory_allocated(struct proto *proto)
4049 {
4050         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
4051 }
4052
4053 static const char *sock_prot_memory_pressure(struct proto *proto)
4054 {
4055         return proto->memory_pressure != NULL ?
4056         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
4057 }
4058
/*
 * Emit one /proc "protocols" row for @proto: name, object size, sockets in
 * use, memory allocated, memory pressure, max header, whether a slab cache
 * exists, owning module, and one y/n column per protocol method in the
 * order announced by the header printed in proto_seq_show().
 */
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{

        seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
                   sock_prot_inuse_get(seq_file_net(seq), proto),
                   sock_prot_memory_allocated(proto),
                   sock_prot_memory_pressure(proto),
                   proto->max_header,
                   proto->slab == NULL ? "no" : "yes",
                   module_name(proto->owner),
                   proto_method_implemented(proto->close),
                   proto_method_implemented(proto->connect),
                   proto_method_implemented(proto->disconnect),
                   proto_method_implemented(proto->accept),
                   proto_method_implemented(proto->ioctl),
                   proto_method_implemented(proto->init),
                   proto_method_implemented(proto->destroy),
                   proto_method_implemented(proto->shutdown),
                   proto_method_implemented(proto->setsockopt),
                   proto_method_implemented(proto->getsockopt),
                   proto_method_implemented(proto->sendmsg),
                   proto_method_implemented(proto->recvmsg),
                   proto_method_implemented(proto->bind),
                   proto_method_implemented(proto->backlog_rcv),
                   proto_method_implemented(proto->hash),
                   proto_method_implemented(proto->unhash),
                   proto_method_implemented(proto->get_port),
                   proto_method_implemented(proto->enter_memory_pressure));
}
4091
4092 static int proto_seq_show(struct seq_file *seq, void *v)
4093 {
4094         if (v == &proto_list)
4095                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4096                            "protocol",
4097                            "size",
4098                            "sockets",
4099                            "memory",
4100                            "press",
4101                            "maxhdr",
4102                            "slab",
4103                            "module",
4104                            "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4105         else
4106                 proto_seq_printf(seq, list_entry(v, struct proto, node));
4107         return 0;
4108 }
4109
/* seq_file iterator used for the per-netns "protocols" proc entry. */
static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
        .show   = proto_seq_show,
};
4116
4117 static __net_init int proto_init_net(struct net *net)
4118 {
4119         if (!proc_create_net("protocols", 0444, net->proc_net, &proto_seq_ops,
4120                         sizeof(struct seq_net_private)))
4121                 return -ENOMEM;
4122
4123         return 0;
4124 }
4125
/* Per-netns exit: remove the "protocols" proc entry. */
static __net_exit void proto_exit_net(struct net *net)
{
        remove_proc_entry("protocols", net->proc_net);
}
4130
4131
/* Per-netns hooks creating and removing the "protocols" proc entry. */
static __net_initdata struct pernet_operations proto_net_ops = {
        .init = proto_init_net,
        .exit = proto_exit_net,
};
4136
/*
 * Register proto_net_ops at subsys_initcall time; returns the result of
 * register_pernet_subsys().
 */
static int __init proto_init(void)
{
        return register_pernet_subsys(&proto_net_ops);
}

subsys_initcall(proto_init);
4143
4144 #endif /* PROC_FS */
4145
4146 #ifdef CONFIG_NET_RX_BUSY_POLL
4147 bool sk_busy_loop_end(void *p, unsigned long start_time)
4148 {
4149         struct sock *sk = p;
4150
4151         return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
4152                sk_busy_loop_timeout(sk, start_time);
4153 }
4154 EXPORT_SYMBOL(sk_busy_loop_end);
4155 #endif /* CONFIG_NET_RX_BUSY_POLL */
4156
4157 int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len)
4158 {
4159         if (!sk->sk_prot->bind_add)
4160                 return -EOPNOTSUPP;
4161         return sk->sk_prot->bind_add(sk, addr, addr_len);
4162 }
4163 EXPORT_SYMBOL(sock_bind_add);
4164
4165 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4166 int sock_ioctl_inout(struct sock *sk, unsigned int cmd,
4167                      void __user *arg, void *karg, size_t size)
4168 {
4169         int ret;
4170
4171         if (copy_from_user(karg, arg, size))
4172                 return -EFAULT;
4173
4174         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, karg);
4175         if (ret)
4176                 return ret;
4177
4178         if (copy_to_user(arg, karg, size))
4179                 return -EFAULT;
4180
4181         return 0;
4182 }
4183 EXPORT_SYMBOL(sock_ioctl_inout);
4184
4185 /* This is the most common ioctl prep function, where the result (4 bytes) is
4186  * copied back to userspace if the ioctl() returns successfully. No input is
4187  * copied from userspace as input argument.
4188  */
4189 static int sock_ioctl_out(struct sock *sk, unsigned int cmd, void __user *arg)
4190 {
4191         int ret, karg = 0;
4192
4193         ret = READ_ONCE(sk->sk_prot)->ioctl(sk, cmd, &karg);
4194         if (ret)
4195                 return ret;
4196
4197         return put_user(karg, (int __user *)arg);
4198 }
4199
4200 /* A wrapper around sock ioctls, which copies the data from userspace
4201  * (depending on the protocol/ioctl), and copies back the result to userspace.
4202  * The main motivation for this function is to pass kernel memory to the
4203  * protocol ioctl callbacks, instead of userspace memory.
4204  */
4205 int sk_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
4206 {
4207         int rc = 1;
4208
4209         if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET)
4210                 rc = ipmr_sk_ioctl(sk, cmd, arg);
4211         else if (sk->sk_type == SOCK_RAW && sk->sk_family == AF_INET6)
4212                 rc = ip6mr_sk_ioctl(sk, cmd, arg);
4213         else if (sk_is_phonet(sk))
4214                 rc = phonet_sk_ioctl(sk, cmd, arg);
4215
4216         /* If ioctl was processed, returns its value */
4217         if (rc <= 0)
4218                 return rc;
4219
4220         /* Otherwise call the default handler */
4221         return sock_ioctl_out(sk, cmd, arg);
4222 }
4223 EXPORT_SYMBOL(sk_ioctl);