net/core/sock.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Generic socket support routines. Memory allocators, socket lock/release
   7  *              handler for protocols to use and generic option handler.
   8  *
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Florian La Roche, <flla@stud.uni-sb.de>
  13  *              Alan Cox, <A.Cox@swansea.ac.uk>
  14  *
  15  * Fixes:
  16  *              Alan Cox        :       Numerous verify_area() problems
  17  *              Alan Cox        :       Connecting on a connecting socket
  18  *                                      now returns an error for tcp.
  19  *              Alan Cox        :       sock->protocol is set correctly.
  20  *                                      and is not sometimes left as 0.
  21  *              Alan Cox        :       connect handles icmp errors on a
  22  *                                      connect properly. Unfortunately there
  23  *                                      is a restart syscall nasty there. I
  24  *                                      can't match BSD without hacking the C
  25  *                                      library. Ideas urgently sought!
  26  *              Alan Cox        :       Disallow bind() to addresses that are
  27  *                                      not ours - especially broadcast ones!!
  28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
  29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
  30  *                                      instead they leave that for the DESTROY timer.
  31  *              Alan Cox        :       Clean up error flag in accept
  32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  33  *                                      was buggy. Put a remove_sock() in the handler
  34  *                                      for memory when we hit 0. Also altered the timer
  35  *                                      code. The ACK stuff can wait and needs major
  36  *                                      TCP layer surgery.
  37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
  38  *                                      and fixed timer/inet_bh race.
  39  *              Alan Cox        :       Added zapped flag for TCP
  40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
  41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
  42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
  43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
  44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
  45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
  46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
  47  *      Pauline Middelink       :       identd support
  48  *              Alan Cox        :       Fixed connect() taking signals I think.
  49  *              Alan Cox        :       SO_LINGER supported
  50  *              Alan Cox        :       Error reporting fixes
  51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
  52  *              Alan Cox        :       inet sockets don't set sk->type!
  53  *              Alan Cox        :       Split socket option code
  54  *              Alan Cox        :       Callbacks
  55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
  56  *              Alex            :       Removed restriction on inet fioctl
  57  *              Alan Cox        :       Splitting INET from NET core
  58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
  59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
  60  *              Alan Cox        :       Split IP from generic code
  61  *              Alan Cox        :       New kfree_skbmem()
  62  *              Alan Cox        :       Make SO_DEBUG superuser only.
  63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
  64  *                                      (compatibility fix)
  65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
  66  *              Alan Cox        :       Allocator for a socket is settable.
  67  *              Alan Cox        :       SO_ERROR includes soft errors.
  68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
  69  *              Alan Cox        :       Generic socket allocation to make hooks
  70  *                                      easier (suggested by Craig Metz).
  71  *              Michael Pall    :       SO_ERROR returns positive errno again
  72  *              Steve Whitehouse:       Added default destructor to free
  73  *                                      protocol private data.
  74  *              Steve Whitehouse:       Added various other default routines
  75  *                                      common to several socket families.
  76  *              Chris Evans     :       Call suser() check last on F_SETOWN
  77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
  78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
  79  *              Andi Kleen      :       Fix write_space callback
  80  *              Chris Evans     :       Security fixes - signedness again
  81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
  82  *
  83  * To Fix:
  84  *
  85  *
  86  *              This program is free software; you can redistribute it and/or
  87  *              modify it under the terms of the GNU General Public License
  88  *              as published by the Free Software Foundation; either version
  89  *              2 of the License, or (at your option) any later version.
  90  */
  91
  92 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  93
  94 #include <linux/capability.h>
  95 #include <linux/errno.h>
  96 #include <linux/errqueue.h>
  97 #include <linux/types.h>
  98 #include <linux/socket.h>
  99 #include <linux/in.h>
 100 #include <linux/kernel.h>
 101 #include <linux/module.h>
 102 #include <linux/proc_fs.h>
 103 #include <linux/seq_file.h>
 104 #include <linux/sched.h>
 105 #include <linux/sched/mm.h>
 106 #include <linux/timer.h>
 107 #include <linux/string.h>
 108 #include <linux/sockios.h>
 109 #include <linux/net.h>
 110 #include <linux/mm.h>
 111 #include <linux/slab.h>
 112 #include <linux/interrupt.h>
 113 #include <linux/poll.h>
 114 #include <linux/tcp.h>
 115 #include <linux/init.h>
 116 #include <linux/highmem.h>
 117 #include <linux/user_namespace.h>
 118 #include <linux/static_key.h>
 119 #include <linux/memcontrol.h>
 120 #include <linux/prefetch.h>
 121
 122 #include <linux/uaccess.h>
 123
 124 #include <linux/netdevice.h>
 125 #include <net/protocol.h>
 126 #include <linux/skbuff.h>
 127 #include <net/net_namespace.h>
 128 #include <net/request_sock.h>
 129 #include <net/sock.h>
 130 #include <linux/net_tstamp.h>
 131 #include <net/xfrm.h>
 132 #include <linux/ipsec.h>
 133 #include <net/cls_cgroup.h>
 134 #include <net/netprio_cgroup.h>
 135 #include <linux/sock_diag.h>
 136
 137 #include <linux/filter.h>
 138 #include <net/sock_reuseport.h>
 139
 140 #include <trace/events/sock.h>
 141
 142 #include <net/tcp.h>
 143 #include <net/busy_poll.h>
 144
 145 static DEFINE_MUTEX(proto_list_mutex);
 146 static LIST_HEAD(proto_list);
 147
 148 static void sock_inuse_add(struct net *net, int val);
 149
 150 /**
 151  * sk_ns_capable - General socket capability test
 152  * @sk: Socket to use a capability on or through
 153  * @user_ns: The user namespace of the capability to use
 154  * @cap: The capability to use
 155  *
 156  * Test to see if the opener of the socket had when the socket was
 157  * created and the current process has the capability @cap in the user
 158  * namespace @user_ns.
 159  */
 160 bool sk_ns_capable(const struct sock *sk,
 161                    struct user_namespace *user_ns, int cap)
 162 {
 163         return file_ns_capable(sk->sk_socket->file, user_ns, cap) &&
 164                 ns_capable(user_ns, cap);
 165 }
 166 EXPORT_SYMBOL(sk_ns_capable);
 167
 168 /**
 169  * sk_capable - Socket global capability test
 170  * @sk: Socket to use a capability on or through
 171  * @cap: The global capability to use
 172  *
 173  * Test to see if the opener of the socket had when the socket was
 174  * created and the current process has the capability @cap in all user
 175  * namespaces.
 176  */
 177 bool sk_capable(const struct sock *sk, int cap)
 178 {
 179         return sk_ns_capable(sk, &init_user_ns, cap);
 180 }
 181 EXPORT_SYMBOL(sk_capable);
 182
 183 /**
 184  * sk_net_capable - Network namespace socket capability test
 185  * @sk: Socket to use a capability on or through
 186  * @cap: The capability to use
 187  *
 188  * Test to see if the opener of the socket had when the socket was created
 189  * and the current process has the capability @cap over the network namespace
 190  * the socket is a member of.
 191  */
 192 bool sk_net_capable(const struct sock *sk, int cap)
 193 {
 194         return sk_ns_capable(sk, sock_net(sk)->user_ns, cap);
 195 }
 196 EXPORT_SYMBOL(sk_net_capable);
 197
 198 /*
 199  * Each address family might have different locking rules, so we have
 200  * one slock key per address family and separate keys for internal and
 201  * userspace sockets.
 202  */
 203 static struct lock_class_key af_family_keys[AF_MAX];
 204 static struct lock_class_key af_family_kern_keys[AF_MAX];
 205 static struct lock_class_key af_family_slock_keys[AF_MAX];
 206 static struct lock_class_key af_family_kern_slock_keys[AF_MAX];
 207
 208 /*
 209  * Make lock validator output more readable. (we pre-construct these
 210  * strings build-time, so that runtime initialization of socket
 211  * locks is fast):
 212  */
 213
 214 #define _sock_locks(x)                                            \
 215   x "AF_UNSPEC",        x "AF_UNIX"     ,       x "AF_INET"     , \
 216   x "AF_AX25"  ,        x "AF_IPX"      ,       x "AF_APPLETALK", \
 217   x "AF_NETROM",        x "AF_BRIDGE"   ,       x "AF_ATMPVC"   , \
 218   x "AF_X25"   ,        x "AF_INET6"    ,       x "AF_ROSE"     , \
 219   x "AF_DECnet",        x "AF_NETBEUI"  ,       x "AF_SECURITY" , \
 220   x "AF_KEY"   ,        x "AF_NETLINK"  ,       x "AF_PACKET"   , \
 221   x "AF_ASH"   ,        x "AF_ECONET"   ,       x "AF_ATMSVC"   , \
 222   x "AF_RDS"   ,        x "AF_SNA"      ,       x "AF_IRDA"     , \
 223   x "AF_PPPOX" ,        x "AF_WANPIPE"  ,       x "AF_LLC"      , \
 224   x "27"       ,        x "28"          ,       x "AF_CAN"      , \
 225   x "AF_TIPC"  ,        x "AF_BLUETOOTH",       x "IUCV"        , \
 226   x "AF_RXRPC" ,        x "AF_ISDN"     ,       x "AF_PHONET"   , \
 227   x "AF_IEEE802154",    x "AF_CAIF"     ,       x "AF_ALG"      , \
 228   x "AF_NFC"   ,        x "AF_VSOCK"    ,       x "AF_KCM"      , \
 229   x "AF_QIPCRTR",       x "AF_SMC"      ,       x "AF_XDP"      , \
 230   x "AF_MAX"
 231
 232 static const char *const af_family_key_strings[AF_MAX+1] = {
 233         _sock_locks("sk_lock-")
 234 };
 235 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
 236         _sock_locks("slock-")
 237 };
 238 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
 239         _sock_locks("clock-")
 240 };
 241
 242 static const char *const af_family_kern_key_strings[AF_MAX+1] = {
 243         _sock_locks("k-sk_lock-")
 244 };
 245 static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = {
 246         _sock_locks("k-slock-")
 247 };
 248 static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = {
 249         _sock_locks("k-clock-")
 250 };
 251 static const char *const af_family_rlock_key_strings[AF_MAX+1] = {
 252   "rlock-AF_UNSPEC", "rlock-AF_UNIX"     , "rlock-AF_INET"     ,
 253   "rlock-AF_AX25"  , "rlock-AF_IPX"      , "rlock-AF_APPLETALK",
 254   "rlock-AF_NETROM", "rlock-AF_BRIDGE"   , "rlock-AF_ATMPVC"   ,
 255   "rlock-AF_X25"   , "rlock-AF_INET6"    , "rlock-AF_ROSE"     ,
 256   "rlock-AF_DECnet", "rlock-AF_NETBEUI"  , "rlock-AF_SECURITY" ,
 257   "rlock-AF_KEY"   , "rlock-AF_NETLINK"  , "rlock-AF_PACKET"   ,
 258   "rlock-AF_ASH"   , "rlock-AF_ECONET"   , "rlock-AF_ATMSVC"   ,
 259   "rlock-AF_RDS"   , "rlock-AF_SNA"      , "rlock-AF_IRDA"     ,
 260   "rlock-AF_PPPOX" , "rlock-AF_WANPIPE"  , "rlock-AF_LLC"      ,
 261   "rlock-27"       , "rlock-28"          , "rlock-AF_CAN"      ,
 262   "rlock-AF_TIPC"  , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV"     ,
 263   "rlock-AF_RXRPC" , "rlock-AF_ISDN"     , "rlock-AF_PHONET"   ,
 264   "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG"      ,
 265   "rlock-AF_NFC"   , "rlock-AF_VSOCK"    , "rlock-AF_KCM"      ,
 266   "rlock-AF_QIPCRTR", "rlock-AF_SMC"     , "rlock-AF_XDP"      ,
 267   "rlock-AF_MAX"
 268 };
 269 static const char *const af_family_wlock_key_strings[AF_MAX+1] = {
 270   "wlock-AF_UNSPEC", "wlock-AF_UNIX"     , "wlock-AF_INET"     ,
 271   "wlock-AF_AX25"  , "wlock-AF_IPX"      , "wlock-AF_APPLETALK",
 272   "wlock-AF_NETROM", "wlock-AF_BRIDGE"   , "wlock-AF_ATMPVC"   ,
 273   "wlock-AF_X25"   , "wlock-AF_INET6"    , "wlock-AF_ROSE"     ,
 274   "wlock-AF_DECnet", "wlock-AF_NETBEUI"  , "wlock-AF_SECURITY" ,
 275   "wlock-AF_KEY"   , "wlock-AF_NETLINK"  , "wlock-AF_PACKET"   ,
 276   "wlock-AF_ASH"   , "wlock-AF_ECONET"   , "wlock-AF_ATMSVC"   ,
 277   "wlock-AF_RDS"   , "wlock-AF_SNA"      , "wlock-AF_IRDA"     ,
 278   "wlock-AF_PPPOX" , "wlock-AF_WANPIPE"  , "wlock-AF_LLC"      ,
 279   "wlock-27"       , "wlock-28"          , "wlock-AF_CAN"      ,
 280   "wlock-AF_TIPC"  , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV"     ,
 281   "wlock-AF_RXRPC" , "wlock-AF_ISDN"     , "wlock-AF_PHONET"   ,
 282   "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG"      ,
 283   "wlock-AF_NFC"   , "wlock-AF_VSOCK"    , "wlock-AF_KCM"      ,
 284   "wlock-AF_QIPCRTR", "wlock-AF_SMC"     , "wlock-AF_XDP"      ,
 285   "wlock-AF_MAX"
 286 };
 287 static const char *const af_family_elock_key_strings[AF_MAX+1] = {
 288   "elock-AF_UNSPEC", "elock-AF_UNIX"     , "elock-AF_INET"     ,
 289   "elock-AF_AX25"  , "elock-AF_IPX"      , "elock-AF_APPLETALK",
 290   "elock-AF_NETROM", "elock-AF_BRIDGE"   , "elock-AF_ATMPVC"   ,
 291   "elock-AF_X25"   , "elock-AF_INET6"    , "elock-AF_ROSE"     ,
 292   "elock-AF_DECnet", "elock-AF_NETBEUI"  , "elock-AF_SECURITY" ,
 293   "elock-AF_KEY"   , "elock-AF_NETLINK"  , "elock-AF_PACKET"   ,
 294   "elock-AF_ASH"   , "elock-AF_ECONET"   , "elock-AF_ATMSVC"   ,
 295   "elock-AF_RDS"   , "elock-AF_SNA"      , "elock-AF_IRDA"     ,
 296   "elock-AF_PPPOX" , "elock-AF_WANPIPE"  , "elock-AF_LLC"      ,
 297   "elock-27"       , "elock-28"          , "elock-AF_CAN"      ,
 298   "elock-AF_TIPC"  , "elock-AF_BLUETOOTH", "elock-AF_IUCV"     ,
 299   "elock-AF_RXRPC" , "elock-AF_ISDN"     , "elock-AF_PHONET"   ,
 300   "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG"      ,
 301   "elock-AF_NFC"   , "elock-AF_VSOCK"    , "elock-AF_KCM"      ,
 302   "elock-AF_QIPCRTR", "elock-AF_SMC"     , "elock-AF_XDP"      ,
 303   "elock-AF_MAX"
 304 };
 305
 306 /*
 307  * sk_callback_lock and sk queues locking rules are per-address-family,
 308  * so split the lock classes by using a per-AF key:
 309  */
 310 static struct lock_class_key af_callback_keys[AF_MAX];
 311 static struct lock_class_key af_rlock_keys[AF_MAX];
 312 static struct lock_class_key af_wlock_keys[AF_MAX];
 313 static struct lock_class_key af_elock_keys[AF_MAX];
 314 static struct lock_class_key af_kern_callback_keys[AF_MAX];
 315
 316 /* Run time adjustable parameters. */
 317 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
 318 EXPORT_SYMBOL(sysctl_wmem_max);
 319 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
 320 EXPORT_SYMBOL(sysctl_rmem_max);
 321 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
 322 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 323
 324 /* Maximal space eaten by iovec or ancillary data plus some space */
 325 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 326 EXPORT_SYMBOL(sysctl_optmem_max);
 327
 328 int sysctl_tstamp_allow_data __read_mostly = 1;
 329
 330 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key);
 331 EXPORT_SYMBOL_GPL(memalloc_socks_key);
 332
 333 /**
 334  * sk_set_memalloc - sets %SOCK_MEMALLOC
 335  * @sk: socket to set it on
 336  *
 337  * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
 338  * It's the responsibility of the admin to adjust min_free_kbytes
 339  * to meet the requirements
 340  */
 341 void sk_set_memalloc(struct sock *sk)
 342 {
 343         sock_set_flag(sk, SOCK_MEMALLOC);
 344         sk->sk_allocation |= __GFP_MEMALLOC;
 345         static_branch_inc(&memalloc_socks_key);
 346 }
 347 EXPORT_SYMBOL_GPL(sk_set_memalloc);
 348
 349 void sk_clear_memalloc(struct sock *sk)
 350 {
 351         sock_reset_flag(sk, SOCK_MEMALLOC);
 352         sk->sk_allocation &= ~__GFP_MEMALLOC;
 353         static_branch_dec(&memalloc_socks_key);
 354
 355         /*
 356          * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
 357          * progress of swapping. SOCK_MEMALLOC may be cleared while
 358          * it has rmem allocations due to the last swapfile being deactivated
 359          * but there is a risk that the socket is unusable due to exceeding
 360          * the rmem limits. Reclaim the reserves and obey rmem limits again.
 361          */
 362         sk_mem_reclaim(sk);
 363 }
 364 EXPORT_SYMBOL_GPL(sk_clear_memalloc);
 365
 366 int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
 367 {
 368         int ret;
 369         unsigned int noreclaim_flag;
 370
 371         /* these should have been dropped before queueing */
 372         BUG_ON(!sock_flag(sk, SOCK_MEMALLOC));
 373
 374         noreclaim_flag = memalloc_noreclaim_save();
 375         ret = sk->sk_backlog_rcv(sk, skb);
 376         memalloc_noreclaim_restore(noreclaim_flag);
 377
 378         return ret;
 379 }
 380 EXPORT_SYMBOL(__sk_backlog_rcv);
 381
 382 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 383 {
 384         struct timeval tv;
 385
 386         if (optlen < sizeof(tv))
 387                 return -EINVAL;
 388         if (copy_from_user(&tv, optval, sizeof(tv)))
 389                 return -EFAULT;
 390         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
 391                 return -EDOM;
 392
 393         if (tv.tv_sec < 0) {
 394                 static int warned __read_mostly;
 395
 396                 *timeo_p = 0;
 397                 if (warned < 10 && net_ratelimit()) {
 398                         warned++;
 399                         pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
 400                                 __func__, current->comm, task_pid_nr(current));
 401                 }
 402                 return 0;
 403         }
 404         *timeo_p = MAX_SCHEDULE_TIMEOUT;
 405         if (tv.tv_sec == 0 && tv.tv_usec == 0)
 406                 return 0;
 407         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
 408                 *timeo_p = tv.tv_sec * HZ + DIV_ROUND_UP(tv.tv_usec, USEC_PER_SEC / HZ);
 409         return 0;
 410 }
 411
 412 static void sock_warn_obsolete_bsdism(const char *name)
 413 {
 414         static int warned;
 415         static char warncomm[TASK_COMM_LEN];
 416         if (strcmp(warncomm, current->comm) && warned < 5) {
 417                 strcpy(warncomm,  current->comm);
 418                 pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
 419                         warncomm, name);
 420                 warned++;
 421         }
 422 }
 423
 424 static bool sock_needs_netstamp(const struct sock *sk)
 425 {
 426         switch (sk->sk_family) {
 427         case AF_UNSPEC:
 428         case AF_UNIX:
 429                 return false;
 430         default:
 431                 return true;
 432         }
 433 }
 434
 435 static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
 436 {
 437         if (sk->sk_flags & flags) {
 438                 sk->sk_flags &= ~flags;
 439                 if (sock_needs_netstamp(sk) &&
 440                     !(sk->sk_flags & SK_FLAGS_TIMESTAMP))
 441                         net_disable_timestamp();
 442         }
 443 }
 444
 445
 446 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 447 {
 448         unsigned long flags;
 449         struct sk_buff_head *list = &sk->sk_receive_queue;
 450
 451         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
 452                 atomic_inc(&sk->sk_drops);
 453                 trace_sock_rcvqueue_full(sk, skb);
 454                 return -ENOMEM;
 455         }
 456
 457         if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
 458                 atomic_inc(&sk->sk_drops);
 459                 return -ENOBUFS;
 460         }
 461
 462         skb->dev = NULL;
 463         skb_set_owner_r(skb, sk);
 464
 465         /* we escape from rcu protected region, make sure we dont leak
 466          * a norefcounted dst
 467          */
 468         skb_dst_force(skb);
 469
 470         spin_lock_irqsave(&list->lock, flags);
 471         sock_skb_set_dropcount(sk, skb);
 472         __skb_queue_tail(list, skb);
 473         spin_unlock_irqrestore(&list->lock, flags);
 474
 475         if (!sock_flag(sk, SOCK_DEAD))
 476                 sk->sk_data_ready(sk);
 477         return 0;
 478 }
 479 EXPORT_SYMBOL(__sock_queue_rcv_skb);
 480
 481 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 482 {
 483         int err;
 484
 485         err = sk_filter(sk, skb);
 486         if (err)
 487                 return err;
 488
 489         return __sock_queue_rcv_skb(sk, skb);
 490 }
 491 EXPORT_SYMBOL(sock_queue_rcv_skb);
 492
 493 int __sk_receive_skb(struct sock *sk, struct sk_buff *skb,
 494                      const int nested, unsigned int trim_cap, bool refcounted)
 495 {
 496         int rc = NET_RX_SUCCESS;
 497
 498         if (sk_filter_trim_cap(sk, skb, trim_cap))
 499                 goto discard_and_relse;
 500
 501         skb->dev = NULL;
 502
 503         if (sk_rcvqueues_full(sk, sk->sk_rcvbuf)) {
 504                 atomic_inc(&sk->sk_drops);
 505                 goto discard_and_relse;
 506         }
 507         if (nested)
 508                 bh_lock_sock_nested(sk);
 509         else
 510                 bh_lock_sock(sk);
 511         if (!sock_owned_by_user(sk)) {
 512                 /*
 513                  * trylock + unlock semantics:
 514                  */
 515                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 516
 517                 rc = sk_backlog_rcv(sk, skb);
 518
 519                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 520         } else if (sk_add_backlog(sk, skb, sk->sk_rcvbuf)) {
 521                 bh_unlock_sock(sk);
 522                 atomic_inc(&sk->sk_drops);
 523                 goto discard_and_relse;
 524         }
 525
 526         bh_unlock_sock(sk);
 527 out:
 528         if (refcounted)
 529                 sock_put(sk);
 530         return rc;
 531 discard_and_relse:
 532         kfree_skb(skb);
 533         goto out;
 534 }
 535 EXPORT_SYMBOL(__sk_receive_skb);
 536
 537 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
 538 {
 539         struct dst_entry *dst = __sk_dst_get(sk);
 540
 541         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 542                 sk_tx_queue_clear(sk);
 543                 sk->sk_dst_pending_confirm = 0;
 544                 RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
 545                 dst_release(dst);
 546                 return NULL;
 547         }
 548
 549         return dst;
 550 }
 551 EXPORT_SYMBOL(__sk_dst_check);
 552
 553 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
 554 {
 555         struct dst_entry *dst = sk_dst_get(sk);
 556
 557         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
 558                 sk_dst_reset(sk);
 559                 dst_release(dst);
 560                 return NULL;
 561         }
 562
 563         return dst;
 564 }
 565 EXPORT_SYMBOL(sk_dst_check);
 566
 567 static int sock_setbindtodevice(struct sock *sk, char __user *optval,
 568                                 int optlen)
 569 {
 570         int ret = -ENOPROTOOPT;
 571 #ifdef CONFIG_NETDEVICES
 572         struct net *net = sock_net(sk);
 573         char devname[IFNAMSIZ];
 574         int index;
 575
 576         /* Sorry... */
 577         ret = -EPERM;
 578         if (!ns_capable(net->user_ns, CAP_NET_RAW))
 579                 goto out;
 580
 581         ret = -EINVAL;
 582         if (optlen < 0)
 583                 goto out;
 584
 585         /* Bind this socket to a particular device like "eth0",
 586          * as specified in the passed interface name. If the
 587          * name is "" or the option length is zero the socket
 588          * is not bound.
 589          */
 590         if (optlen > IFNAMSIZ - 1)
 591                 optlen = IFNAMSIZ - 1;
 592         memset(devname, 0, sizeof(devname));
 593
 594         ret = -EFAULT;
 595         if (copy_from_user(devname, optval, optlen))
 596                 goto out;
 597
 598         index = 0;
 599         if (devname[0] != '\0') {
 600                 struct net_device *dev;
 601
 602                 rcu_read_lock();
 603                 dev = dev_get_by_name_rcu(net, devname);
 604                 if (dev)
 605                         index = dev->ifindex;
 606                 rcu_read_unlock();
 607                 ret = -ENODEV;
 608                 if (!dev)
 609                         goto out;
 610         }
 611
 612         lock_sock(sk);
 613         sk->sk_bound_dev_if = index;
 614         sk_dst_reset(sk);
 615         release_sock(sk);
 616
 617         ret = 0;
 618
 619 out:
 620 #endif
 621
 622         return ret;
 623 }
 624
 625 static int sock_getbindtodevice(struct sock *sk, char __user *optval,
 626                                 int __user *optlen, int len)
 627 {
 628         int ret = -ENOPROTOOPT;
 629 #ifdef CONFIG_NETDEVICES
 630         struct net *net = sock_net(sk);
 631         char devname[IFNAMSIZ];
 632
 633         if (sk->sk_bound_dev_if == 0) {
 634                 len = 0;
 635                 goto zero;
 636         }
 637
 638         ret = -EINVAL;
 639         if (len < IFNAMSIZ)
 640                 goto out;
 641
 642         ret = netdev_get_name(net, devname, sk->sk_bound_dev_if);
 643         if (ret)
 644                 goto out;
 645
 646         len = strlen(devname) + 1;
 647
 648         ret = -EFAULT;
 649         if (copy_to_user(optval, devname, len))
 650                 goto out;
 651
 652 zero:
 653         ret = -EFAULT;
 654         if (put_user(len, optlen))
 655                 goto out;
 656
 657         ret = 0;
 658
 659 out:
 660 #endif
 661
 662         return ret;
 663 }
 664
 665 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
 666 {
 667         if (valbool)
 668                 sock_set_flag(sk, bit);
 669         else
 670                 sock_reset_flag(sk, bit);
 671 }
 672
 673 bool sk_mc_loop(struct sock *sk)
 674 {
 675         if (dev_recursion_level())
 676                 return false;
 677         if (!sk)
 678                 return true;
 679         switch (sk->sk_family) {
 680         case AF_INET:
 681                 return inet_sk(sk)->mc_loop;
 682 #if IS_ENABLED(CONFIG_IPV6)
 683         case AF_INET6:
 684                 return inet6_sk(sk)->mc_loop;
 685 #endif
 686         }
 687         WARN_ON(1);
 688         return true;
 689 }
 690 EXPORT_SYMBOL(sk_mc_loop);
 691
 692 /*
 693  *      This is meant for all protocols to use and covers goings on
 694  *      at the socket level. Everything here is generic.
 695  */
 696
 697 int sock_setsockopt(struct socket *sock, int level, int optname,
 698                     char __user *optval, unsigned int optlen)
 699 {
 700         struct sock *sk = sock->sk;
 701         int val;
 702         int valbool;
 703         struct linger ling;
 704         int ret = 0;
 705
 706         /*
 707          *      Options without arguments
 708          */
 709
 710         if (optname == SO_BINDTODEVICE)
 711                 return sock_setbindtodevice(sk, optval, optlen);
 712
 713         if (optlen < sizeof(int))
 714                 return -EINVAL;
 715
 716         if (get_user(val, (int __user *)optval))
 717                 return -EFAULT;
 718
 719         valbool = val ? 1 : 0;
 720
 721         lock_sock(sk);
 722
 723         switch (optname) {
 724         case SO_DEBUG:
 725                 if (val && !capable(CAP_NET_ADMIN))
 726                         ret = -EACCES;
 727                 else
 728                         sock_valbool_flag(sk, SOCK_DBG, valbool);
 729                 break;
 730         case SO_REUSEADDR:
 731                 val = (valbool ? SK_CAN_REUSE : SK_NO_REUSE);
 732                 if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
 733                     inet_sk(sk)->inet_num &&
 734                     (sk->sk_reuse != val)) {
 735                         ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
 736                         break;
 737                 }
 738                 sk->sk_reuse = val;
 739                 break;
 740         case SO_REUSEPORT:
 741                 if ((sk->sk_family == PF_INET || sk->sk_family == PF_INET6) &&
 742                     inet_sk(sk)->inet_num &&
 743                     (sk->sk_reuseport != valbool)) {
 744                         ret = (sk->sk_state == TCP_ESTABLISHED) ? -EISCONN : -EUCLEAN;
 745                         break;
 746                 }
 747                 sk->sk_reuseport = valbool;
 748                 break;
 749         case SO_TYPE:
 750         case SO_PROTOCOL:
 751         case SO_DOMAIN:
 752         case SO_ERROR:
 753                 ret = -ENOPROTOOPT;
 754                 break;
 755         case SO_DONTROUTE:
 756                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 757                 break;
 758         case SO_BROADCAST:
 759                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
 760                 break;
 761         case SO_SNDBUF:
 762                 /* Don't error on this BSD doesn't and if you think
 763                  * about it this is right. Otherwise apps have to
 764                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 765                  * are treated in BSD as hints
 766                  */
 767                 val = min_t(u32, val, sysctl_wmem_max);
 768 set_sndbuf:
 769                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 770                 sk->sk_sndbuf = max_t(int, val * 2, SOCK_MIN_SNDBUF);
 771                 /* Wake up sending tasks if we upped the value. */
 772                 sk->sk_write_space(sk);
 773                 break;
 774
 775         case SO_SNDBUFFORCE:
 776                 if (!capable(CAP_NET_ADMIN)) {
 777                         ret = -EPERM;
 778                         break;
 779                 }
 780                 goto set_sndbuf;
 781
 782         case SO_RCVBUF:
 783                 /* Don't error on this BSD doesn't and if you think
 784                  * about it this is right. Otherwise apps have to
 785                  * play 'guess the biggest size' games. RCVBUF/SNDBUF
 786                  * are treated in BSD as hints
 787                  */
 788                 val = min_t(u32, val, sysctl_rmem_max);
 789 set_rcvbuf:
 790                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
 791                 /*
 792                  * We double it on the way in to account for
 793                  * "struct sk_buff" etc. overhead.   Applications
 794                  * assume that the SO_RCVBUF setting they make will
 795                  * allow that much actual data to be received on that
 796                  * socket.
 797                  *
 798                  * Applications are unaware that "struct sk_buff" and
 799                  * other overheads allocate from the receive buffer
 800                  * during socket buffer allocation.
 801                  *
 802                  * And after considering the possible alternatives,
 803                  * returning the value we actually used in getsockopt
 804                  * is the most desirable behavior.
 805                  */
 806                 sk->sk_rcvbuf = max_t(int, val * 2, SOCK_MIN_RCVBUF);
 807                 break;
 808
 809         case SO_RCVBUFFORCE:
 810                 if (!capable(CAP_NET_ADMIN)) {
 811                         ret = -EPERM;
 812                         break;
 813                 }
 814                 goto set_rcvbuf;
 815
 816         case SO_KEEPALIVE:
 817                 if (sk->sk_prot->keepalive)
 818                         sk->sk_prot->keepalive(sk, valbool);
 819                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
 820                 break;
 821
 822         case SO_OOBINLINE:
 823                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
 824                 break;
 825
 826         case SO_NO_CHECK:
 827                 sk->sk_no_check_tx = valbool;
 828                 break;
 829
 830         case SO_PRIORITY:
 831                 if ((val >= 0 && val <= 6) ||
 832                     ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
 833                         sk->sk_priority = val;
 834                 else
 835                         ret = -EPERM;
 836                 break;
 837
 838         case SO_LINGER:
 839                 if (optlen < sizeof(ling)) {
 840                         ret = -EINVAL;  /* 1003.1g */
 841                         break;
 842                 }
 843                 if (copy_from_user(&ling, optval, sizeof(ling))) {
 844                         ret = -EFAULT;
 845                         break;
 846                 }
 847                 if (!ling.l_onoff)
 848                         sock_reset_flag(sk, SOCK_LINGER);
 849                 else {
 850 #if (BITS_PER_LONG == 32)
 851                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
 852                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
 853                         else
 854 #endif
 855                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
 856                         sock_set_flag(sk, SOCK_LINGER);
 857                 }
 858                 break;
 859
 860         case SO_BSDCOMPAT:
 861                 sock_warn_obsolete_bsdism("setsockopt");
 862                 break;
 863
 864         case SO_PASSCRED:
 865                 if (valbool)
 866                         set_bit(SOCK_PASSCRED, &sock->flags);
 867                 else
 868                         clear_bit(SOCK_PASSCRED, &sock->flags);
 869                 break;
 870
 871         case SO_TIMESTAMP:
 872         case SO_TIMESTAMPNS:
 873                 if (valbool)  {
 874                         if (optname == SO_TIMESTAMP)
 875                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 876                         else
 877                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 878                         sock_set_flag(sk, SOCK_RCVTSTAMP);
 879                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 880                 } else {
 881                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
 882                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 883                 }
 884                 break;
 885
 886         case SO_TIMESTAMPING:
 887                 if (val & ~SOF_TIMESTAMPING_MASK) {
 888                         ret = -EINVAL;
 889                         break;
 890                 }
 891
 892                 if (val & SOF_TIMESTAMPING_OPT_ID &&
 893                     !(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
 894                         if (sk->sk_protocol == IPPROTO_TCP &&
 895                             sk->sk_type == SOCK_STREAM) {
 896                                 if ((1 << sk->sk_state) &
 897                                     (TCPF_CLOSE | TCPF_LISTEN)) {
 898                                         ret = -EINVAL;
 899                                         break;
 900                                 }
 901                                 sk->sk_tskey = tcp_sk(sk)->snd_una;
 902                         } else {
 903                                 sk->sk_tskey = 0;
 904                         }
 905                 }
 906
 907                 if (val & SOF_TIMESTAMPING_OPT_STATS &&
 908                     !(val & SOF_TIMESTAMPING_OPT_TSONLY)) {
 909                         ret = -EINVAL;
 910                         break;
 911                 }
 912
 913                 sk->sk_tsflags = val;
 914                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
 915                         sock_enable_timestamp(sk,
 916                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
 917                 else
 918                         sock_disable_timestamp(sk,
 919                                                (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
 920                 break;
 921
 922         case SO_RCVLOWAT:
 923                 if (val < 0)
 924                         val = INT_MAX;
 925                 if (sock->ops->set_rcvlowat)
 926                         ret = sock->ops->set_rcvlowat(sk, val);
 927                 else
 928                         sk->sk_rcvlowat = val ? : 1;
 929                 break;
 930
 931         case SO_RCVTIMEO:
 932                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
 933                 break;
 934
 935         case SO_SNDTIMEO:
 936                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
 937                 break;
 938
 939         case SO_ATTACH_FILTER:
 940                 ret = -EINVAL;
 941                 if (optlen == sizeof(struct sock_fprog)) {
 942                         struct sock_fprog fprog;
 943
 944                         ret = -EFAULT;
 945                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 946                                 break;
 947
 948                         ret = sk_attach_filter(&fprog, sk);
 949                 }
 950                 break;
 951
 952         case SO_ATTACH_BPF:
 953                 ret = -EINVAL;
 954                 if (optlen == sizeof(u32)) {
 955                         u32 ufd;
 956
 957                         ret = -EFAULT;
 958                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 959                                 break;
 960
 961                         ret = sk_attach_bpf(ufd, sk);
 962                 }
 963                 break;
 964
 965         case SO_ATTACH_REUSEPORT_CBPF:
 966                 ret = -EINVAL;
 967                 if (optlen == sizeof(struct sock_fprog)) {
 968                         struct sock_fprog fprog;
 969
 970                         ret = -EFAULT;
 971                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
 972                                 break;
 973
 974                         ret = sk_reuseport_attach_filter(&fprog, sk);
 975                 }
 976                 break;
 977
 978         case SO_ATTACH_REUSEPORT_EBPF:
 979                 ret = -EINVAL;
 980                 if (optlen == sizeof(u32)) {
 981                         u32 ufd;
 982
 983                         ret = -EFAULT;
 984                         if (copy_from_user(&ufd, optval, sizeof(ufd)))
 985                                 break;
 986
 987                         ret = sk_reuseport_attach_bpf(ufd, sk);
 988                 }
 989                 break;
 990
 991         case SO_DETACH_FILTER:
 992                 ret = sk_detach_filter(sk);
 993                 break;
 994
 995         case SO_LOCK_FILTER:
 996                 if (sock_flag(sk, SOCK_FILTER_LOCKED) && !valbool)
 997                         ret = -EPERM;
 998                 else
 999                         sock_valbool_flag(sk, SOCK_FILTER_LOCKED, valbool);
1000                 break;
1001
1002         case SO_PASSSEC:
1003                 if (valbool)
1004                         set_bit(SOCK_PASSSEC, &sock->flags);
1005                 else
1006                         clear_bit(SOCK_PASSSEC, &sock->flags);
1007                 break;
1008         case SO_MARK:
1009                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
1010                         ret = -EPERM;
1011                 else
1012                         sk->sk_mark = val;
1013                 break;
1014
1015         case SO_RXQ_OVFL:
1016                 sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
1017                 break;
1018
1019         case SO_WIFI_STATUS:
1020                 sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
1021                 break;
1022
1023         case SO_PEEK_OFF:
1024                 if (sock->ops->set_peek_off)
1025                         ret = sock->ops->set_peek_off(sk, val);
1026                 else
1027                         ret = -EOPNOTSUPP;
1028                 break;
1029
1030         case SO_NOFCS:
1031                 sock_valbool_flag(sk, SOCK_NOFCS, valbool);
1032                 break;
1033
1034         case SO_SELECT_ERR_QUEUE:
1035                 sock_valbool_flag(sk, SOCK_SELECT_ERR_QUEUE, valbool);
1036                 break;
1037
1038 #ifdef CONFIG_NET_RX_BUSY_POLL
1039         case SO_BUSY_POLL:
1040                 /* allow unprivileged users to decrease the value */
1041                 if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN))
1042                         ret = -EPERM;
1043                 else {
1044                         if (val < 0)
1045                                 ret = -EINVAL;
1046                         else
1047                                 sk->sk_ll_usec = val;
1048                 }
1049                 break;
1050 #endif
1051
1052         case SO_MAX_PACING_RATE:
1053                 if (val != ~0U)
1054                         cmpxchg(&sk->sk_pacing_status,
1055                                 SK_PACING_NONE,
1056                                 SK_PACING_NEEDED);
1057                 sk->sk_max_pacing_rate = val;
1058                 sk->sk_pacing_rate = min(sk->sk_pacing_rate,
1059                                          sk->sk_max_pacing_rate);
1060                 break;
1061
1062         case SO_INCOMING_CPU:
1063                 sk->sk_incoming_cpu = val;
1064                 break;
1065
1066         case SO_CNX_ADVICE:
1067                 if (val == 1)
1068                         dst_negative_advice(sk);
1069                 break;
1070
1071         case SO_ZEROCOPY:
1072                 if (sk->sk_family == PF_INET || sk->sk_family == PF_INET6) {
1073                         if (sk->sk_protocol != IPPROTO_TCP)
1074                                 ret = -ENOTSUPP;
1075                 } else if (sk->sk_family != PF_RDS) {
1076                         ret = -ENOTSUPP;
1077                 }
1078                 if (!ret) {
1079                         if (val < 0 || val > 1)
1080                                 ret = -EINVAL;
1081                         else
1082                                 sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool);
1083                 }
1084                 break;
1085
1086         default:
1087                 ret = -ENOPROTOOPT;
1088                 break;
1089         }
1090         release_sock(sk);
1091         return ret;
1092 }
1093 EXPORT_SYMBOL(sock_setsockopt);
1094
1095
1096 static void cred_to_ucred(struct pid *pid, const struct cred *cred,
1097                           struct ucred *ucred)
1098 {
1099         ucred->pid = pid_vnr(pid);
1100         ucred->uid = ucred->gid = -1;
1101         if (cred) {
1102                 struct user_namespace *current_ns = current_user_ns();
1103
1104                 ucred->uid = from_kuid_munged(current_ns, cred->euid);
1105                 ucred->gid = from_kgid_munged(current_ns, cred->egid);
1106         }
1107 }
1108
1109 static int groups_to_user(gid_t __user *dst, const struct group_info *src)
1110 {
1111         struct user_namespace *user_ns = current_user_ns();
1112         int i;
1113
1114         for (i = 0; i < src->ngroups; i++)
1115                 if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
1116                         return -EFAULT;
1117
1118         return 0;
1119 }
1120
1121 int sock_getsockopt(struct socket *sock, int level, int optname,
1122                     char __user *optval, int __user *optlen)
1123 {
1124         struct sock *sk = sock->sk;
1125
1126         union {
1127                 int val;
1128                 u64 val64;
1129                 struct linger ling;
1130                 struct timeval tm;
1131         } v;
1132
1133         int lv = sizeof(int);
1134         int len;
1135
1136         if (get_user(len, optlen))
1137                 return -EFAULT;
1138         if (len < 0)
1139                 return -EINVAL;
1140
1141         memset(&v, 0, sizeof(v));
1142
1143         switch (optname) {
1144         case SO_DEBUG:
1145                 v.val = sock_flag(sk, SOCK_DBG);
1146                 break;
1147
1148         case SO_DONTROUTE:
1149                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
1150                 break;
1151
1152         case SO_BROADCAST:
1153                 v.val = sock_flag(sk, SOCK_BROADCAST);
1154                 break;
1155
1156         case SO_SNDBUF:
1157                 v.val = sk->sk_sndbuf;
1158                 break;
1159
1160         case SO_RCVBUF:
1161                 v.val = sk->sk_rcvbuf;
1162                 break;
1163
1164         case SO_REUSEADDR:
1165                 v.val = sk->sk_reuse;
1166                 break;
1167
1168         case SO_REUSEPORT:
1169                 v.val = sk->sk_reuseport;
1170                 break;
1171
1172         case SO_KEEPALIVE:
1173                 v.val = sock_flag(sk, SOCK_KEEPOPEN);
1174                 break;
1175
1176         case SO_TYPE:
1177                 v.val = sk->sk_type;
1178                 break;
1179
1180         case SO_PROTOCOL:
1181                 v.val = sk->sk_protocol;
1182                 break;
1183
1184         case SO_DOMAIN:
1185                 v.val = sk->sk_family;
1186                 break;
1187
1188         case SO_ERROR:
1189                 v.val = -sock_error(sk);
1190                 if (v.val == 0)
1191                         v.val = xchg(&sk->sk_err_soft, 0);
1192                 break;
1193
1194         case SO_OOBINLINE:
1195                 v.val = sock_flag(sk, SOCK_URGINLINE);
1196                 break;
1197
1198         case SO_NO_CHECK:
1199                 v.val = sk->sk_no_check_tx;
1200                 break;
1201
1202         case SO_PRIORITY:
1203                 v.val = sk->sk_priority;
1204                 break;
1205
1206         case SO_LINGER:
1207                 lv              = sizeof(v.ling);
1208                 v.ling.l_onoff  = sock_flag(sk, SOCK_LINGER);
1209                 v.ling.l_linger = sk->sk_lingertime / HZ;
1210                 break;
1211
1212         case SO_BSDCOMPAT:
1213                 sock_warn_obsolete_bsdism("getsockopt");
1214                 break;
1215
1216         case SO_TIMESTAMP:
1217                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
1218                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
1219                 break;
1220
1221         case SO_TIMESTAMPNS:
1222                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
1223                 break;
1224
1225         case SO_TIMESTAMPING:
1226                 v.val = sk->sk_tsflags;
1227                 break;
1228
1229         case SO_RCVTIMEO:
1230                 lv = sizeof(struct timeval);
1231                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
1232                         v.tm.tv_sec = 0;
1233                         v.tm.tv_usec = 0;
1234                 } else {
1235                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
1236                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * USEC_PER_SEC) / HZ;
1237                 }
1238                 break;
1239
1240         case SO_SNDTIMEO:
1241                 lv = sizeof(struct timeval);
1242                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
1243                         v.tm.tv_sec = 0;
1244                         v.tm.tv_usec = 0;
1245                 } else {
1246                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
1247                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * USEC_PER_SEC) / HZ;
1248                 }
1249                 break;
1250
1251         case SO_RCVLOWAT:
1252                 v.val = sk->sk_rcvlowat;
1253                 break;
1254
1255         case SO_SNDLOWAT:
1256                 v.val = 1;
1257                 break;
1258
1259         case SO_PASSCRED:
1260                 v.val = !!test_bit(SOCK_PASSCRED, &sock->flags);
1261                 break;
1262
1263         case SO_PEERCRED:
1264         {
1265                 struct ucred peercred;
1266                 if (len > sizeof(peercred))
1267                         len = sizeof(peercred);
1268                 cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred);
1269                 if (copy_to_user(optval, &peercred, len))
1270                         return -EFAULT;
1271                 goto lenout;
1272         }
1273
1274         case SO_PEERGROUPS:
1275         {
1276                 int ret, n;
1277
1278                 if (!sk->sk_peer_cred)
1279                         return -ENODATA;
1280
1281                 n = sk->sk_peer_cred->group_info->ngroups;
1282                 if (len < n * sizeof(gid_t)) {
1283                         len = n * sizeof(gid_t);
1284                         return put_user(len, optlen) ? -EFAULT : -ERANGE;
1285                 }
1286                 len = n * sizeof(gid_t);
1287
1288                 ret = groups_to_user((gid_t __user *)optval,
1289                                      sk->sk_peer_cred->group_info);
1290                 if (ret)
1291                         return ret;
1292                 goto lenout;
1293         }
1294
1295         case SO_PEERNAME:
1296         {
1297                 char address[128];
1298
1299                 lv = sock->ops->getname(sock, (struct sockaddr *)address, 2);
1300                 if (lv < 0)
1301                         return -ENOTCONN;
1302                 if (lv < len)
1303                         return -EINVAL;
1304                 if (copy_to_user(optval, address, len))
1305                         return -EFAULT;
1306                 goto lenout;
1307         }
1308
1309         /* Dubious BSD thing... Probably nobody even uses it, but
1310          * the UNIX standard wants it for whatever reason... -DaveM
1311          */
1312         case SO_ACCEPTCONN:
1313                 v.val = sk->sk_state == TCP_LISTEN;
1314                 break;
1315
1316         case SO_PASSSEC:
1317                 v.val = !!test_bit(SOCK_PASSSEC, &sock->flags);
1318                 break;
1319
1320         case SO_PEERSEC:
1321                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
1322
1323         case SO_MARK:
1324                 v.val = sk->sk_mark;
1325                 break;
1326
1327         case SO_RXQ_OVFL:
1328                 v.val = sock_flag(sk, SOCK_RXQ_OVFL);
1329                 break;
1330
1331         case SO_WIFI_STATUS:
1332                 v.val = sock_flag(sk, SOCK_WIFI_STATUS);
1333                 break;
1334
1335         case SO_PEEK_OFF:
1336                 if (!sock->ops->set_peek_off)
1337                         return -EOPNOTSUPP;
1338
1339                 v.val = sk->sk_peek_off;
1340                 break;
1341         case SO_NOFCS:
1342                 v.val = sock_flag(sk, SOCK_NOFCS);
1343                 break;
1344
1345         case SO_BINDTODEVICE:
1346                 return sock_getbindtodevice(sk, optval, optlen, len);
1347
1348         case SO_GET_FILTER:
1349                 len = sk_get_filter(sk, (struct sock_filter __user *)optval, len);
1350                 if (len < 0)
1351                         return len;
1352
1353                 goto lenout;
1354
1355         case SO_LOCK_FILTER:
1356                 v.val = sock_flag(sk, SOCK_FILTER_LOCKED);
1357                 break;
1358
1359         case SO_BPF_EXTENSIONS:
1360                 v.val = bpf_tell_extensions();
1361                 break;
1362
1363         case SO_SELECT_ERR_QUEUE:
1364                 v.val = sock_flag(sk, SOCK_SELECT_ERR_QUEUE);
1365                 break;
1366
1367 #ifdef CONFIG_NET_RX_BUSY_POLL
1368         case SO_BUSY_POLL:
1369                 v.val = sk->sk_ll_usec;
1370                 break;
1371 #endif
1372
1373         case SO_MAX_PACING_RATE:
1374                 v.val = sk->sk_max_pacing_rate;
1375                 break;
1376
1377         case SO_INCOMING_CPU:
1378                 v.val = sk->sk_incoming_cpu;
1379                 break;
1380
1381         case SO_MEMINFO:
1382         {
1383                 u32 meminfo[SK_MEMINFO_VARS];
1384
1385                 if (get_user(len, optlen))
1386                         return -EFAULT;
1387
1388                 sk_get_meminfo(sk, meminfo);
1389
1390                 len = min_t(unsigned int, len, sizeof(meminfo));
1391                 if (copy_to_user(optval, &meminfo, len))
1392                         return -EFAULT;
1393
1394                 goto lenout;
1395         }
1396
1397 #ifdef CONFIG_NET_RX_BUSY_POLL
1398         case SO_INCOMING_NAPI_ID:
1399                 v.val = READ_ONCE(sk->sk_napi_id);
1400
1401                 /* aggregate non-NAPI IDs down to 0 */
1402                 if (v.val < MIN_NAPI_ID)
1403                         v.val = 0;
1404
1405                 break;
1406 #endif
1407
1408         case SO_COOKIE:
1409                 lv = sizeof(u64);
1410                 if (len < lv)
1411                         return -EINVAL;
1412                 v.val64 = sock_gen_cookie(sk);
1413                 break;
1414
1415         case SO_ZEROCOPY:
1416                 v.val = sock_flag(sk, SOCK_ZEROCOPY);
1417                 break;
1418
1419         default:
1420                 /* We implement the SO_SNDLOWAT etc to not be settable
1421                  * (1003.1g 7).
1422                  */
1423                 return -ENOPROTOOPT;
1424         }
1425
1426         if (len > lv)
1427                 len = lv;
1428         if (copy_to_user(optval, &v, len))
1429                 return -EFAULT;
1430 lenout:
1431         if (put_user(len, optlen))
1432                 return -EFAULT;
1433         return 0;
1434 }
1435
1436 /*
1437  * Initialize an sk_lock.
1438  *
1439  * (We also register the sk_lock with the lock validator.)
1440  */
1441 static inline void sock_lock_init(struct sock *sk)
1442 {
1443         if (sk->sk_kern_sock)
1444                 sock_lock_init_class_and_name(
1445                         sk,
1446                         af_family_kern_slock_key_strings[sk->sk_family],
1447                         af_family_kern_slock_keys + sk->sk_family,
1448                         af_family_kern_key_strings[sk->sk_family],
1449                         af_family_kern_keys + sk->sk_family);
1450         else
1451                 sock_lock_init_class_and_name(
1452                         sk,
1453                         af_family_slock_key_strings[sk->sk_family],
1454                         af_family_slock_keys + sk->sk_family,
1455                         af_family_key_strings[sk->sk_family],
1456                         af_family_keys + sk->sk_family);
1457 }
1458
1459 /*
1460  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
1461  * even temporarly, because of RCU lookups. sk_node should also be left as is.
1462  * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
1463  */
1464 static void sock_copy(struct sock *nsk, const struct sock *osk)
1465 {
1466 #ifdef CONFIG_SECURITY_NETWORK
1467         void *sptr = nsk->sk_security;
1468 #endif
1469         memcpy(nsk, osk, offsetof(struct sock, sk_dontcopy_begin));
1470
1471         memcpy(&nsk->sk_dontcopy_end, &osk->sk_dontcopy_end,
1472                osk->sk_prot->obj_size - offsetof(struct sock, sk_dontcopy_end));
1473
1474 #ifdef CONFIG_SECURITY_NETWORK
1475         nsk->sk_security = sptr;
1476         security_sk_clone(osk, nsk);
1477 #endif
1478 }
1479
1480 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
1481                 int family)
1482 {
1483         struct sock *sk;
1484         struct kmem_cache *slab;
1485
1486         slab = prot->slab;
1487         if (slab != NULL) {
1488                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
1489                 if (!sk)
1490                         return sk;
1491                 if (priority & __GFP_ZERO)
1492                         sk_prot_clear_nulls(sk, prot->obj_size);
1493         } else
1494                 sk = kmalloc(prot->obj_size, priority);
1495
1496         if (sk != NULL) {
1497                 if (security_sk_alloc(sk, family, priority))
1498                         goto out_free;
1499
1500                 if (!try_module_get(prot->owner))
1501                         goto out_free_sec;
1502                 sk_tx_queue_clear(sk);
1503         }
1504
1505         return sk;
1506
1507 out_free_sec:
1508         security_sk_free(sk);
1509 out_free:
1510         if (slab != NULL)
1511                 kmem_cache_free(slab, sk);
1512         else
1513                 kfree(sk);
1514         return NULL;
1515 }
1516
1517 static void sk_prot_free(struct proto *prot, struct sock *sk)
1518 {
1519         struct kmem_cache *slab;
1520         struct module *owner;
1521
1522         owner = prot->owner;
1523         slab = prot->slab;
1524
1525         cgroup_sk_free(&sk->sk_cgrp_data);
1526         mem_cgroup_sk_free(sk);
1527         security_sk_free(sk);
1528         if (slab != NULL)
1529                 kmem_cache_free(slab, sk);
1530         else
1531                 kfree(sk);
1532         module_put(owner);
1533 }
1534
1535 /**
1536  *      sk_alloc - All socket objects are allocated here
1537  *      @net: the applicable net namespace
1538  *      @family: protocol family
1539  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1540  *      @prot: struct proto associated with this new sock instance
1541  *      @kern: is this to be a kernel socket?
1542  */
1543 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1544                       struct proto *prot, int kern)
1545 {
1546         struct sock *sk;
1547
1548         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1549         if (sk) {
1550                 sk->sk_family = family;
1551                 /*
1552                  * See comment in struct sock definition to understand
1553                  * why we need sk_prot_creator -acme
1554                  */
1555                 sk->sk_prot = sk->sk_prot_creator = prot;
1556                 sk->sk_kern_sock = kern;
1557                 sock_lock_init(sk);
1558                 sk->sk_net_refcnt = kern ? 0 : 1;
1559                 if (likely(sk->sk_net_refcnt)) {
1560                         get_net(net);
1561                         sock_inuse_add(net, 1);
1562                 }
1563
1564                 sock_net_set(sk, net);
1565                 refcount_set(&sk->sk_wmem_alloc, 1);
1566
1567                 mem_cgroup_sk_alloc(sk);
1568                 cgroup_sk_alloc(&sk->sk_cgrp_data);
1569                 sock_update_classid(&sk->sk_cgrp_data);
1570                 sock_update_netprioidx(&sk->sk_cgrp_data);
1571         }
1572
1573         return sk;
1574 }
1575 EXPORT_SYMBOL(sk_alloc);
1576
1577 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
1578  * grace period. This is the case for UDP sockets and TCP listeners.
1579  */
1580 static void __sk_destruct(struct rcu_head *head)
1581 {
1582         struct sock *sk = container_of(head, struct sock, sk_rcu);
1583         struct sk_filter *filter;
1584
1585         if (sk->sk_destruct)
1586                 sk->sk_destruct(sk);
1587
1588         filter = rcu_dereference_check(sk->sk_filter,
1589                                        refcount_read(&sk->sk_wmem_alloc) == 0);
1590         if (filter) {
1591                 sk_filter_uncharge(sk, filter);
1592                 RCU_INIT_POINTER(sk->sk_filter, NULL);
1593         }
1594         if (rcu_access_pointer(sk->sk_reuseport_cb))
1595                 reuseport_detach_sock(sk);
1596
1597         sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
1598
1599         if (atomic_read(&sk->sk_omem_alloc))
1600                 pr_debug("%s: optmem leakage (%d bytes) detected\n",
1601                          __func__, atomic_read(&sk->sk_omem_alloc));
1602
1603         if (sk->sk_frag.page) {
1604                 put_page(sk->sk_frag.page);
1605                 sk->sk_frag.page = NULL;
1606         }
1607
1608         if (sk->sk_peer_cred)
1609                 put_cred(sk->sk_peer_cred);
1610         put_pid(sk->sk_peer_pid);
1611         if (likely(sk->sk_net_refcnt))
1612                 put_net(sock_net(sk));
1613         sk_prot_free(sk->sk_prot_creator, sk);
1614 }
1615
1616 void sk_destruct(struct sock *sk)
1617 {
1618         if (sock_flag(sk, SOCK_RCU_FREE))
1619                 call_rcu(&sk->sk_rcu, __sk_destruct);
1620         else
1621                 __sk_destruct(&sk->sk_rcu);
1622 }
1623
1624 static void __sk_free(struct sock *sk)
1625 {
1626         if (likely(sk->sk_net_refcnt))
1627                 sock_inuse_add(sock_net(sk), -1);
1628
1629         if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
1630                 sock_diag_broadcast_destroy(sk);
1631         else
1632                 sk_destruct(sk);
1633 }
1634
1635 void sk_free(struct sock *sk)
1636 {
1637         /*
1638          * We subtract one from sk_wmem_alloc and can know if
1639          * some packets are still in some tx queue.
1640          * If not null, sock_wfree() will call __sk_free(sk) later
1641          */
1642         if (refcount_dec_and_test(&sk->sk_wmem_alloc))
1643                 __sk_free(sk);
1644 }
1645 EXPORT_SYMBOL(sk_free);
1646
1647 static void sk_init_common(struct sock *sk)
1648 {
1649         skb_queue_head_init(&sk->sk_receive_queue);
1650         skb_queue_head_init(&sk->sk_write_queue);
1651         skb_queue_head_init(&sk->sk_error_queue);
1652
1653         rwlock_init(&sk->sk_callback_lock);
1654         lockdep_set_class_and_name(&sk->sk_receive_queue.lock,
1655                         af_rlock_keys + sk->sk_family,
1656                         af_family_rlock_key_strings[sk->sk_family]);
1657         lockdep_set_class_and_name(&sk->sk_write_queue.lock,
1658                         af_wlock_keys + sk->sk_family,
1659                         af_family_wlock_key_strings[sk->sk_family]);
1660         lockdep_set_class_and_name(&sk->sk_error_queue.lock,
1661                         af_elock_keys + sk->sk_family,
1662                         af_family_elock_key_strings[sk->sk_family]);
1663         lockdep_set_class_and_name(&sk->sk_callback_lock,
1664                         af_callback_keys + sk->sk_family,
1665                         af_family_clock_key_strings[sk->sk_family]);
1666 }
1667
1668 /**
1669  *      sk_clone_lock - clone a socket, and lock its clone
1670  *      @sk: the socket to clone
1671  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1672  *
1673  *      Caller must unlock socket even in error path (bh_unlock_sock(newsk))
      *
      *      Return: the new socket, with bh_lock_sock() held on it, or NULL
      *      when sk_prot_alloc() fails, when the filter cannot be charged to
      *      the clone, or when xfrm_sk_clone_policy() reports an error. In
      *      the last two cases the clone has already been unlocked and freed
      *      through sk_free_unlock_clone().
1674  */
1675 struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
1676 {
1677         struct sock *newsk;
             /* stays true when the parent has no filter to charge */
1678         bool is_charged = true;
1679
1680         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1681         if (newsk != NULL) {
1682                 struct sk_filter *filter;
1683
                     /* start from the parent's state; the fields that must
                      * not be shared are reset one by one below
                      */
1684                 sock_copy(newsk, sk);
1685
1686                 newsk->sk_prot_creator = sk->sk_prot;
1687
1688                 /* SANITY */
1689                 if (likely(newsk->sk_net_refcnt))
1690                         get_net(sock_net(newsk));
1691                 sk_node_init(&newsk->sk_node);
1692                 sock_lock_init(newsk);
                     /* the clone stays locked from here until the caller (or
                      * sk_free_unlock_clone() on failure) unlocks it
                      */
1693                 bh_lock_sock(newsk);
1694                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1695                 newsk->sk_backlog.len = 0;
1696
1697                 atomic_set(&newsk->sk_rmem_alloc, 0);
1698                 /*
1699                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1700                  */
1701                 refcount_set(&newsk->sk_wmem_alloc, 1);
1702                 atomic_set(&newsk->sk_omem_alloc, 0);
1703                 sk_init_common(newsk);
1704
1705                 newsk->sk_dst_cache     = NULL;
1706                 newsk->sk_dst_pending_confirm = 0;
1707                 newsk->sk_wmem_queued   = 0;
1708                 newsk->sk_forward_alloc = 0;
1709                 atomic_set(&newsk->sk_drops, 0);
1710                 newsk->sk_send_head     = NULL;
                     /* every user lock is inherited except SOCK_BINDPORT_LOCK */
1711                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1712                 atomic_set(&newsk->sk_zckey, 0);
1713
1714                 sock_reset_flag(newsk, SOCK_DONE);
1715                 mem_cgroup_sk_alloc(newsk);
1716                 cgroup_sk_alloc(&newsk->sk_cgrp_data);
1717
                     /* the clone points at the parent's filter object; it is
                      * charged to the clone rather than duplicated
                      */
1718                 rcu_read_lock();
1719                 filter = rcu_dereference(sk->sk_filter);
1720                 if (filter != NULL)
1721                         /* though it's an empty new sock, the charging may fail
1722                          * if sysctl_optmem_max was changed between creation of
1723                          * original socket and cloning
1724                          */
1725                         is_charged = sk_filter_charge(newsk, filter);
1726                 RCU_INIT_POINTER(newsk->sk_filter, filter);
1727                 rcu_read_unlock();
1728
1729                 if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
1730                         /* We need to make sure that we don't uncharge the new
1731                          * socket if we couldn't charge it in the first place
1732                          * as otherwise we uncharge the parent's filter.
1733                          */
1734                         if (!is_charged)
1735                                 RCU_INIT_POINTER(newsk->sk_filter, NULL);
1736                         sk_free_unlock_clone(newsk);
1737                         newsk = NULL;
1738                         goto out;
1739                 }
1740                 RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
1741
1742                 newsk->sk_err      = 0;
1743                 newsk->sk_err_soft = 0;
1744                 newsk->sk_priority = 0;
1745                 newsk->sk_incoming_cpu = raw_smp_processor_id();
1746                 atomic64_set(&newsk->sk_cookie, 0);
1747                 if (likely(newsk->sk_net_refcnt))
1748                         sock_inuse_add(sock_net(newsk), 1);
1749
1750                 /*
1751                  * Before updating sk_refcnt, we must commit prior changes to memory
1752                  * (Documentation/RCU/rculist_nulls.txt for details)
1753                  */
1754                 smp_wmb();
                     /* NOTE(review): why the initial count is 2 is not visible
                      * in this function - confirm against the callers
                      */
1755                 refcount_set(&newsk->sk_refcnt, 2);
1756
1757                 /*
1758                  * Increment the counter in the same struct proto as the master
1759                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1760                  * is the same as sk->sk_prot->socks, as this field was copied
1761                  * with memcpy).
1762                  *
1763                  * This _changes_ the previous behaviour, where
1764                  * tcp_create_openreq_child always was incrementing the
1765                  * equivalent to tcp_prot->socks (inet_sock_nr), so this have
1766                  * to be taken into account in all callers. -acme
1767                  */
1768                 sk_refcnt_debug_inc(newsk);
                     /* the clone is not attached to any struct socket yet */
1769                 sk_set_socket(newsk, NULL);
1770                 newsk->sk_wq = NULL;
1771
1772                 if (newsk->sk_prot->sockets_allocated)
1773                         sk_sockets_allocated_inc(newsk);
1774
                     /* the timestamp flag bits were copied from the parent, so
                      * the clone registers its own timestamp use
                      */
1775                 if (sock_needs_netstamp(sk) &&
1776                     newsk->sk_flags & SK_FLAGS_TIMESTAMP)
1777                         net_enable_timestamp();
1778         }
1779 out:
1780         return newsk;
1781 }
1782 EXPORT_SYMBOL_GPL(sk_clone_lock);
1783
1784 void sk_free_unlock_clone(struct sock *sk)
1785 {
1786         /* It is still raw copy of parent, so invalidate
1787          * destructor and make plain sk_free() */
1788         sk->sk_destruct = NULL;
1789         bh_unlock_sock(sk);
1790         sk_free(sk);
1791 }
1792 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
1793
1794 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1795 {
1796         u32 max_segs = 1;
1797
1798         sk_dst_set(sk, dst);
1799         sk->sk_route_caps = dst->dev->features | sk->sk_route_forced_caps;
1800         if (sk->sk_route_caps & NETIF_F_GSO)
1801                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1802         sk->sk_route_caps &= ~sk->sk_route_nocaps;
1803         if (sk_can_gso(sk)) {
1804                 if (dst->header_len && !xfrm_dst_offload_ok(dst)) {
1805                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1806                 } else {
1807                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1808                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1809                         max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
1810                 }
1811         }
1812         sk->sk_gso_max_segs = max_segs;
1813 }
1814 EXPORT_SYMBOL_GPL(sk_setup_caps);
1815
1816 /*
1817  *      Simple resource managers for sockets.
1818  */
1819
1820
1821 /*
1822  * Write buffer destructor automatically called from kfree_skb.
1823  */
1824 void sock_wfree(struct sk_buff *skb)
1825 {
1826         struct sock *sk = skb->sk;
1827         unsigned int len = skb->truesize;
1828
1829         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1830                 /*
1831                  * Keep a reference on sk_wmem_alloc, this will be released
1832                  * after sk_write_space() call
1833                  */
1834                 WARN_ON(refcount_sub_and_test(len - 1, &sk->sk_wmem_alloc));
1835                 sk->sk_write_space(sk);
1836                 len = 1;
1837         }
1838         /*
1839          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1840          * could not do because of in-flight packets
1841          */
1842         if (refcount_sub_and_test(len, &sk->sk_wmem_alloc))
1843                 __sk_free(sk);
1844 }
1845 EXPORT_SYMBOL(sock_wfree);
1846
1847 /* This variant of sock_wfree() is used by TCP,
1848  * since it sets SOCK_USE_WRITE_QUEUE.
1849  */
1850 void __sock_wfree(struct sk_buff *skb)
1851 {
1852         struct sock *sk = skb->sk;
1853
1854         if (refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc))
1855                 __sk_free(sk);
1856 }
1857
1858 void skb_set_owner_w(struct sk_buff *skb, struct sock *sk)
1859 {
1860         skb_orphan(skb);
1861         skb->sk = sk;
1862 #ifdef CONFIG_INET
1863         if (unlikely(!sk_fullsock(sk))) {
1864                 skb->destructor = sock_edemux;
1865                 sock_hold(sk);
1866                 return;
1867         }
1868 #endif
1869         skb->destructor = sock_wfree;
1870         skb_set_hash_from_sk(skb, sk);
1871         /*
1872          * We used to take a refcount on sk, but following operation
1873          * is enough to guarantee sk_free() wont free this sock until
1874          * all in-flight packets are completed
1875          */
1876         refcount_add(skb->truesize, &sk->sk_wmem_alloc);
1877 }
1878 EXPORT_SYMBOL(skb_set_owner_w);
1879
1880 /* This helper is used by netem, as it can hold packets in its
1881  * delay queue. We want to allow the owner socket to send more
1882  * packets, as if they were already TX completed by a typical driver.
1883  * But we also want to keep skb->sk set because some packet schedulers
1884  * rely on it (sch_fq for example).
1885  */
1886 void skb_orphan_partial(struct sk_buff *skb)
1887 {
1888         if (skb_is_tcp_pure_ack(skb))
1889                 return;
1890
1891         if (skb->destructor == sock_wfree
1892 #ifdef CONFIG_INET
1893             || skb->destructor == tcp_wfree
1894 #endif
1895                 ) {
1896                 struct sock *sk = skb->sk;
1897
1898                 if (refcount_inc_not_zero(&sk->sk_refcnt)) {
1899                         WARN_ON(refcount_sub_and_test(skb->truesize, &sk->sk_wmem_alloc));
1900                         skb->destructor = sock_efree;
1901                 }
1902         } else {
1903                 skb_orphan(skb);
1904         }
1905 }
1906 EXPORT_SYMBOL(skb_orphan_partial);
1907
1908 /*
1909  * Read buffer destructor automatically called from kfree_skb.
1910  */
1911 void sock_rfree(struct sk_buff *skb)
1912 {
1913         struct sock *sk = skb->sk;
1914         unsigned int len = skb->truesize;
1915
1916         atomic_sub(len, &sk->sk_rmem_alloc);
1917         sk_mem_uncharge(sk, len);
1918 }
1919 EXPORT_SYMBOL(sock_rfree);
1920
1921 /*
1922  * Buffer destructor for skbs that are not used directly in read or write
1923  * path, e.g. for error handler skbs. Automatically called from kfree_skb.
1924  */
1925 void sock_efree(struct sk_buff *skb)
1926 {
1927         sock_put(skb->sk);
1928 }
1929 EXPORT_SYMBOL(sock_efree);
1930
1931 kuid_t sock_i_uid(struct sock *sk)
1932 {
1933         kuid_t uid;
1934
1935         read_lock_bh(&sk->sk_callback_lock);
1936         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : GLOBAL_ROOT_UID;
1937         read_unlock_bh(&sk->sk_callback_lock);
1938         return uid;
1939 }
1940 EXPORT_SYMBOL(sock_i_uid);
1941
1942 unsigned long sock_i_ino(struct sock *sk)
1943 {
1944         unsigned long ino;
1945
1946         read_lock_bh(&sk->sk_callback_lock);
1947         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1948         read_unlock_bh(&sk->sk_callback_lock);
1949         return ino;
1950 }
1951 EXPORT_SYMBOL(sock_i_ino);
1952
1953 /*
1954  * Allocate a skb from the socket's send buffer.
1955  */
1956 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1957                              gfp_t priority)
1958 {
1959         if (force || refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1960                 struct sk_buff *skb = alloc_skb(size, priority);
1961                 if (skb) {
1962                         skb_set_owner_w(skb, sk);
1963                         return skb;
1964                 }
1965         }
1966         return NULL;
1967 }
1968 EXPORT_SYMBOL(sock_wmalloc);
1969
1970 static void sock_ofree(struct sk_buff *skb)
1971 {
1972         struct sock *sk = skb->sk;
1973
1974         atomic_sub(skb->truesize, &sk->sk_omem_alloc);
1975 }
1976
1977 struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size,
1978                              gfp_t priority)
1979 {
1980         struct sk_buff *skb;
1981
1982         /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
1983         if (atomic_read(&sk->sk_omem_alloc) + SKB_TRUESIZE(size) >
1984             sysctl_optmem_max)
1985                 return NULL;
1986
1987         skb = alloc_skb(size, priority);
1988         if (!skb)
1989                 return NULL;
1990
1991         atomic_add(skb->truesize, &sk->sk_omem_alloc);
1992         skb->sk = sk;
1993         skb->destructor = sock_ofree;
1994         return skb;
1995 }
1996
1997 /*
1998  * Allocate a memory block from the socket's option memory buffer.
1999  */
2000 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
2001 {
2002         if ((unsigned int)size <= sysctl_optmem_max &&
2003             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
2004                 void *mem;
2005                 /* First do the add, to avoid the race if kmalloc
2006                  * might sleep.
2007                  */
2008                 atomic_add(size, &sk->sk_omem_alloc);
2009                 mem = kmalloc(size, priority);
2010                 if (mem)
2011                         return mem;
2012                 atomic_sub(size, &sk->sk_omem_alloc);
2013         }
2014         return NULL;
2015 }
2016 EXPORT_SYMBOL(sock_kmalloc);
2017
2018 /* Free an option memory block. Note, we actually want the inline
2019  * here as this allows gcc to detect the nullify and fold away the
2020  * condition entirely.
2021  */
2022 static inline void __sock_kfree_s(struct sock *sk, void *mem, int size,
2023                                   const bool nullify)
2024 {
2025         if (WARN_ON_ONCE(!mem))
2026                 return;
2027         if (nullify)
2028                 kzfree(mem);
2029         else
2030                 kfree(mem);
2031         atomic_sub(size, &sk->sk_omem_alloc);
2032 }
2033
2034 void sock_kfree_s(struct sock *sk, void *mem, int size)
2035 {
2036         __sock_kfree_s(sk, mem, size, false);
2037 }
2038 EXPORT_SYMBOL(sock_kfree_s);
2039
2040 void sock_kzfree_s(struct sock *sk, void *mem, int size)
2041 {
2042         __sock_kfree_s(sk, mem, size, true);
2043 }
2044 EXPORT_SYMBOL(sock_kzfree_s);
2045
2046 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2047    I think, these locks should be removed for datagram sockets.
2048  */
2049 static long sock_wait_for_wmem(struct sock *sk, long timeo)
2050 {
2051         DEFINE_WAIT(wait);
2052
2053         sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2054         for (;;) {
2055                 if (!timeo)
2056                         break;
2057                 if (signal_pending(current))
2058                         break;
2059                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2060                 prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
2061                 if (refcount_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
2062                         break;
2063                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2064                         break;
2065                 if (sk->sk_err)
2066                         break;
2067                 timeo = schedule_timeout(timeo);
2068         }
2069         finish_wait(sk_sleep(sk), &wait);
2070         return timeo;
2071 }
2072
2073
2074 /*
2075  *      Generic send/receive buffer handlers
2076  */
2077
2078 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
2079                                      unsigned long data_len, int noblock,
2080                                      int *errcode, int max_page_order)
2081 {
2082         struct sk_buff *skb;
2083         long timeo;
2084         int err;
2085
2086         timeo = sock_sndtimeo(sk, noblock);
2087         for (;;) {
2088                 err = sock_error(sk);
2089                 if (err != 0)
2090                         goto failure;
2091
2092                 err = -EPIPE;
2093                 if (sk->sk_shutdown & SEND_SHUTDOWN)
2094                         goto failure;
2095
2096                 if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
2097                         break;
2098
2099                 sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2100                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2101                 err = -EAGAIN;
2102                 if (!timeo)
2103                         goto failure;
2104                 if (signal_pending(current))
2105                         goto interrupted;
2106                 timeo = sock_wait_for_wmem(sk, timeo);
2107         }
2108         skb = alloc_skb_with_frags(header_len, data_len, max_page_order,
2109                                    errcode, sk->sk_allocation);
2110         if (skb)
2111                 skb_set_owner_w(skb, sk);
2112         return skb;
2113
2114 interrupted:
2115         err = sock_intr_errno(timeo);
2116 failure:
2117         *errcode = err;
2118         return NULL;
2119 }
2120 EXPORT_SYMBOL(sock_alloc_send_pskb);
2121
/*
 * Linear-only form of sock_alloc_send_pskb(): @size bytes of header
 * space, no paged data and a page order of 0.
 */
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
				    int noblock, int *errcode)
{
	const unsigned long data_len = 0;
	const int max_page_order = 0;

	return sock_alloc_send_pskb(sk, size, data_len, noblock, errcode,
				    max_page_order);
}
2127 EXPORT_SYMBOL(sock_alloc_send_skb);
2128
2129 int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg,
2130                      struct sockcm_cookie *sockc)
2131 {
2132         u32 tsflags;
2133
2134         switch (cmsg->cmsg_type) {
2135         case SO_MARK:
2136                 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
2137                         return -EPERM;
2138                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2139                         return -EINVAL;
2140                 sockc->mark = *(u32 *)CMSG_DATA(cmsg);
2141                 break;
2142         case SO_TIMESTAMPING:
2143                 if (cmsg->cmsg_len != CMSG_LEN(sizeof(u32)))
2144                         return -EINVAL;
2145
2146                 tsflags = *(u32 *)CMSG_DATA(cmsg);
2147                 if (tsflags & ~SOF_TIMESTAMPING_TX_RECORD_MASK)
2148                         return -EINVAL;
2149
2150                 sockc->tsflags &= ~SOF_TIMESTAMPING_TX_RECORD_MASK;
2151                 sockc->tsflags |= tsflags;
2152                 break;
2153         /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2154         case SCM_RIGHTS:
2155         case SCM_CREDENTIALS:
2156                 break;
2157         default:
2158                 return -EINVAL;
2159         }
2160         return 0;
2161 }
2162 EXPORT_SYMBOL(__sock_cmsg_send);
2163
2164 int sock_cmsg_send(struct sock *sk, struct msghdr *msg,
2165                    struct sockcm_cookie *sockc)
2166 {
2167         struct cmsghdr *cmsg;
2168         int ret;
2169
2170         for_each_cmsghdr(cmsg, msg) {
2171                 if (!CMSG_OK(msg, cmsg))
2172                         return -EINVAL;
2173                 if (cmsg->cmsg_level != SOL_SOCKET)
2174                         continue;
2175                 ret = __sock_cmsg_send(sk, msg, cmsg, sockc);
2176                 if (ret)
2177                         return ret;
2178         }
2179         return 0;
2180 }
2181 EXPORT_SYMBOL(sock_cmsg_send);
2182
2183 static void sk_enter_memory_pressure(struct sock *sk)
2184 {
2185         if (!sk->sk_prot->enter_memory_pressure)
2186                 return;
2187
2188         sk->sk_prot->enter_memory_pressure(sk);
2189 }
2190
2191 static void sk_leave_memory_pressure(struct sock *sk)
2192 {
2193         if (sk->sk_prot->leave_memory_pressure) {
2194                 sk->sk_prot->leave_memory_pressure(sk);
2195         } else {
2196                 unsigned long *memory_pressure = sk->sk_prot->memory_pressure;
2197
2198                 if (memory_pressure && *memory_pressure)
2199                         *memory_pressure = 0;
2200         }
2201 }
2202
2203 /* On 32bit arches, an skb frag is limited to 2^15 */
2204 #define SKB_FRAG_PAGE_ORDER     get_order(32768)
2205
2206 /**
2207  * skb_page_frag_refill - check that a page_frag contains enough room
2208  * @sz: minimum size of the fragment we want to get
2209  * @pfrag: pointer to page_frag
2210  * @gfp: priority for memory allocation
2211  *
2212  * Note: While this allocator tries to use high order pages, there is
2213  * no guarantee that allocations succeed. Therefore, @sz MUST be
2214  * less or equal than PAGE_SIZE.
2215  */
2216 bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
2217 {
2218         if (pfrag->page) {
2219                 if (page_ref_count(pfrag->page) == 1) {
2220                         pfrag->offset = 0;
2221                         return true;
2222                 }
2223                 if (pfrag->offset + sz <= pfrag->size)
2224                         return true;
2225                 put_page(pfrag->page);
2226         }
2227
2228         pfrag->offset = 0;
2229         if (SKB_FRAG_PAGE_ORDER) {
2230                 /* Avoid direct reclaim but allow kswapd to wake */
2231                 pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
2232                                           __GFP_COMP | __GFP_NOWARN |
2233                                           __GFP_NORETRY,
2234                                           SKB_FRAG_PAGE_ORDER);
2235                 if (likely(pfrag->page)) {
2236                         pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
2237                         return true;
2238                 }
2239         }
2240         pfrag->page = alloc_page(gfp);
2241         if (likely(pfrag->page)) {
2242                 pfrag->size = PAGE_SIZE;
2243                 return true;
2244         }
2245         return false;
2246 }
2247 EXPORT_SYMBOL(skb_page_frag_refill);
2248
2249 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
2250 {
2251         if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
2252                 return true;
2253
2254         sk_enter_memory_pressure(sk);
2255         sk_stream_moderate_sndbuf(sk);
2256         return false;
2257 }
2258 EXPORT_SYMBOL(sk_page_frag_refill);
2259
2260 int sk_alloc_sg(struct sock *sk, int len, struct scatterlist *sg,
2261                 int sg_start, int *sg_curr_index, unsigned int *sg_curr_size,
2262                 int first_coalesce)
2263 {
2264         int sg_curr = *sg_curr_index, use = 0, rc = 0;
2265         unsigned int size = *sg_curr_size;
2266         struct page_frag *pfrag;
2267         struct scatterlist *sge;
2268
2269         len -= size;
2270         pfrag = sk_page_frag(sk);
2271
2272         while (len > 0) {
2273                 unsigned int orig_offset;
2274
2275                 if (!sk_page_frag_refill(sk, pfrag)) {
2276                         rc = -ENOMEM;
2277                         goto out;
2278                 }
2279
2280                 use = min_t(int, len, pfrag->size - pfrag->offset);
2281
2282                 if (!sk_wmem_schedule(sk, use)) {
2283                         rc = -ENOMEM;
2284                         goto out;
2285                 }
2286
2287                 sk_mem_charge(sk, use);
2288                 size += use;
2289                 orig_offset = pfrag->offset;
2290                 pfrag->offset += use;
2291
2292                 sge = sg + sg_curr - 1;
2293                 if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
2294                     sg->offset + sg->length == orig_offset) {
2295                         sg->length += use;
2296                 } else {
2297                         sge = sg + sg_curr;
2298                         sg_unmark_end(sge);
2299                         sg_set_page(sge, pfrag->page, use, orig_offset);
2300                         get_page(pfrag->page);
2301                         sg_curr++;
2302
2303                         if (sg_curr == MAX_SKB_FRAGS)
2304                                 sg_curr = 0;
2305
2306                         if (sg_curr == sg_start) {
2307                                 rc = -ENOSPC;
2308                                 break;
2309                         }
2310                 }
2311
2312                 len -= use;
2313         }
2314 out:
2315         *sg_curr_size = size;
2316         *sg_curr_index = sg_curr;
2317         return rc;
2318 }
2319 EXPORT_SYMBOL(sk_alloc_sg);
2320
2321 static void __lock_sock(struct sock *sk)
2322         __releases(&sk->sk_lock.slock)
2323         __acquires(&sk->sk_lock.slock)
2324 {
2325         DEFINE_WAIT(wait);
2326
2327         for (;;) {
2328                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
2329                                         TASK_UNINTERRUPTIBLE);
2330                 spin_unlock_bh(&sk->sk_lock.slock);
2331                 schedule();
2332                 spin_lock_bh(&sk->sk_lock.slock);
2333                 if (!sock_owned_by_user(sk))
2334                         break;
2335         }
2336         finish_wait(&sk->sk_lock.wq, &wait);
2337 }
2338
2339 static void __release_sock(struct sock *sk)
2340         __releases(&sk->sk_lock.slock)
2341         __acquires(&sk->sk_lock.slock)
2342 {
2343         struct sk_buff *skb, *next;
2344
2345         while ((skb = sk->sk_backlog.head) != NULL) {
2346                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
2347
2348                 spin_unlock_bh(&sk->sk_lock.slock);
2349
2350                 do {
2351                         next = skb->next;
2352                         prefetch(next);
2353                         WARN_ON_ONCE(skb_dst_is_noref(skb));
2354                         skb->next = NULL;
2355                         sk_backlog_rcv(sk, skb);
2356
2357                         cond_resched();
2358
2359                         skb = next;
2360                 } while (skb != NULL);
2361
2362                 spin_lock_bh(&sk->sk_lock.slock);
2363         }
2364
2365         /*
2366          * Doing the zeroing here guarantee we can not loop forever
2367          * while a wild producer attempts to flood us.
2368          */
2369         sk->sk_backlog.len = 0;
2370 }
2371
2372 void __sk_flush_backlog(struct sock *sk)
2373 {
2374         spin_lock_bh(&sk->sk_lock.slock);
2375         __release_sock(sk);
2376         spin_unlock_bh(&sk->sk_lock.slock);
2377 }
2378
2379 /**
2380  * sk_wait_data - wait for data to arrive at sk_receive_queue
2381  * @sk:    sock to wait on
2382  * @timeo: for how long
2383  * @skb:   last skb seen on sk_receive_queue
2384  *
2385  * Now socket state including sk->sk_err is changed only under lock,
2386  * hence we may omit checks after joining wait queue.
2387  * We check receive queue before schedule() only as optimization;
2388  * it is very likely that release_sock() added new data.
      *
      * The wait condition is "the tail of sk_receive_queue is no longer
      * @skb". SOCKWQ_ASYNC_WAITDATA is set on the socket for the duration
      * of the wait.
      *
      * Return: the value produced by sk_wait_event() for that condition.
2389  */
2390 int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
2391 {
2392         DEFINE_WAIT_FUNC(wait, woken_wake_function);
2393         int rc;
2394
             /* join the wait queue before the condition is evaluated */
2395         add_wait_queue(sk_sleep(sk), &wait);
2396         sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2397         rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb, &wait);
2398         sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
2399         remove_wait_queue(sk_sleep(sk), &wait);
2400         return rc;
2401 }
2402 EXPORT_SYMBOL(sk_wait_data);
2403
2404 /**
2405  *      __sk_mem_raise_allocated - increase memory_allocated
2406  *      @sk: socket
2407  *      @size: memory size to allocate
2408  *      @amt: pages to allocate
2409  *      @kind: allocation type
2410  *
2411  *      Similar to __sk_mem_schedule(), but does not update sk_forward_alloc
      *
      *      @amt is added to the protocol's allocated counter (and charged to
      *      the socket's memcg, if any) up front. When the allocation is
      *      suppressed, the counter and the memcg charge are rolled back
      *      before returning.
      *
      *      Return: 1 if the allocation is allowed, 0 if it is suppressed.
2412  */
2413 int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind)
2414 {
2415         struct proto *prot = sk->sk_prot;
             /* counter value after @amt has been added */
2416         long allocated = sk_memory_allocated_add(sk, amt);
2417
             /* a failed memcg charge goes straight to the suppression path */
2418         if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
2419             !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
2420                 goto suppress_allocation;
2421
2422         /* Under limit. */
2423         if (allocated <= sk_prot_mem_limits(sk, 0)) {
2424                 sk_leave_memory_pressure(sk);
2425                 return 1;
2426         }
2427
2428         /* Under pressure. */
2429         if (allocated > sk_prot_mem_limits(sk, 1))
2430                 sk_enter_memory_pressure(sk);
2431
2432         /* Over hard limit. */
2433         if (allocated > sk_prot_mem_limits(sk, 2))
2434                 goto suppress_allocation;
2435
2436         /* guarantee minimum buffer size under pressure */
2437         if (kind == SK_MEM_RECV) {
2438                 if (atomic_read(&sk->sk_rmem_alloc) < sk_get_rmem0(sk, prot))
2439                         return 1;
2440
2441         } else { /* SK_MEM_SEND */
2442                 int wmem0 = sk_get_wmem0(sk, prot);
2443
                     /* stream sockets are measured by sk_wmem_queued, all
                      * others by sk_wmem_alloc
                      */
2444                 if (sk->sk_type == SOCK_STREAM) {
2445                         if (sk->sk_wmem_queued < wmem0)
2446                                 return 1;
2447                 } else if (refcount_read(&sk->sk_wmem_alloc) < wmem0) {
2448                                 return 1;
2449                 }
2450         }
2451
2452         if (sk_has_memory_pressure(sk)) {
2453                 int alloc;
2454
2455                 if (!sk_under_memory_pressure(sk))
2456                         return 1;
                     /* allow the request while the hard limit still exceeds
                      * this socket's page usage multiplied by the number of
                      * allocated sockets
                      */
2457                 alloc = sk_sockets_allocated_read_positive(sk);
2458                 if (sk_prot_mem_limits(sk, 2) > alloc *
2459                     sk_mem_pages(sk->sk_wmem_queued +
2460                                  atomic_read(&sk->sk_rmem_alloc) +
2461                                  sk->sk_forward_alloc))
2462                         return 1;
2463         }
2464
2465 suppress_allocation:
2466
2467         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
2468                 sk_stream_moderate_sndbuf(sk);
2469
2470                 /* Fail only if socket is _under_ its sndbuf.
2471                  * In this case we cannot block, so that we have to fail.
2472                  */
2473                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
2474                         return 1;
2475         }
2476
2477         trace_sock_exceed_buf_limit(sk, prot, allocated);
2478
             /* roll back what was added at the top of the function */
2479         sk_memory_allocated_sub(sk, amt);
2480
2481         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2482                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
2483
2484         return 0;
2485 }
2486 EXPORT_SYMBOL(__sk_mem_raise_allocated);
2487
2488 /**
2489  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
2490  *      @sk: socket
2491  *      @size: memory size to allocate
2492  *      @kind: allocation type
2493  *
2494  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
2495  *      rmem allocation. This function assumes that protocols which have
2496  *      memory_pressure use sk_wmem_queued as write buffer accounting.
2497  */
2498 int __sk_mem_schedule(struct sock *sk, int size, int kind)
2499 {
2500         int ret, amt = sk_mem_pages(size);
2501
2502         sk->sk_forward_alloc += amt << SK_MEM_QUANTUM_SHIFT;
2503         ret = __sk_mem_raise_allocated(sk, size, amt, kind);
2504         if (!ret)
2505                 sk->sk_forward_alloc -= amt << SK_MEM_QUANTUM_SHIFT;
2506         return ret;
2507 }
2508 EXPORT_SYMBOL(__sk_mem_schedule);
2509
2510 /**
2511  *      __sk_mem_reduce_allocated - reclaim memory_allocated
2512  *      @sk: socket
2513  *      @amount: number of quanta
2514  *
2515  *      Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
2516  */
2517 void __sk_mem_reduce_allocated(struct sock *sk, int amount)
2518 {
2519         sk_memory_allocated_sub(sk, amount);
2520
2521         if (mem_cgroup_sockets_enabled && sk->sk_memcg)
2522                 mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
2523
2524         if (sk_under_memory_pressure(sk) &&
2525             (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
2526                 sk_leave_memory_pressure(sk);
2527 }
2528 EXPORT_SYMBOL(__sk_mem_reduce_allocated);
2529
2530 /**
2531  *      __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
2532  *      @sk: socket
2533  *      @amount: number of bytes (rounded down to a SK_MEM_QUANTUM multiple)
2534  */
2535 void __sk_mem_reclaim(struct sock *sk, int amount)
2536 {
2537         amount >>= SK_MEM_QUANTUM_SHIFT;
2538         sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
2539         __sk_mem_reduce_allocated(sk, amount);
2540 }
2541 EXPORT_SYMBOL(__sk_mem_reclaim);
2542
2543 int sk_set_peek_off(struct sock *sk, int val)
2544 {
2545         sk->sk_peek_off = val;
2546         return 0;
2547 }
2548 EXPORT_SYMBOL_GPL(sk_set_peek_off);
2549
2550 /*
2551  * Set of default routines for initialising struct proto_ops when
2552  * the protocol does not support a particular function. In certain
2553  * cases where it makes no sense for a protocol to have a "do nothing"
2554  * function, some default processing is provided.
2555  */
2556
/* Default ->bind() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
	return -EOPNOTSUPP;
}
2561 EXPORT_SYMBOL(sock_no_bind);
2562
/* Default ->connect() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_connect(struct socket *sock, struct sockaddr *saddr, int len,
		    int flags)
{
	return -EOPNOTSUPP;
}
2568 EXPORT_SYMBOL(sock_no_connect);
2569
/* Default ->socketpair() stub: ignores both sockets, returns -EOPNOTSUPP. */
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
	return -EOPNOTSUPP;
}
2574 EXPORT_SYMBOL(sock_no_socketpair);
2575
/* Default ->accept() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_accept(struct socket *sock, struct socket *newsock,
		   int flags, bool kern)
{
	return -EOPNOTSUPP;
}
2581 EXPORT_SYMBOL(sock_no_accept);
2582
/* Default ->getname() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_getname(struct socket *sock, struct sockaddr *saddr, int peer)
{
	return -EOPNOTSUPP;
}
2588 EXPORT_SYMBOL(sock_no_getname);
2589
2590 __poll_t sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
2591 {
2592         return 0;
2593 }
2594 EXPORT_SYMBOL(sock_no_poll);
2595
/* Default ->ioctl() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return -EOPNOTSUPP;
}
2600 EXPORT_SYMBOL(sock_no_ioctl);
2601
/* Default ->listen() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_listen(struct socket *sock, int backlog)
{
	return -EOPNOTSUPP;
}
2606 EXPORT_SYMBOL(sock_no_listen);
2607
/* Default ->shutdown() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_shutdown(struct socket *sock, int how)
{
	return -EOPNOTSUPP;
}
2612 EXPORT_SYMBOL(sock_no_shutdown);
2613
2614 int sock_no_setsockopt(struct socket *sock, int level, int optname,
2615                     char __user *optval, unsigned int optlen)
2616 {
2617         return -EOPNOTSUPP;
2618 }
2619 EXPORT_SYMBOL(sock_no_setsockopt);
2620
2621 int sock_no_getsockopt(struct socket *sock, int level, int optname,
2622                     char __user *optval, int __user *optlen)
2623 {
2624         return -EOPNOTSUPP;
2625 }
2626 EXPORT_SYMBOL(sock_no_getsockopt);
2627
/* Default ->sendmsg() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_sendmsg(struct socket *sock, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2632 EXPORT_SYMBOL(sock_no_sendmsg);
2633
/*
 * Stub counterpart of sock_no_sendmsg() that takes a struct sock:
 * ignores its arguments, returns -EOPNOTSUPP.
 */
int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *m, size_t len)
{
	return -EOPNOTSUPP;
}
2638 EXPORT_SYMBOL(sock_no_sendmsg_locked);
2639
/* Default ->recvmsg() stub: ignores its arguments, returns -EOPNOTSUPP. */
int sock_no_recvmsg(struct socket *sock, struct msghdr *m,
		    size_t len, int flags)
{
	return -EOPNOTSUPP;
}
2645 EXPORT_SYMBOL(sock_no_recvmsg);
2646
/*
 * Default ->mmap() stub. Unlike the other stubs it returns -ENODEV,
 * mirroring the error code used when the mmap method is missing.
 */
int sock_no_mmap(struct file *file, struct socket *sock,
		 struct vm_area_struct *vma)
{
	return -ENODEV;
}
2652 EXPORT_SYMBOL(sock_no_mmap);
2653
2654 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
2655 {
2656         ssize_t res;
2657         struct msghdr msg = {.msg_flags = flags};
2658         struct kvec iov;
2659         char *kaddr = kmap(page);
2660         iov.iov_base = kaddr + offset;
2661         iov.iov_len = size;
2662         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
2663         kunmap(page);
2664         return res;
2665 }
2666 EXPORT_SYMBOL(sock_no_sendpage);
2667
2668 ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page,
2669                                 int offset, size_t size, int flags)
2670 {
2671         ssize_t res;
2672         struct msghdr msg = {.msg_flags = flags};
2673         struct kvec iov;
2674         char *kaddr = kmap(page);
2675
2676         iov.iov_base = kaddr + offset;
2677         iov.iov_len = size;
2678         res = kernel_sendmsg_locked(sk, &msg, &iov, 1, size);
2679         kunmap(page);
2680         return res;
2681 }
2682 EXPORT_SYMBOL(sock_no_sendpage_locked);
2683
2684 /*
2685  *      Default Socket Callbacks
2686  */
2687
2688 static void sock_def_wakeup(struct sock *sk)
2689 {
2690         struct socket_wq *wq;
2691
2692         rcu_read_lock();
2693         wq = rcu_dereference(sk->sk_wq);
2694         if (skwq_has_sleeper(wq))
2695                 wake_up_interruptible_all(&wq->wait);
2696         rcu_read_unlock();
2697 }
2698
2699 static void sock_def_error_report(struct sock *sk)
2700 {
2701         struct socket_wq *wq;
2702
2703         rcu_read_lock();
2704         wq = rcu_dereference(sk->sk_wq);
2705         if (skwq_has_sleeper(wq))
2706                 wake_up_interruptible_poll(&wq->wait, EPOLLERR);
2707         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
2708         rcu_read_unlock();
2709 }
2710
2711 static void sock_def_readable(struct sock *sk)
2712 {
2713         struct socket_wq *wq;
2714
2715         rcu_read_lock();
2716         wq = rcu_dereference(sk->sk_wq);
2717         if (skwq_has_sleeper(wq))
2718                 wake_up_interruptible_sync_poll(&wq->wait, EPOLLIN | EPOLLPRI |
2719                                                 EPOLLRDNORM | EPOLLRDBAND);
2720         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
2721         rcu_read_unlock();
2722 }
2723
2724 static void sock_def_write_space(struct sock *sk)
2725 {
2726         struct socket_wq *wq;
2727
2728         rcu_read_lock();
2729
2730         /* Do not wake up a writer until he can make "significant"
2731          * progress.  --DaveM
2732          */
2733         if ((refcount_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
2734                 wq = rcu_dereference(sk->sk_wq);
2735                 if (skwq_has_sleeper(wq))
2736                         wake_up_interruptible_sync_poll(&wq->wait, EPOLLOUT |
2737                                                 EPOLLWRNORM | EPOLLWRBAND);
2738
2739                 /* Should agree with poll, otherwise some programs break */
2740                 if (sock_writeable(sk))
2741                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
2742         }
2743
2744         rcu_read_unlock();
2745 }
2746
/* Default sk_destruct callback (installed by sock_init_data()): no-op. */
static void sock_def_destruct(struct sock *sk)
{
}
2750
2751 void sk_send_sigurg(struct sock *sk)
2752 {
2753         if (sk->sk_socket && sk->sk_socket->file)
2754                 if (send_sigurg(&sk->sk_socket->file->f_owner))
2755                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
2756 }
2757 EXPORT_SYMBOL(sk_send_sigurg);
2758
/*
 * (Re)arm @timer to fire at @expires. A socket reference is taken only
 * when mod_timer() returns zero; sk_stop_timer() is the counterpart
 * that drops a reference.
 */
void sk_reset_timer(struct sock *sk, struct timer_list *timer,
		    unsigned long expires)
{
	if (mod_timer(timer, expires))
		return;

	sock_hold(sk);
}
2765 EXPORT_SYMBOL(sk_reset_timer);
2766
/*
 * Cancel @timer. When del_timer() returns non-zero, drop a socket
 * reference with __sock_put(), balancing the sock_hold() in
 * sk_reset_timer().
 */
void sk_stop_timer(struct sock *sk, struct timer_list *timer)
{
	if (!del_timer(timer))
		return;

	__sock_put(sk);
}
2772 EXPORT_SYMBOL(sk_stop_timer);
2773
2774 void sock_init_data(struct socket *sock, struct sock *sk)
2775 {
2776         sk_init_common(sk);
2777         sk->sk_send_head        =       NULL;
2778
2779         timer_setup(&sk->sk_timer, NULL, 0);
2780
2781         sk->sk_allocation       =       GFP_KERNEL;
2782         sk->sk_rcvbuf           =       sysctl_rmem_default;
2783         sk->sk_sndbuf           =       sysctl_wmem_default;
2784         sk->sk_state            =       TCP_CLOSE;
2785         sk_set_socket(sk, sock);
2786
2787         sock_set_flag(sk, SOCK_ZAPPED);
2788
2789         if (sock) {
2790                 sk->sk_type     =       sock->type;
2791                 sk->sk_wq       =       sock->wq;
2792                 sock->sk        =       sk;
2793                 sk->sk_uid      =       SOCK_INODE(sock)->i_uid;
2794         } else {
2795                 sk->sk_wq       =       NULL;
2796                 sk->sk_uid      =       make_kuid(sock_net(sk)->user_ns, 0);
2797         }
2798
2799         rwlock_init(&sk->sk_callback_lock);
2800         if (sk->sk_kern_sock)
2801                 lockdep_set_class_and_name(
2802                         &sk->sk_callback_lock,
2803                         af_kern_callback_keys + sk->sk_family,
2804                         af_family_kern_clock_key_strings[sk->sk_family]);
2805         else
2806                 lockdep_set_class_and_name(
2807                         &sk->sk_callback_lock,
2808                         af_callback_keys + sk->sk_family,
2809                         af_family_clock_key_strings[sk->sk_family]);
2810
2811         sk->sk_state_change     =       sock_def_wakeup;
2812         sk->sk_data_ready       =       sock_def_readable;
2813         sk->sk_write_space      =       sock_def_write_space;
2814         sk->sk_error_report     =       sock_def_error_report;
2815         sk->sk_destruct         =       sock_def_destruct;
2816
2817         sk->sk_frag.page        =       NULL;
2818         sk->sk_frag.offset      =       0;
2819         sk->sk_peek_off         =       -1;
2820
2821         sk->sk_peer_pid         =       NULL;
2822         sk->sk_peer_cred        =       NULL;
2823         sk->sk_write_pending    =       0;
2824         sk->sk_rcvlowat         =       1;
2825         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
2826         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
2827
2828         sk->sk_stamp = SK_DEFAULT_STAMP;
2829         atomic_set(&sk->sk_zckey, 0);
2830
2831 #ifdef CONFIG_NET_RX_BUSY_POLL
2832         sk->sk_napi_id          =       0;
2833         sk->sk_ll_usec          =       sysctl_net_busy_read;
2834 #endif
2835
2836         sk->sk_max_pacing_rate = ~0U;
2837         sk->sk_pacing_rate = ~0U;
2838         sk->sk_pacing_shift = 10;
2839         sk->sk_incoming_cpu = -1;
2840         /*
2841          * Before updating sk_refcnt, we must commit prior changes to memory
2842          * (Documentation/RCU/rculist_nulls.txt for details)
2843          */
2844         smp_wmb();
2845         refcount_set(&sk->sk_refcnt, 1);
2846         atomic_set(&sk->sk_drops, 0);
2847 }
2848 EXPORT_SYMBOL(sock_init_data);
2849
2850 void lock_sock_nested(struct sock *sk, int subclass)
2851 {
2852         might_sleep();
2853         spin_lock_bh(&sk->sk_lock.slock);
2854         if (sk->sk_lock.owned)
2855                 __lock_sock(sk);
2856         sk->sk_lock.owned = 1;
2857         spin_unlock(&sk->sk_lock.slock);
2858         /*
2859          * The sk_lock has mutex_lock() semantics here:
2860          */
2861         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
2862         local_bh_enable();
2863 }
2864 EXPORT_SYMBOL(lock_sock_nested);
2865
2866 void release_sock(struct sock *sk)
2867 {
2868         spin_lock_bh(&sk->sk_lock.slock);
2869         if (sk->sk_backlog.tail)
2870                 __release_sock(sk);
2871
2872         /* Warning : release_cb() might need to release sk ownership,
2873          * ie call sock_release_ownership(sk) before us.
2874          */
2875         if (sk->sk_prot->release_cb)
2876                 sk->sk_prot->release_cb(sk);
2877
2878         sock_release_ownership(sk);
2879         if (waitqueue_active(&sk->sk_lock.wq))
2880                 wake_up(&sk->sk_lock.wq);
2881         spin_unlock_bh(&sk->sk_lock.slock);
2882 }
2883 EXPORT_SYMBOL(release_sock);
2884
2885 /**
2886  * lock_sock_fast - fast version of lock_sock
2887  * @sk: socket
2888  *
2889  * This version should be used for very small section, where process wont block
2890  * return false if fast path is taken:
2891  *
2892  *   sk_lock.slock locked, owned = 0, BH disabled
2893  *
2894  * return true if slow path is taken:
2895  *
2896  *   sk_lock.slock unlocked, owned = 1, BH enabled
2897  */
2898 bool lock_sock_fast(struct sock *sk)
2899 {
2900         might_sleep();
2901         spin_lock_bh(&sk->sk_lock.slock);
2902
2903         if (!sk->sk_lock.owned)
2904                 /*
2905                  * Note : We must disable BH
2906                  */
2907                 return false;
2908
2909         __lock_sock(sk);
2910         sk->sk_lock.owned = 1;
2911         spin_unlock(&sk->sk_lock.slock);
2912         /*
2913          * The sk_lock has mutex_lock() semantics here:
2914          */
2915         mutex_acquire(&sk->sk_lock.dep_map, 0, 0, _RET_IP_);
2916         local_bh_enable();
2917         return true;
2918 }
2919 EXPORT_SYMBOL(lock_sock_fast);
2920
2921 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
2922 {
2923         struct timeval tv;
2924         if (!sock_flag(sk, SOCK_TIMESTAMP))
2925                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2926         tv = ktime_to_timeval(sk->sk_stamp);
2927         if (tv.tv_sec == -1)
2928                 return -ENOENT;
2929         if (tv.tv_sec == 0) {
2930                 sk->sk_stamp = ktime_get_real();
2931                 tv = ktime_to_timeval(sk->sk_stamp);
2932         }
2933         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
2934 }
2935 EXPORT_SYMBOL(sock_get_timestamp);
2936
2937 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
2938 {
2939         struct timespec ts;
2940         if (!sock_flag(sk, SOCK_TIMESTAMP))
2941                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
2942         ts = ktime_to_timespec(sk->sk_stamp);
2943         if (ts.tv_sec == -1)
2944                 return -ENOENT;
2945         if (ts.tv_sec == 0) {
2946                 sk->sk_stamp = ktime_get_real();
2947                 ts = ktime_to_timespec(sk->sk_stamp);
2948         }
2949         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
2950 }
2951 EXPORT_SYMBOL(sock_get_timestampns);
2952
2953 void sock_enable_timestamp(struct sock *sk, int flag)
2954 {
2955         if (!sock_flag(sk, flag)) {
2956                 unsigned long previous_flags = sk->sk_flags;
2957
2958                 sock_set_flag(sk, flag);
2959                 /*
2960                  * we just set one of the two flags which require net
2961                  * time stamping, but time stamping might have been on
2962                  * already because of the other one
2963                  */
2964                 if (sock_needs_netstamp(sk) &&
2965                     !(previous_flags & SK_FLAGS_TIMESTAMP))
2966                         net_enable_timestamp();
2967         }
2968 }
2969
2970 int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
2971                        int level, int type)
2972 {
2973         struct sock_exterr_skb *serr;
2974         struct sk_buff *skb;
2975         int copied, err;
2976
2977         err = -EAGAIN;
2978         skb = sock_dequeue_err_skb(sk);
2979         if (skb == NULL)
2980                 goto out;
2981
2982         copied = skb->len;
2983         if (copied > len) {
2984                 msg->msg_flags |= MSG_TRUNC;
2985                 copied = len;
2986         }
2987         err = skb_copy_datagram_msg(skb, 0, msg, copied);
2988         if (err)
2989                 goto out_free_skb;
2990
2991         sock_recv_timestamp(msg, sk, skb);
2992
2993         serr = SKB_EXT_ERR(skb);
2994         put_cmsg(msg, level, type, sizeof(serr->ee), &serr->ee);
2995
2996         msg->msg_flags |= MSG_ERRQUEUE;
2997         err = copied;
2998
2999 out_free_skb:
3000         kfree_skb(skb);
3001 out:
3002         return err;
3003 }
3004 EXPORT_SYMBOL(sock_recv_errqueue);
3005
3006 /*
3007  *      Get a socket option on an socket.
3008  *
3009  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
3010  *      asynchronous errors should be reported by getsockopt. We assume
3011  *      this means if you specify SO_ERROR (otherwise whats the point of it).
3012  */
3013 int sock_common_getsockopt(struct socket *sock, int level, int optname,
3014                            char __user *optval, int __user *optlen)
3015 {
3016         struct sock *sk = sock->sk;
3017
3018         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3019 }
3020 EXPORT_SYMBOL(sock_common_getsockopt);
3021
3022 #ifdef CONFIG_COMPAT
3023 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
3024                                   char __user *optval, int __user *optlen)
3025 {
3026         struct sock *sk = sock->sk;
3027
3028         if (sk->sk_prot->compat_getsockopt != NULL)
3029                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
3030                                                       optval, optlen);
3031         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
3032 }
3033 EXPORT_SYMBOL(compat_sock_common_getsockopt);
3034 #endif
3035
3036 int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
3037                         int flags)
3038 {
3039         struct sock *sk = sock->sk;
3040         int addr_len = 0;
3041         int err;
3042
3043         err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT,
3044                                    flags & ~MSG_DONTWAIT, &addr_len);
3045         if (err >= 0)
3046                 msg->msg_namelen = addr_len;
3047         return err;
3048 }
3049 EXPORT_SYMBOL(sock_common_recvmsg);
3050
3051 /*
3052  *      Set socket options on an inet socket.
3053  */
3054 int sock_common_setsockopt(struct socket *sock, int level, int optname,
3055                            char __user *optval, unsigned int optlen)
3056 {
3057         struct sock *sk = sock->sk;
3058
3059         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3060 }
3061 EXPORT_SYMBOL(sock_common_setsockopt);
3062
3063 #ifdef CONFIG_COMPAT
3064 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
3065                                   char __user *optval, unsigned int optlen)
3066 {
3067         struct sock *sk = sock->sk;
3068
3069         if (sk->sk_prot->compat_setsockopt != NULL)
3070                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
3071                                                       optval, optlen);
3072         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
3073 }
3074 EXPORT_SYMBOL(compat_sock_common_setsockopt);
3075 #endif
3076
3077 void sk_common_release(struct sock *sk)
3078 {
3079         if (sk->sk_prot->destroy)
3080                 sk->sk_prot->destroy(sk);
3081
3082         /*
3083          * Observation: when sock_common_release is called, processes have
3084          * no access to socket. But net still has.
3085          * Step one, detach it from networking:
3086          *
3087          * A. Remove from hash tables.
3088          */
3089
3090         sk->sk_prot->unhash(sk);
3091
3092         /*
3093          * In this point socket cannot receive new packets, but it is possible
3094          * that some packets are in flight because some CPU runs receiver and
3095          * did hash table lookup before we unhashed socket. They will achieve
3096          * receive queue and will be purged by socket destructor.
3097          *
3098          * Also we still have packets pending on receive queue and probably,
3099          * our own packets waiting in device queues. sock_destroy will drain
3100          * receive queue, but transmitted packets will delay socket destruction
3101          * until the last reference will be released.
3102          */
3103
3104         sock_orphan(sk);
3105
3106         xfrm_sk_free_policy(sk);
3107
3108         sk_refcnt_debug_release(sk);
3109
3110         sock_put(sk);
3111 }
3112 EXPORT_SYMBOL(sk_common_release);
3113
3114 void sk_get_meminfo(const struct sock *sk, u32 *mem)
3115 {
3116         memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS);
3117
3118         mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
3119         mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
3120         mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
3121         mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
3122         mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
3123         mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
3124         mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
3125         mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len;
3126         mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops);
3127 }
3128
3129 #ifdef CONFIG_PROC_FS
3130 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
3131 struct prot_inuse {
3132         int val[PROTO_INUSE_NR];
3133 };
3134
3135 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
3136
3137 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
3138 {
3139         __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val);
3140 }
3141 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
3142
3143 int sock_prot_inuse_get(struct net *net, struct proto *prot)
3144 {
3145         int cpu, idx = prot->inuse_idx;
3146         int res = 0;
3147
3148         for_each_possible_cpu(cpu)
3149                 res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx];
3150
3151         return res >= 0 ? res : 0;
3152 }
3153 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
3154
3155 static void sock_inuse_add(struct net *net, int val)
3156 {
3157         this_cpu_add(*net->core.sock_inuse, val);
3158 }
3159
3160 int sock_inuse_get(struct net *net)
3161 {
3162         int cpu, res = 0;
3163
3164         for_each_possible_cpu(cpu)
3165                 res += *per_cpu_ptr(net->core.sock_inuse, cpu);
3166
3167         return res;
3168 }
3169
3170 EXPORT_SYMBOL_GPL(sock_inuse_get);
3171
3172 static int __net_init sock_inuse_init_net(struct net *net)
3173 {
3174         net->core.prot_inuse = alloc_percpu(struct prot_inuse);
3175         if (net->core.prot_inuse == NULL)
3176                 return -ENOMEM;
3177
3178         net->core.sock_inuse = alloc_percpu(int);
3179         if (net->core.sock_inuse == NULL)
3180                 goto out;
3181
3182         return 0;
3183
3184 out:
3185         free_percpu(net->core.prot_inuse);
3186         return -ENOMEM;
3187 }
3188
3189 static void __net_exit sock_inuse_exit_net(struct net *net)
3190 {
3191         free_percpu(net->core.prot_inuse);
3192         free_percpu(net->core.sock_inuse);
3193 }
3194
3195 static struct pernet_operations net_inuse_ops = {
3196         .init = sock_inuse_init_net,
3197         .exit = sock_inuse_exit_net,
3198 };
3199
3200 static __init int net_inuse_init(void)
3201 {
3202         if (register_pernet_subsys(&net_inuse_ops))
3203                 panic("Cannot initialize net inuse counters");
3204
3205         return 0;
3206 }
3207
3208 core_initcall(net_inuse_init);
3209
3210 static void assign_proto_idx(struct proto *prot)
3211 {
3212         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
3213
3214         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
3215                 pr_err("PROTO_INUSE_NR exhausted\n");
3216                 return;
3217         }
3218
3219         set_bit(prot->inuse_idx, proto_inuse_idx);
3220 }
3221
3222 static void release_proto_idx(struct proto *prot)
3223 {
3224         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
3225                 clear_bit(prot->inuse_idx, proto_inuse_idx);
3226 }
3227 #else
/* !CONFIG_PROC_FS: no in-use accounting, nothing to assign. */
static inline void assign_proto_idx(struct proto *prot)
{
}
3231
/* !CONFIG_PROC_FS: no in-use accounting, nothing to release. */
static inline void release_proto_idx(struct proto *prot)
{
}
3235
/* !CONFIG_PROC_FS: socket in-use counters are not kept. */
static void sock_inuse_add(struct net *net, int val)
{
}
3239 #endif
3240
3241 static void req_prot_cleanup(struct request_sock_ops *rsk_prot)
3242 {
3243         if (!rsk_prot)
3244                 return;
3245         kfree(rsk_prot->slab_name);
3246         rsk_prot->slab_name = NULL;
3247         kmem_cache_destroy(rsk_prot->slab);
3248         rsk_prot->slab = NULL;
3249 }
3250
3251 static int req_prot_init(const struct proto *prot)
3252 {
3253         struct request_sock_ops *rsk_prot = prot->rsk_prot;
3254
3255         if (!rsk_prot)
3256                 return 0;
3257
3258         rsk_prot->slab_name = kasprintf(GFP_KERNEL, "request_sock_%s",
3259                                         prot->name);
3260         if (!rsk_prot->slab_name)
3261                 return -ENOMEM;
3262
3263         rsk_prot->slab = kmem_cache_create(rsk_prot->slab_name,
3264                                            rsk_prot->obj_size, 0,
3265                                            prot->slab_flags, NULL);
3266
3267         if (!rsk_prot->slab) {
3268                 pr_crit("%s: Can't create request sock SLAB cache!\n",
3269                         prot->name);
3270                 return -ENOMEM;
3271         }
3272         return 0;
3273 }
3274
3275 int proto_register(struct proto *prot, int alloc_slab)
3276 {
3277         if (alloc_slab) {
3278                 prot->slab = kmem_cache_create_usercopy(prot->name,
3279                                         prot->obj_size, 0,
3280                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
3281                                         prot->useroffset, prot->usersize,
3282                                         NULL);
3283
3284                 if (prot->slab == NULL) {
3285                         pr_crit("%s: Can't create sock SLAB cache!\n",
3286                                 prot->name);
3287                         goto out;
3288                 }
3289
3290                 if (req_prot_init(prot))
3291                         goto out_free_request_sock_slab;
3292
3293                 if (prot->twsk_prot != NULL) {
3294                         prot->twsk_prot->twsk_slab_name = kasprintf(GFP_KERNEL, "tw_sock_%s", prot->name);
3295
3296                         if (prot->twsk_prot->twsk_slab_name == NULL)
3297                                 goto out_free_request_sock_slab;
3298
3299                         prot->twsk_prot->twsk_slab =
3300                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
3301                                                   prot->twsk_prot->twsk_obj_size,
3302                                                   0,
3303                                                   prot->slab_flags,
3304                                                   NULL);
3305                         if (prot->twsk_prot->twsk_slab == NULL)
3306                                 goto out_free_timewait_sock_slab_name;
3307                 }
3308         }
3309
3310         mutex_lock(&proto_list_mutex);
3311         list_add(&prot->node, &proto_list);
3312         assign_proto_idx(prot);
3313         mutex_unlock(&proto_list_mutex);
3314         return 0;
3315
3316 out_free_timewait_sock_slab_name:
3317         kfree(prot->twsk_prot->twsk_slab_name);
3318 out_free_request_sock_slab:
3319         req_prot_cleanup(prot->rsk_prot);
3320
3321         kmem_cache_destroy(prot->slab);
3322         prot->slab = NULL;
3323 out:
3324         return -ENOBUFS;
3325 }
3326 EXPORT_SYMBOL(proto_register);
3327
3328 void proto_unregister(struct proto *prot)
3329 {
3330         mutex_lock(&proto_list_mutex);
3331         release_proto_idx(prot);
3332         list_del(&prot->node);
3333         mutex_unlock(&proto_list_mutex);
3334
3335         kmem_cache_destroy(prot->slab);
3336         prot->slab = NULL;
3337
3338         req_prot_cleanup(prot->rsk_prot);
3339
3340         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
3341                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
3342                 kfree(prot->twsk_prot->twsk_slab_name);
3343                 prot->twsk_prot->twsk_slab = NULL;
3344         }
3345 }
3346 EXPORT_SYMBOL(proto_unregister);
3347
3348 int sock_load_diag_module(int family, int protocol)
3349 {
3350         if (!protocol) {
3351                 if (!sock_is_registered(family))
3352                         return -ENOENT;
3353
3354                 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
3355                                       NETLINK_SOCK_DIAG, family);
3356         }
3357
3358 #ifdef CONFIG_INET
3359         if (family == AF_INET &&
3360             !rcu_access_pointer(inet_protos[protocol]))
3361                 return -ENOENT;
3362 #endif
3363
3364         return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK,
3365                               NETLINK_SOCK_DIAG, family, protocol);
3366 }
3367 EXPORT_SYMBOL(sock_load_diag_module);
3368
3369 #ifdef CONFIG_PROC_FS
3370 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
3371         __acquires(proto_list_mutex)
3372 {
3373         mutex_lock(&proto_list_mutex);
3374         return seq_list_start_head(&proto_list, *pos);
3375 }
3376
3377 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3378 {
3379         return seq_list_next(v, &proto_list, pos);
3380 }
3381
3382 static void proto_seq_stop(struct seq_file *seq, void *v)
3383         __releases(proto_list_mutex)
3384 {
3385         mutex_unlock(&proto_list_mutex);
3386 }
3387
/* Map a method pointer to 'y' (set) or 'n' (NULL) for the proc listing. */
static char proto_method_implemented(const void *method)
{
	if (method == NULL)
		return 'n';

	return 'y';
}
3392 static long sock_prot_memory_allocated(struct proto *proto)
3393 {
3394         return proto->memory_allocated != NULL ? proto_memory_allocated(proto) : -1L;
3395 }
3396
3397 static char *sock_prot_memory_pressure(struct proto *proto)
3398 {
3399         return proto->memory_pressure != NULL ?
3400         proto_memory_pressure(proto) ? "yes" : "no" : "NI";
3401 }
3402
3403 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
3404 {
3405
3406         seq_printf(seq, "%-9s %4u %6d  %6ld   %-3s %6u   %-3s  %-10s "
3407                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
3408                    proto->name,
3409                    proto->obj_size,
3410                    sock_prot_inuse_get(seq_file_net(seq), proto),
3411                    sock_prot_memory_allocated(proto),
3412                    sock_prot_memory_pressure(proto),
3413                    proto->max_header,
3414                    proto->slab == NULL ? "no" : "yes",
3415                    module_name(proto->owner),
3416                    proto_method_implemented(proto->close),
3417                    proto_method_implemented(proto->connect),
3418                    proto_method_implemented(proto->disconnect),
3419                    proto_method_implemented(proto->accept),
3420                    proto_method_implemented(proto->ioctl),
3421                    proto_method_implemented(proto->init),
3422                    proto_method_implemented(proto->destroy),
3423                    proto_method_implemented(proto->shutdown),
3424                    proto_method_implemented(proto->setsockopt),
3425                    proto_method_implemented(proto->getsockopt),
3426                    proto_method_implemented(proto->sendmsg),
3427                    proto_method_implemented(proto->recvmsg),
3428                    proto_method_implemented(proto->sendpage),
3429                    proto_method_implemented(proto->bind),
3430                    proto_method_implemented(proto->backlog_rcv),
3431                    proto_method_implemented(proto->hash),
3432                    proto_method_implemented(proto->unhash),
3433                    proto_method_implemented(proto->get_port),
3434                    proto_method_implemented(proto->enter_memory_pressure));
3435 }
3436
3437 static int proto_seq_show(struct seq_file *seq, void *v)
3438 {
3439         if (v == &proto_list)
3440                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
3441                            "protocol",
3442                            "size",
3443                            "sockets",
3444                            "memory",
3445                            "press",
3446                            "maxhdr",
3447                            "slab",
3448                            "module",
3449                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
3450         else
3451                 proto_seq_printf(seq, list_entry(v, struct proto, node));
3452         return 0;
3453 }
3454
3455 static const struct seq_operations proto_seq_ops = {
3456         .start  = proto_seq_start,
3457         .next   = proto_seq_next,
3458         .stop   = proto_seq_stop,
3459         .show   = proto_seq_show,
3460 };
3461
3462 static int proto_seq_open(struct inode *inode, struct file *file)
3463 {
3464         return seq_open_net(inode, file, &proto_seq_ops,
3465                             sizeof(struct seq_net_private));
3466 }
3467
3468 static const struct file_operations proto_seq_fops = {
3469         .open           = proto_seq_open,
3470         .read           = seq_read,
3471         .llseek         = seq_lseek,
3472         .release        = seq_release_net,
3473 };
3474
3475 static __net_init int proto_init_net(struct net *net)
3476 {
3477         if (!proc_create("protocols", 0444, net->proc_net, &proto_seq_fops))
3478                 return -ENOMEM;
3479
3480         return 0;
3481 }
3482
3483 static __net_exit void proto_exit_net(struct net *net)
3484 {
3485         remove_proc_entry("protocols", net->proc_net);
3486 }
3487
3488
3489 static __net_initdata struct pernet_operations proto_net_ops = {
3490         .init = proto_init_net,
3491         .exit = proto_exit_net,
3492 };
3493
3494 static int __init proto_init(void)
3495 {
3496         return register_pernet_subsys(&proto_net_ops);
3497 }
3498
3499 subsys_initcall(proto_init);
3500
3501 #endif /* PROC_FS */
3502
3503 #ifdef CONFIG_NET_RX_BUSY_POLL
3504 bool sk_busy_loop_end(void *p, unsigned long start_time)
3505 {
3506         struct sock *sk = p;
3507
3508         return !skb_queue_empty(&sk->sk_receive_queue) ||
3509                sk_busy_loop_timeout(sk, start_time);
3510 }
3511 EXPORT_SYMBOL(sk_busy_loop_end);
3512 #endif /* CONFIG_NET_RX_BUSY_POLL */