net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <linux/bpf.h>
  98 #include <net/net_namespace.h>
  99 #include <net/sock.h>
 100 #include <net/busy_poll.h>
 101 #include <linux/rtnetlink.h>
 102 #include <linux/stat.h>
 103 #include <net/dst.h>
 104 #include <net/dst_metadata.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <net/xfrm.h>
 108 #include <linux/highmem.h>
 109 #include <linux/init.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/iw_handler.h>
 115 #include <asm/current.h>
 116 #include <linux/audit.h>
 117 #include <linux/dmaengine.h>
 118 #include <linux/err.h>
 119 #include <linux/ctype.h>
 120 #include <linux/if_arp.h>
 121 #include <linux/if_vlan.h>
 122 #include <linux/ip.h>
 123 #include <net/ip.h>
 124 #include <net/mpls.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129 #include <trace/events/napi.h>
 130 #include <trace/events/net.h>
 131 #include <trace/events/skb.h>
 132 #include <linux/pci.h>
 133 #include <linux/inetdevice.h>
 134 #include <linux/cpu_rmap.h>
 135 #include <linux/static_key.h>
 136 #include <linux/hashtable.h>
 137 #include <linux/vmalloc.h>
 138 #include <linux/if_macvlan.h>
 139 #include <linux/errqueue.h>
 140 #include <linux/hrtimer.h>
 141 #include <linux/netfilter_ingress.h>
 142 #include <linux/crash_dump.h>
 143
 144 #include "net-sysfs.h"
 145
 146 /* Instead of increasing this, you should create a hash table. */
 147 #define MAX_GRO_SKBS 8
 148
 149 /* This should be increased if a protocol with a bigger head is added. */
 150 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 151
 152 static DEFINE_SPINLOCK(ptype_lock);
 153 static DEFINE_SPINLOCK(offload_lock);
 154 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 155 struct list_head ptype_all __read_mostly;       /* Taps */
 156 static struct list_head offload_base __read_mostly;
 157
 158 static int netif_rx_internal(struct sk_buff *skb);
 159 static int call_netdevice_notifiers_info(unsigned long val,
 160                                          struct net_device *dev,
 161                                          struct netdev_notifier_info *info);
 162
 163 /*
 164  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 165  * semaphore.
 166  *
 167  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 168  *
 169  * Writers must hold the rtnl semaphore while they loop through the
 170  * dev_base_head list, and hold dev_base_lock for writing when they do the
 171  * actual updates.  This allows pure readers to access the list even
 172  * while a writer is preparing to update it.
 173  *
 174  * To put it another way, dev_base_lock is held for writing only to
 175  * protect against pure readers; the rtnl semaphore provides the
 176  * protection against other writers.
 177  *
 178  * See, for example usages, register_netdevice() and
 179  * unregister_netdevice(), which must be called with the rtnl
 180  * semaphore held.
 181  */
 182 DEFINE_RWLOCK(dev_base_lock);
 183 EXPORT_SYMBOL(dev_base_lock);
 184
 185 /* protects napi_hash addition/deletion and napi_gen_id */
 186 static DEFINE_SPINLOCK(napi_hash_lock);
 187
 188 static unsigned int napi_gen_id = NR_CPUS;
 189 static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
 190
 191 static seqcount_t devnet_rename_seq;
 192
 193 static inline void dev_base_seq_inc(struct net *net)
 194 {
 195         while (++net->dev_base_seq == 0);
 196 }
 197
 198 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 199 {
 200         unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
 201
 202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208 }
 209
 210 static inline void rps_lock(struct softnet_data *sd)
 211 {
 212 #ifdef CONFIG_RPS
 213         spin_lock(&sd->input_pkt_queue.lock);
 214 #endif
 215 }
 216
 217 static inline void rps_unlock(struct softnet_data *sd)
 218 {
 219 #ifdef CONFIG_RPS
 220         spin_unlock(&sd->input_pkt_queue.lock);
 221 #endif
 222 }
 223
 224 /* Device list insertion */
 225 static void list_netdevice(struct net_device *dev)
 226 {
 227         struct net *net = dev_net(dev);
 228
 229         ASSERT_RTNL();
 230
 231         write_lock_bh(&dev_base_lock);
 232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234         hlist_add_head_rcu(&dev->index_hlist,
 235                            dev_index_hash(net, dev->ifindex));
 236         write_unlock_bh(&dev_base_lock);
 237
 238         dev_base_seq_inc(net);
 239 }
 240
 241 /* Device list removal
 242  * caller must respect a RCU grace period before freeing/reusing dev
 243  */
 244 static void unlist_netdevice(struct net_device *dev)
 245 {
 246         ASSERT_RTNL();
 247
 248         /* Unlink dev from the device chain */
 249         write_lock_bh(&dev_base_lock);
 250         list_del_rcu(&dev->dev_list);
 251         hlist_del_rcu(&dev->name_hlist);
 252         hlist_del_rcu(&dev->index_hlist);
 253         write_unlock_bh(&dev_base_lock);
 254
 255         dev_base_seq_inc(dev_net(dev));
 256 }
 257
 258 /*
 259  *      Our notifier list
 260  */
 261
 262 static RAW_NOTIFIER_HEAD(netdev_chain);
 263
 264 /*
 265  *      Device drivers call our routines to queue packets here. We empty the
 266  *      queue in the local softnet handler.
 267  */
 268
 269 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 270 EXPORT_PER_CPU_SYMBOL(softnet_data);
 271
 272 #ifdef CONFIG_LOCKDEP
 273 /*
 274  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 275  * according to dev->type
 276  */
 277 static const unsigned short netdev_lock_type[] =
 278         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 279          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 280          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 281          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 282          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 283          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 284          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 285          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 286          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 287          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 288          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 289          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 290          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 291          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 292          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 293
 294 static const char *const netdev_lock_name[] =
 295         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 296          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 297          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 298          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 299          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 300          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 301          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 302          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 303          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 304          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 305          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 306          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 307          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 308          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 309          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 310
 311 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 313
 314 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 315 {
 316         int i;
 317
 318         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 319                 if (netdev_lock_type[i] == dev_type)
 320                         return i;
 321         /* the last key is used by default */
 322         return ARRAY_SIZE(netdev_lock_type) - 1;
 323 }
 324
 325 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 326                                                  unsigned short dev_type)
 327 {
 328         int i;
 329
 330         i = netdev_lock_pos(dev_type);
 331         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 332                                    netdev_lock_name[i]);
 333 }
 334
 335 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 336 {
 337         int i;
 338
 339         i = netdev_lock_pos(dev->type);
 340         lockdep_set_class_and_name(&dev->addr_list_lock,
 341                                    &netdev_addr_lock_key[i],
 342                                    netdev_lock_name[i]);
 343 }
 344 #else
 345 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 346                                                  unsigned short dev_type)
 347 {
 348 }
 349 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 350 {
 351 }
 352 #endif
 353
 354 /*******************************************************************************
 355
 356                 Protocol management and registration routines
 357
 358 *******************************************************************************/
 359
 360 /*
 361  *      Add a protocol ID to the list. Now that the input handler is
 362  *      smarter we can dispense with all the messy stuff that used to be
 363  *      here.
 364  *
 365  *      BEWARE!!! Protocol handlers, mangling input packets,
 366  *      MUST BE last in hash buckets and checking protocol handlers
 367  *      MUST start from promiscuous ptype_all chain in net_bh.
 368  *      It is true now, do not change it.
 369  *      Explanation follows: if protocol handler, mangling packet, will
 370  *      be the first on list, it is not able to sense, that packet
 371  *      is cloned and should be copied-on-write, so that it will
 372  *      change it and subsequent readers will get broken packet.
 373  *                                                      --ANK (980803)
 374  */
 375
 376 static inline struct list_head *ptype_head(const struct packet_type *pt)
 377 {
 378         if (pt->type == htons(ETH_P_ALL))
 379                 return pt->dev ? &pt->dev->ptype_all : &ptype_all;
 380         else
 381                 return pt->dev ? &pt->dev->ptype_specific :
 382                                  &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 383 }
 384
 385 /**
 386  *      dev_add_pack - add packet handler
 387  *      @pt: packet type declaration
 388  *
 389  *      Add a protocol handler to the networking stack. The passed &packet_type
 390  *      is linked into kernel lists and may not be freed until it has been
 391  *      removed from the kernel lists.
 392  *
 393  *      This call does not sleep therefore it can not
 394  *      guarantee all CPU's that are in middle of receiving packets
 395  *      will see the new packet type (until the next received packet).
 396  */
 397
 398 void dev_add_pack(struct packet_type *pt)
 399 {
 400         struct list_head *head = ptype_head(pt);
 401
 402         spin_lock(&ptype_lock);
 403         list_add_rcu(&pt->list, head);
 404         spin_unlock(&ptype_lock);
 405 }
 406 EXPORT_SYMBOL(dev_add_pack);
 407
 408 /**
 409  *      __dev_remove_pack        - remove packet handler
 410  *      @pt: packet type declaration
 411  *
 412  *      Remove a protocol handler that was previously added to the kernel
 413  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 414  *      from the kernel lists and can be freed or reused once this function
 415  *      returns.
 416  *
 417  *      The packet type might still be in use by receivers
 418  *      and must not be freed until after all the CPU's have gone
 419  *      through a quiescent state.
 420  */
 421 void __dev_remove_pack(struct packet_type *pt)
 422 {
 423         struct list_head *head = ptype_head(pt);
 424         struct packet_type *pt1;
 425
 426         spin_lock(&ptype_lock);
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         pr_warn("dev_remove_pack: %p not found\n", pt);
 436 out:
 437         spin_unlock(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461
 462 /**
 463  *      dev_add_offload - register offload handlers
 464  *      @po: protocol offload declaration
 465  *
 466  *      Add protocol offload handlers to the networking stack. The passed
 467  *      &proto_offload is linked into kernel lists and may not be freed until
 468  *      it has been removed from the kernel lists.
 469  *
 470  *      This call does not sleep therefore it can not
 471  *      guarantee all CPU's that are in middle of receiving packets
 472  *      will see the new offload handlers (until the next received packet).
 473  */
 474 void dev_add_offload(struct packet_offload *po)
 475 {
 476         struct packet_offload *elem;
 477
 478         spin_lock(&offload_lock);
 479         list_for_each_entry(elem, &offload_base, list) {
 480                 if (po->priority < elem->priority)
 481                         break;
 482         }
 483         list_add_rcu(&po->list, elem->list.prev);
 484         spin_unlock(&offload_lock);
 485 }
 486 EXPORT_SYMBOL(dev_add_offload);
 487
 488 /**
 489  *      __dev_remove_offload     - remove offload handler
 490  *      @po: packet offload declaration
 491  *
 492  *      Remove a protocol offload handler that was previously added to the
 493  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 494  *      is removed from the kernel lists and can be freed or reused once this
 495  *      function returns.
 496  *
 497  *      The packet type might still be in use by receivers
 498  *      and must not be freed until after all the CPU's have gone
 499  *      through a quiescent state.
 500  */
 501 static void __dev_remove_offload(struct packet_offload *po)
 502 {
 503         struct list_head *head = &offload_base;
 504         struct packet_offload *po1;
 505
 506         spin_lock(&offload_lock);
 507
 508         list_for_each_entry(po1, head, list) {
 509                 if (po == po1) {
 510                         list_del_rcu(&po->list);
 511                         goto out;
 512                 }
 513         }
 514
 515         pr_warn("dev_remove_offload: %p not found\n", po);
 516 out:
 517         spin_unlock(&offload_lock);
 518 }
 519
 520 /**
 521  *      dev_remove_offload       - remove packet offload handler
 522  *      @po: packet offload declaration
 523  *
 524  *      Remove a packet offload handler that was previously added to the kernel
 525  *      offload handlers by dev_add_offload(). The passed &offload_type is
 526  *      removed from the kernel lists and can be freed or reused once this
 527  *      function returns.
 528  *
 529  *      This call sleeps to guarantee that no CPU is looking at the packet
 530  *      type after return.
 531  */
 532 void dev_remove_offload(struct packet_offload *po)
 533 {
 534         __dev_remove_offload(po);
 535
 536         synchronize_net();
 537 }
 538 EXPORT_SYMBOL(dev_remove_offload);
 539
 540 /******************************************************************************
 541
 542                       Device Boot-time Settings Routines
 543
 544 *******************************************************************************/
 545
 546 /* Boot time configuration table */
 547 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 548
 549 /**
 550  *      netdev_boot_setup_add   - add new setup entry
 551  *      @name: name of the device
 552  *      @map: configured settings for the device
 553  *
 554  *      Adds new setup entry to the dev_boot_setup list.  The function
 555  *      returns 0 on error and 1 on success.  This is a generic routine to
 556  *      all netdevices.
 557  */
 558 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 559 {
 560         struct netdev_boot_setup *s;
 561         int i;
 562
 563         s = dev_boot_setup;
 564         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 565                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 566                         memset(s[i].name, 0, sizeof(s[i].name));
 567                         strlcpy(s[i].name, name, IFNAMSIZ);
 568                         memcpy(&s[i].map, map, sizeof(s[i].map));
 569                         break;
 570                 }
 571         }
 572
 573         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 574 }
 575
 576 /**
 577  *      netdev_boot_setup_check - check boot time settings
 578  *      @dev: the netdevice
 579  *
 580  *      Check boot time settings for the device.
 581  *      The found settings are set for the device to be used
 582  *      later in the device probing.
 583  *      Returns 0 if no settings found, 1 if they are.
 584  */
 585 int netdev_boot_setup_check(struct net_device *dev)
 586 {
 587         struct netdev_boot_setup *s = dev_boot_setup;
 588         int i;
 589
 590         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 591                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 592                     !strcmp(dev->name, s[i].name)) {
 593                         dev->irq        = s[i].map.irq;
 594                         dev->base_addr  = s[i].map.base_addr;
 595                         dev->mem_start  = s[i].map.mem_start;
 596                         dev->mem_end    = s[i].map.mem_end;
 597                         return 1;
 598                 }
 599         }
 600         return 0;
 601 }
 602 EXPORT_SYMBOL(netdev_boot_setup_check);
 603
 604
 605 /**
 606  *      netdev_boot_base        - get address from boot time settings
 607  *      @prefix: prefix for network device
 608  *      @unit: id for network device
 609  *
 610  *      Check boot time settings for the base address of device.
 611  *      The found settings are set for the device to be used
 612  *      later in the device probing.
 613  *      Returns 0 if no settings found.
 614  */
 615 unsigned long netdev_boot_base(const char *prefix, int unit)
 616 {
 617         const struct netdev_boot_setup *s = dev_boot_setup;
 618         char name[IFNAMSIZ];
 619         int i;
 620
 621         sprintf(name, "%s%d", prefix, unit);
 622
 623         /*
 624          * If device already registered then return base of 1
 625          * to indicate not to probe for this interface
 626          */
 627         if (__dev_get_by_name(&init_net, name))
 628                 return 1;
 629
 630         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 631                 if (!strcmp(name, s[i].name))
 632                         return s[i].map.base_addr;
 633         return 0;
 634 }
 635
 636 /*
 637  * Saves at boot time configured settings for any netdevice.
 638  */
 639 int __init netdev_boot_setup(char *str)
 640 {
 641         int ints[5];
 642         struct ifmap map;
 643
 644         str = get_options(str, ARRAY_SIZE(ints), ints);
 645         if (!str || !*str)
 646                 return 0;
 647
 648         /* Save settings */
 649         memset(&map, 0, sizeof(map));
 650         if (ints[0] > 0)
 651                 map.irq = ints[1];
 652         if (ints[0] > 1)
 653                 map.base_addr = ints[2];
 654         if (ints[0] > 2)
 655                 map.mem_start = ints[3];
 656         if (ints[0] > 3)
 657                 map.mem_end = ints[4];
 658
 659         /* Add new entry to the list */
 660         return netdev_boot_setup_add(str, &map);
 661 }
 662
 663 __setup("netdev=", netdev_boot_setup);
 664
 665 /*******************************************************************************
 666
 667                             Device Interface Subroutines
 668
 669 *******************************************************************************/
 670
 671 /**
 672  *      dev_get_iflink  - get 'iflink' value of a interface
 673  *      @dev: targeted interface
 674  *
 675  *      Indicates the ifindex the interface is linked to.
 676  *      Physical interfaces have the same 'ifindex' and 'iflink' values.
 677  */
 678
 679 int dev_get_iflink(const struct net_device *dev)
 680 {
 681         if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
 682                 return dev->netdev_ops->ndo_get_iflink(dev);
 683
 684         return dev->ifindex;
 685 }
 686 EXPORT_SYMBOL(dev_get_iflink);
 687
 688 /**
 689  *      dev_fill_metadata_dst - Retrieve tunnel egress information.
 690  *      @dev: targeted interface
 691  *      @skb: The packet.
 692  *
 693  *      For better visibility of tunnel traffic OVS needs to retrieve
 694  *      egress tunnel information for a packet. Following API allows
 695  *      user to get this info.
 696  */
 697 int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
 698 {
 699         struct ip_tunnel_info *info;
 700
 701         if (!dev->netdev_ops  || !dev->netdev_ops->ndo_fill_metadata_dst)
 702                 return -EINVAL;
 703
 704         info = skb_tunnel_info_unclone(skb);
 705         if (!info)
 706                 return -ENOMEM;
 707         if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
 708                 return -EINVAL;
 709
 710         return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
 711 }
 712 EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
 713
 714 /**
 715  *      __dev_get_by_name       - find a device by its name
 716  *      @net: the applicable net namespace
 717  *      @name: name to find
 718  *
 719  *      Find an interface by name. Must be called under RTNL semaphore
 720  *      or @dev_base_lock. If the name is found a pointer to the device
 721  *      is returned. If the name is not found then %NULL is returned. The
 722  *      reference counters are not incremented so the caller must be
 723  *      careful with locks.
 724  */
 725
 726 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 727 {
 728         struct net_device *dev;
 729         struct hlist_head *head = dev_name_hash(net, name);
 730
 731         hlist_for_each_entry(dev, head, name_hlist)
 732                 if (!strncmp(dev->name, name, IFNAMSIZ))
 733                         return dev;
 734
 735         return NULL;
 736 }
 737 EXPORT_SYMBOL(__dev_get_by_name);
 738
 739 /**
 740  *      dev_get_by_name_rcu     - find a device by its name
 741  *      @net: the applicable net namespace
 742  *      @name: name to find
 743  *
 744  *      Find an interface by name.
 745  *      If the name is found a pointer to the device is returned.
 746  *      If the name is not found then %NULL is returned.
 747  *      The reference counters are not incremented so the caller must be
 748  *      careful with locks. The caller must hold RCU lock.
 749  */
 750
 751 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 752 {
 753         struct net_device *dev;
 754         struct hlist_head *head = dev_name_hash(net, name);
 755
 756         hlist_for_each_entry_rcu(dev, head, name_hlist)
 757                 if (!strncmp(dev->name, name, IFNAMSIZ))
 758                         return dev;
 759
 760         return NULL;
 761 }
 762 EXPORT_SYMBOL(dev_get_by_name_rcu);
 763
 764 /**
 765  *      dev_get_by_name         - find a device by its name
 766  *      @net: the applicable net namespace
 767  *      @name: name to find
 768  *
 769  *      Find an interface by name. This can be called from any
 770  *      context and does its own locking. The returned handle has
 771  *      the usage count incremented and the caller must use dev_put() to
 772  *      release it when it is no longer needed. %NULL is returned if no
 773  *      matching device is found.
 774  */
 775
 776 struct net_device *dev_get_by_name(struct net *net, const char *name)
 777 {
 778         struct net_device *dev;
 779
 780         rcu_read_lock();
 781         dev = dev_get_by_name_rcu(net, name);
 782         if (dev)
 783                 dev_hold(dev);
 784         rcu_read_unlock();
 785         return dev;
 786 }
 787 EXPORT_SYMBOL(dev_get_by_name);
 788
 789 /**
 790  *      __dev_get_by_index - find a device by its ifindex
 791  *      @net: the applicable net namespace
 792  *      @ifindex: index of device
 793  *
 794  *      Search for an interface by index. Returns %NULL if the device
 795  *      is not found or a pointer to the device. The device has not
 796  *      had its reference counter increased so the caller must be careful
 797  *      about locking. The caller must hold either the RTNL semaphore
 798  *      or @dev_base_lock.
 799  */
 800
 801 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 802 {
 803         struct net_device *dev;
 804         struct hlist_head *head = dev_index_hash(net, ifindex);
 805
 806         hlist_for_each_entry(dev, head, index_hlist)
 807                 if (dev->ifindex == ifindex)
 808                         return dev;
 809
 810         return NULL;
 811 }
 812 EXPORT_SYMBOL(__dev_get_by_index);
 813
 814 /**
 815  *      dev_get_by_index_rcu - find a device by its ifindex
 816  *      @net: the applicable net namespace
 817  *      @ifindex: index of device
 818  *
 819  *      Search for an interface by index. Returns %NULL if the device
 820  *      is not found or a pointer to the device. The device has not
 821  *      had its reference counter increased so the caller must be careful
 822  *      about locking. The caller must hold RCU lock.
 823  */
 824
 825 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 826 {
 827         struct net_device *dev;
 828         struct hlist_head *head = dev_index_hash(net, ifindex);
 829
 830         hlist_for_each_entry_rcu(dev, head, index_hlist)
 831                 if (dev->ifindex == ifindex)
 832                         return dev;
 833
 834         return NULL;
 835 }
 836 EXPORT_SYMBOL(dev_get_by_index_rcu);
 837
 838
 839 /**
 840  *      dev_get_by_index - find a device by its ifindex
 841  *      @net: the applicable net namespace
 842  *      @ifindex: index of device
 843  *
 844  *      Search for an interface by index. Returns NULL if the device
 845  *      is not found or a pointer to the device. The device returned has
 846  *      had a reference added and the pointer is safe until the user calls
 847  *      dev_put to indicate they have finished with it.
 848  */
 849
 850 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 851 {
 852         struct net_device *dev;
 853
 854         rcu_read_lock();
 855         dev = dev_get_by_index_rcu(net, ifindex);
 856         if (dev)
 857                 dev_hold(dev);
 858         rcu_read_unlock();
 859         return dev;
 860 }
 861 EXPORT_SYMBOL(dev_get_by_index);
 862
 863 /**
 864  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 865  *      @net: network namespace
 866  *      @name: a pointer to the buffer where the name will be stored.
 867  *      @ifindex: the ifindex of the interface to get the name from.
 868  *
 869  *      The use of raw_seqcount_begin() and cond_resched() before
 870  *      retrying is required as we want to give the writers a chance
 871  *      to complete when CONFIG_PREEMPT is not set.
 872  */
 873 int netdev_get_name(struct net *net, char *name, int ifindex)
 874 {
 875         struct net_device *dev;
 876         unsigned int seq;
 877
 878 retry:
 879         seq = raw_seqcount_begin(&devnet_rename_seq);
 880         rcu_read_lock();
 881         dev = dev_get_by_index_rcu(net, ifindex);
 882         if (!dev) {
 883                 rcu_read_unlock();
 884                 return -ENODEV;
 885         }
 886
 887         strcpy(name, dev->name);
 888         rcu_read_unlock();
 889         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 890                 cond_resched();
 891                 goto retry;
 892         }
 893
 894         return 0;
 895 }
 896
 897 /**
 898  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 899  *      @net: the applicable net namespace
 900  *      @type: media type of device
 901  *      @ha: hardware address
 902  *
 903  *      Search for an interface by MAC address. Returns NULL if the device
 904  *      is not found or a pointer to the device.
 905  *      The caller must hold RCU or RTNL.
 906  *      The returned device has not had its ref count increased
 907  *      and the caller must therefore be careful about locking
 908  *
 909  */
 910
 911 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 912                                        const char *ha)
 913 {
 914         struct net_device *dev;
 915
 916         for_each_netdev_rcu(net, dev)
 917                 if (dev->type == type &&
 918                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 919                         return dev;
 920
 921         return NULL;
 922 }
 923 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 924
 925 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 926 {
 927         struct net_device *dev;
 928
 929         ASSERT_RTNL();
 930         for_each_netdev(net, dev)
 931                 if (dev->type == type)
 932                         return dev;
 933
 934         return NULL;
 935 }
 936 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 937
 938 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 939 {
 940         struct net_device *dev, *ret = NULL;
 941
 942         rcu_read_lock();
 943         for_each_netdev_rcu(net, dev)
 944                 if (dev->type == type) {
 945                         dev_hold(dev);
 946                         ret = dev;
 947                         break;
 948                 }
 949         rcu_read_unlock();
 950         return ret;
 951 }
 952 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 953
 954 /**
 955  *      __dev_get_by_flags - find any device with given flags
 956  *      @net: the applicable net namespace
 957  *      @if_flags: IFF_* values
 958  *      @mask: bitmask of bits in if_flags to check
 959  *
 960  *      Search for any interface with the given flags. Returns NULL if a device
 961  *      is not found or a pointer to the device. Must be called inside
 962  *      rtnl_lock(), and result refcount is unchanged.
 963  */
 964
 965 struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
 966                                       unsigned short mask)
 967 {
 968         struct net_device *dev, *ret;
 969
 970         ASSERT_RTNL();
 971
 972         ret = NULL;
 973         for_each_netdev(net, dev) {
 974                 if (((dev->flags ^ if_flags) & mask) == 0) {
 975                         ret = dev;
 976                         break;
 977                 }
 978         }
 979         return ret;
 980 }
 981 EXPORT_SYMBOL(__dev_get_by_flags);
 982
 983 /**
 984  *      dev_valid_name - check if name is okay for network device
 985  *      @name: name string
 986  *
 987  *      Network device names need to be valid file names to
 988  *      to allow sysfs to work.  We also disallow any kind of
 989  *      whitespace.
 990  */
 991 bool dev_valid_name(const char *name)
 992 {
 993         if (*name == '\0')
 994                 return false;
 995         if (strlen(name) >= IFNAMSIZ)
 996                 return false;
 997         if (!strcmp(name, ".") || !strcmp(name, ".."))
 998                 return false;
 999
1000         while (*name) {
1001                 if (*name == '/' || *name == ':' || isspace(*name))
1002                         return false;
1003                 name++;
1004         }
1005         return true;
1006 }
1007 EXPORT_SYMBOL(dev_valid_name);
1008
1009 /**
1010  *      __dev_alloc_name - allocate a name for a device
1011  *      @net: network namespace to allocate the device name in
1012  *      @name: name format string
1013  *      @buf:  scratch buffer and result name string
1014  *
1015  *      Passed a format string - eg "lt%d" it will try and find a suitable
1016  *      id. It scans list of devices to build up a free map, then chooses
1017  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1018  *      while allocating the name and adding the device in order to avoid
1019  *      duplicates.
1020  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1021  *      Returns the number of the unit assigned or a negative errno code.
1022  */
1023
1024 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1025 {
1026         int i = 0;
1027         const char *p;
1028         const int max_netdevices = 8*PAGE_SIZE;
1029         unsigned long *inuse;
1030         struct net_device *d;
1031
1032         p = strnchr(name, IFNAMSIZ-1, '%');
1033         if (p) {
1034                 /*
1035                  * Verify the string as this thing may have come from
1036                  * the user.  There must be either one "%d" and no other "%"
1037                  * characters.
1038                  */
1039                 if (p[1] != 'd' || strchr(p + 2, '%'))
1040                         return -EINVAL;
1041
1042                 /* Use one page as a bit array of possible slots */
1043                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1044                 if (!inuse)
1045                         return -ENOMEM;
1046
1047                 for_each_netdev(net, d) {
1048                         if (!sscanf(d->name, name, &i))
1049                                 continue;
1050                         if (i < 0 || i >= max_netdevices)
1051                                 continue;
1052
1053                         /*  avoid cases where sscanf is not exact inverse of printf */
1054                         snprintf(buf, IFNAMSIZ, name, i);
1055                         if (!strncmp(buf, d->name, IFNAMSIZ))
1056                                 set_bit(i, inuse);
1057                 }
1058
1059                 i = find_first_zero_bit(inuse, max_netdevices);
1060                 free_page((unsigned long) inuse);
1061         }
1062
1063         if (buf != name)
1064                 snprintf(buf, IFNAMSIZ, name, i);
1065         if (!__dev_get_by_name(net, buf))
1066                 return i;
1067
1068         /* It is possible to run out of possible slots
1069          * when the name is long and there isn't enough space left
1070          * for the digits, or if all bits are used.
1071          */
1072         return -ENFILE;
1073 }
1074
1075 /**
1076  *      dev_alloc_name - allocate a name for a device
1077  *      @dev: device
1078  *      @name: name format string
1079  *
1080  *      Passed a format string - eg "lt%d" it will try and find a suitable
1081  *      id. It scans list of devices to build up a free map, then chooses
1082  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1083  *      while allocating the name and adding the device in order to avoid
1084  *      duplicates.
1085  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1086  *      Returns the number of the unit assigned or a negative errno code.
1087  */
1088
1089 int dev_alloc_name(struct net_device *dev, const char *name)
1090 {
1091         char buf[IFNAMSIZ];
1092         struct net *net;
1093         int ret;
1094
1095         BUG_ON(!dev_net(dev));
1096         net = dev_net(dev);
1097         ret = __dev_alloc_name(net, name, buf);
1098         if (ret >= 0)
1099                 strlcpy(dev->name, buf, IFNAMSIZ);
1100         return ret;
1101 }
1102 EXPORT_SYMBOL(dev_alloc_name);
1103
1104 static int dev_alloc_name_ns(struct net *net,
1105                              struct net_device *dev,
1106                              const char *name)
1107 {
1108         char buf[IFNAMSIZ];
1109         int ret;
1110
1111         ret = __dev_alloc_name(net, name, buf);
1112         if (ret >= 0)
1113                 strlcpy(dev->name, buf, IFNAMSIZ);
1114         return ret;
1115 }
1116
1117 static int dev_get_valid_name(struct net *net,
1118                               struct net_device *dev,
1119                               const char *name)
1120 {
1121         BUG_ON(!net);
1122
1123         if (!dev_valid_name(name))
1124                 return -EINVAL;
1125
1126         if (strchr(name, '%'))
1127                 return dev_alloc_name_ns(net, dev, name);
1128         else if (__dev_get_by_name(net, name))
1129                 return -EEXIST;
1130         else if (dev->name != name)
1131                 strlcpy(dev->name, name, IFNAMSIZ);
1132
1133         return 0;
1134 }
1135
1136 /**
1137  *      dev_change_name - change name of a device
1138  *      @dev: device
1139  *      @newname: name (or format string) must be at least IFNAMSIZ
1140  *
1141  *      Change name of a device, can pass format strings "eth%d".
1142  *      for wildcarding.
1143  */
1144 int dev_change_name(struct net_device *dev, const char *newname)
1145 {
1146         unsigned char old_assign_type;
1147         char oldname[IFNAMSIZ];
1148         int err = 0;
1149         int ret;
1150         struct net *net;
1151
1152         ASSERT_RTNL();
1153         BUG_ON(!dev_net(dev));
1154
1155         net = dev_net(dev);
1156         if (dev->flags & IFF_UP)
1157                 return -EBUSY;
1158
1159         write_seqcount_begin(&devnet_rename_seq);
1160
1161         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1162                 write_seqcount_end(&devnet_rename_seq);
1163                 return 0;
1164         }
1165
1166         memcpy(oldname, dev->name, IFNAMSIZ);
1167
1168         err = dev_get_valid_name(net, dev, newname);
1169         if (err < 0) {
1170                 write_seqcount_end(&devnet_rename_seq);
1171                 return err;
1172         }
1173
1174         if (oldname[0] && !strchr(oldname, '%'))
1175                 netdev_info(dev, "renamed from %s\n", oldname);
1176
1177         old_assign_type = dev->name_assign_type;
1178         dev->name_assign_type = NET_NAME_RENAMED;
1179
1180 rollback:
1181         ret = device_rename(&dev->dev, dev->name);
1182         if (ret) {
1183                 memcpy(dev->name, oldname, IFNAMSIZ);
1184                 dev->name_assign_type = old_assign_type;
1185                 write_seqcount_end(&devnet_rename_seq);
1186                 return ret;
1187         }
1188
1189         write_seqcount_end(&devnet_rename_seq);
1190
1191         netdev_adjacent_rename_links(dev, oldname);
1192
1193         write_lock_bh(&dev_base_lock);
1194         hlist_del_rcu(&dev->name_hlist);
1195         write_unlock_bh(&dev_base_lock);
1196
1197         synchronize_rcu();
1198
1199         write_lock_bh(&dev_base_lock);
1200         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1201         write_unlock_bh(&dev_base_lock);
1202
1203         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1204         ret = notifier_to_errno(ret);
1205
1206         if (ret) {
1207                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1208                 if (err >= 0) {
1209                         err = ret;
1210                         write_seqcount_begin(&devnet_rename_seq);
1211                         memcpy(dev->name, oldname, IFNAMSIZ);
1212                         memcpy(oldname, newname, IFNAMSIZ);
1213                         dev->name_assign_type = old_assign_type;
1214                         old_assign_type = NET_NAME_RENAMED;
1215                         goto rollback;
1216                 } else {
1217                         pr_err("%s: name change rollback failed: %d\n",
1218                                dev->name, ret);
1219                 }
1220         }
1221
1222         return err;
1223 }
1224
1225 /**
1226  *      dev_set_alias - change ifalias of a device
1227  *      @dev: device
1228  *      @alias: name up to IFALIASZ
1229  *      @len: limit of bytes to copy from info
1230  *
1231  *      Set ifalias for a device,
1232  */
1233 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1234 {
1235         char *new_ifalias;
1236
1237         ASSERT_RTNL();
1238
1239         if (len >= IFALIASZ)
1240                 return -EINVAL;
1241
1242         if (!len) {
1243                 kfree(dev->ifalias);
1244                 dev->ifalias = NULL;
1245                 return 0;
1246         }
1247
1248         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1249         if (!new_ifalias)
1250                 return -ENOMEM;
1251         dev->ifalias = new_ifalias;
1252
1253         strlcpy(dev->ifalias, alias, len+1);
1254         return len;
1255 }
1256
1257
1258 /**
1259  *      netdev_features_change - device changes features
1260  *      @dev: device to cause notification
1261  *
1262  *      Called to indicate a device has changed features.
1263  */
1264 void netdev_features_change(struct net_device *dev)
1265 {
1266         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1267 }
1268 EXPORT_SYMBOL(netdev_features_change);
1269
1270 /**
1271  *      netdev_state_change - device changes state
1272  *      @dev: device to cause notification
1273  *
1274  *      Called to indicate a device has changed state. This function calls
1275  *      the notifier chains for netdev_chain and sends a NEWLINK message
1276  *      to the routing socket.
1277  */
1278 void netdev_state_change(struct net_device *dev)
1279 {
1280         if (dev->flags & IFF_UP) {
1281                 struct netdev_notifier_change_info change_info;
1282
1283                 change_info.flags_changed = 0;
1284                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
1285                                               &change_info.info);
1286                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1287         }
1288 }
1289 EXPORT_SYMBOL(netdev_state_change);
1290
1291 /**
1292  *      netdev_notify_peers - notify network peers about existence of @dev
1293  *      @dev: network device
1294  *
1295  * Generate traffic such that interested network peers are aware of
1296  * @dev, such as by generating a gratuitous ARP. This may be used when
1297  * a device wants to inform the rest of the network about some sort of
1298  * reconfiguration such as a failover event or virtual machine
1299  * migration.
1300  */
1301 void netdev_notify_peers(struct net_device *dev)
1302 {
1303         rtnl_lock();
1304         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1305         rtnl_unlock();
1306 }
1307 EXPORT_SYMBOL(netdev_notify_peers);
1308
1309 static int __dev_open(struct net_device *dev)
1310 {
1311         const struct net_device_ops *ops = dev->netdev_ops;
1312         int ret;
1313
1314         ASSERT_RTNL();
1315
1316         if (!netif_device_present(dev))
1317                 return -ENODEV;
1318
1319         /* Block netpoll from trying to do any rx path servicing.
1320          * If we don't do this there is a chance ndo_poll_controller
1321          * or ndo_poll may be running while we open the device
1322          */
1323         netpoll_poll_disable(dev);
1324
1325         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1326         ret = notifier_to_errno(ret);
1327         if (ret)
1328                 return ret;
1329
1330         set_bit(__LINK_STATE_START, &dev->state);
1331
1332         if (ops->ndo_validate_addr)
1333                 ret = ops->ndo_validate_addr(dev);
1334
1335         if (!ret && ops->ndo_open)
1336                 ret = ops->ndo_open(dev);
1337
1338         netpoll_poll_enable(dev);
1339
1340         if (ret)
1341                 clear_bit(__LINK_STATE_START, &dev->state);
1342         else {
1343                 dev->flags |= IFF_UP;
1344                 dev_set_rx_mode(dev);
1345                 dev_activate(dev);
1346                 add_device_randomness(dev->dev_addr, dev->addr_len);
1347         }
1348
1349         return ret;
1350 }
1351
1352 /**
1353  *      dev_open        - prepare an interface for use.
1354  *      @dev:   device to open
1355  *
1356  *      Takes a device from down to up state. The device's private open
1357  *      function is invoked and then the multicast lists are loaded. Finally
1358  *      the device is moved into the up state and a %NETDEV_UP message is
1359  *      sent to the netdev notifier chain.
1360  *
1361  *      Calling this function on an active interface is a nop. On a failure
1362  *      a negative errno code is returned.
1363  */
1364 int dev_open(struct net_device *dev)
1365 {
1366         int ret;
1367
1368         if (dev->flags & IFF_UP)
1369                 return 0;
1370
1371         ret = __dev_open(dev);
1372         if (ret < 0)
1373                 return ret;
1374
1375         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1376         call_netdevice_notifiers(NETDEV_UP, dev);
1377
1378         return ret;
1379 }
1380 EXPORT_SYMBOL(dev_open);
1381
1382 static int __dev_close_many(struct list_head *head)
1383 {
1384         struct net_device *dev;
1385
1386         ASSERT_RTNL();
1387         might_sleep();
1388
1389         list_for_each_entry(dev, head, close_list) {
1390                 /* Temporarily disable netpoll until the interface is down */
1391                 netpoll_poll_disable(dev);
1392
1393                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1394
1395                 clear_bit(__LINK_STATE_START, &dev->state);
1396
1397                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1398                  * can be even on different cpu. So just clear netif_running().
1399                  *
1400                  * dev->stop() will invoke napi_disable() on all of it's
1401                  * napi_struct instances on this device.
1402                  */
1403                 smp_mb__after_atomic(); /* Commit netif_running(). */
1404         }
1405
1406         dev_deactivate_many(head);
1407
1408         list_for_each_entry(dev, head, close_list) {
1409                 const struct net_device_ops *ops = dev->netdev_ops;
1410
1411                 /*
1412                  *      Call the device specific close. This cannot fail.
1413                  *      Only if device is UP
1414                  *
1415                  *      We allow it to be called even after a DETACH hot-plug
1416                  *      event.
1417                  */
1418                 if (ops->ndo_stop)
1419                         ops->ndo_stop(dev);
1420
1421                 dev->flags &= ~IFF_UP;
1422                 netpoll_poll_enable(dev);
1423         }
1424
1425         return 0;
1426 }
1427
1428 static int __dev_close(struct net_device *dev)
1429 {
1430         int retval;
1431         LIST_HEAD(single);
1432
1433         list_add(&dev->close_list, &single);
1434         retval = __dev_close_many(&single);
1435         list_del(&single);
1436
1437         return retval;
1438 }
1439
1440 int dev_close_many(struct list_head *head, bool unlink)
1441 {
1442         struct net_device *dev, *tmp;
1443
1444         /* Remove the devices that don't need to be closed */
1445         list_for_each_entry_safe(dev, tmp, head, close_list)
1446                 if (!(dev->flags & IFF_UP))
1447                         list_del_init(&dev->close_list);
1448
1449         __dev_close_many(head);
1450
1451         list_for_each_entry_safe(dev, tmp, head, close_list) {
1452                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1453                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1454                 if (unlink)
1455                         list_del_init(&dev->close_list);
1456         }
1457
1458         return 0;
1459 }
1460 EXPORT_SYMBOL(dev_close_many);
1461
1462 /**
1463  *      dev_close - shutdown an interface.
1464  *      @dev: device to shutdown
1465  *
1466  *      This function moves an active device into down state. A
1467  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1468  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1469  *      chain.
1470  */
1471 int dev_close(struct net_device *dev)
1472 {
1473         if (dev->flags & IFF_UP) {
1474                 LIST_HEAD(single);
1475
1476                 list_add(&dev->close_list, &single);
1477                 dev_close_many(&single, true);
1478                 list_del(&single);
1479         }
1480         return 0;
1481 }
1482 EXPORT_SYMBOL(dev_close);
1483
1484
1485 /**
1486  *      dev_disable_lro - disable Large Receive Offload on a device
1487  *      @dev: device
1488  *
1489  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1490  *      called under RTNL.  This is needed if received packets may be
1491  *      forwarded to another interface.
1492  */
1493 void dev_disable_lro(struct net_device *dev)
1494 {
1495         struct net_device *lower_dev;
1496         struct list_head *iter;
1497
1498         dev->wanted_features &= ~NETIF_F_LRO;
1499         netdev_update_features(dev);
1500
1501         if (unlikely(dev->features & NETIF_F_LRO))
1502                 netdev_WARN(dev, "failed to disable LRO!\n");
1503
1504         netdev_for_each_lower_dev(dev, lower_dev, iter)
1505                 dev_disable_lro(lower_dev);
1506 }
1507 EXPORT_SYMBOL(dev_disable_lro);
1508
1509 static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1510                                    struct net_device *dev)
1511 {
1512         struct netdev_notifier_info info;
1513
1514         netdev_notifier_info_init(&info, dev);
1515         return nb->notifier_call(nb, val, &info);
1516 }
1517
1518 static int dev_boot_phase = 1;
1519
1520 /**
1521  *      register_netdevice_notifier - register a network notifier block
1522  *      @nb: notifier
1523  *
1524  *      Register a notifier to be called when network device events occur.
1525  *      The notifier passed is linked into the kernel structures and must
1526  *      not be reused until it has been unregistered. A negative errno code
1527  *      is returned on a failure.
1528  *
1529  *      When registered all registration and up events are replayed
1530  *      to the new notifier to allow device to have a race free
1531  *      view of the network device list.
1532  */
1533
1534 int register_netdevice_notifier(struct notifier_block *nb)
1535 {
1536         struct net_device *dev;
1537         struct net_device *last;
1538         struct net *net;
1539         int err;
1540
1541         rtnl_lock();
1542         err = raw_notifier_chain_register(&netdev_chain, nb);
1543         if (err)
1544                 goto unlock;
1545         if (dev_boot_phase)
1546                 goto unlock;
1547         for_each_net(net) {
1548                 for_each_netdev(net, dev) {
1549                         err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1550                         err = notifier_to_errno(err);
1551                         if (err)
1552                                 goto rollback;
1553
1554                         if (!(dev->flags & IFF_UP))
1555                                 continue;
1556
1557                         call_netdevice_notifier(nb, NETDEV_UP, dev);
1558                 }
1559         }
1560
1561 unlock:
1562         rtnl_unlock();
1563         return err;
1564
1565 rollback:
1566         last = dev;
1567         for_each_net(net) {
1568                 for_each_netdev(net, dev) {
1569                         if (dev == last)
1570                                 goto outroll;
1571
1572                         if (dev->flags & IFF_UP) {
1573                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1574                                                         dev);
1575                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1576                         }
1577                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1578                 }
1579         }
1580
1581 outroll:
1582         raw_notifier_chain_unregister(&netdev_chain, nb);
1583         goto unlock;
1584 }
1585 EXPORT_SYMBOL(register_netdevice_notifier);
1586
1587 /**
1588  *      unregister_netdevice_notifier - unregister a network notifier block
1589  *      @nb: notifier
1590  *
1591  *      Unregister a notifier previously registered by
1592  *      register_netdevice_notifier(). The notifier is unlinked into the
1593  *      kernel structures and may then be reused. A negative errno code
1594  *      is returned on a failure.
1595  *
1596  *      After unregistering unregister and down device events are synthesized
1597  *      for all devices on the device list to the removed notifier to remove
1598  *      the need for special case cleanup code.
1599  */
1600
1601 int unregister_netdevice_notifier(struct notifier_block *nb)
1602 {
1603         struct net_device *dev;
1604         struct net *net;
1605         int err;
1606
1607         rtnl_lock();
1608         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1609         if (err)
1610                 goto unlock;
1611
1612         for_each_net(net) {
1613                 for_each_netdev(net, dev) {
1614                         if (dev->flags & IFF_UP) {
1615                                 call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1616                                                         dev);
1617                                 call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1618                         }
1619                         call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1620                 }
1621         }
1622 unlock:
1623         rtnl_unlock();
1624         return err;
1625 }
1626 EXPORT_SYMBOL(unregister_netdevice_notifier);
1627
1628 /**
1629  *      call_netdevice_notifiers_info - call all network notifier blocks
1630  *      @val: value passed unmodified to notifier function
1631  *      @dev: net_device pointer passed unmodified to notifier function
1632  *      @info: notifier information data
1633  *
1634  *      Call all network notifier blocks.  Parameters and return value
1635  *      are as for raw_notifier_call_chain().
1636  */
1637
1638 static int call_netdevice_notifiers_info(unsigned long val,
1639                                          struct net_device *dev,
1640                                          struct netdev_notifier_info *info)
1641 {
1642         ASSERT_RTNL();
1643         netdev_notifier_info_init(info, dev);
1644         return raw_notifier_call_chain(&netdev_chain, val, info);
1645 }
1646
1647 /**
1648  *      call_netdevice_notifiers - call all network notifier blocks
1649  *      @val: value passed unmodified to notifier function
1650  *      @dev: net_device pointer passed unmodified to notifier function
1651  *
1652  *      Call all network notifier blocks.  Parameters and return value
1653  *      are as for raw_notifier_call_chain().
1654  */
1655
1656 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1657 {
1658         struct netdev_notifier_info info;
1659
1660         return call_netdevice_notifiers_info(val, dev, &info);
1661 }
1662 EXPORT_SYMBOL(call_netdevice_notifiers);
1663
1664 #ifdef CONFIG_NET_INGRESS
1665 static struct static_key ingress_needed __read_mostly;
1666
1667 void net_inc_ingress_queue(void)
1668 {
1669         static_key_slow_inc(&ingress_needed);
1670 }
1671 EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
1672
1673 void net_dec_ingress_queue(void)
1674 {
1675         static_key_slow_dec(&ingress_needed);
1676 }
1677 EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
1678 #endif
1679
1680 #ifdef CONFIG_NET_EGRESS
1681 static struct static_key egress_needed __read_mostly;
1682
1683 void net_inc_egress_queue(void)
1684 {
1685         static_key_slow_inc(&egress_needed);
1686 }
1687 EXPORT_SYMBOL_GPL(net_inc_egress_queue);
1688
1689 void net_dec_egress_queue(void)
1690 {
1691         static_key_slow_dec(&egress_needed);
1692 }
1693 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
1694 #endif
1695
1696 static struct static_key netstamp_needed __read_mostly;
1697 #ifdef HAVE_JUMP_LABEL
1698 /* We are not allowed to call static_key_slow_dec() from irq context
1699  * If net_disable_timestamp() is called from irq context, defer the
1700  * static_key_slow_dec() calls.
1701  */
1702 static atomic_t netstamp_needed_deferred;
1703 #endif
1704
1705 void net_enable_timestamp(void)
1706 {
1707 #ifdef HAVE_JUMP_LABEL
1708         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1709
1710         if (deferred) {
1711                 while (--deferred)
1712                         static_key_slow_dec(&netstamp_needed);
1713                 return;
1714         }
1715 #endif
1716         static_key_slow_inc(&netstamp_needed);
1717 }
1718 EXPORT_SYMBOL(net_enable_timestamp);
1719
1720 void net_disable_timestamp(void)
1721 {
1722 #ifdef HAVE_JUMP_LABEL
1723         if (in_interrupt()) {
1724                 atomic_inc(&netstamp_needed_deferred);
1725                 return;
1726         }
1727 #endif
1728         static_key_slow_dec(&netstamp_needed);
1729 }
1730 EXPORT_SYMBOL(net_disable_timestamp);
1731
1732 static inline void net_timestamp_set(struct sk_buff *skb)
1733 {
1734         skb->tstamp.tv64 = 0;
1735         if (static_key_false(&netstamp_needed))
1736                 __net_timestamp(skb);
1737 }
1738
1739 #define net_timestamp_check(COND, SKB)                  \
1740         if (static_key_false(&netstamp_needed)) {               \
1741                 if ((COND) && !(SKB)->tstamp.tv64)      \
1742                         __net_timestamp(SKB);           \
1743         }                                               \
1744
1745 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
1746 {
1747         unsigned int len;
1748
1749         if (!(dev->flags & IFF_UP))
1750                 return false;
1751
1752         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1753         if (skb->len <= len)
1754                 return true;
1755
1756         /* if TSO is enabled, we don't care about the length as the packet
1757          * could be forwarded without being segmented before
1758          */
1759         if (skb_is_gso(skb))
1760                 return true;
1761
1762         return false;
1763 }
1764 EXPORT_SYMBOL_GPL(is_skb_forwardable);
1765
1766 int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1767 {
1768         if (skb_orphan_frags(skb, GFP_ATOMIC) ||
1769             unlikely(!is_skb_forwardable(dev, skb))) {
1770                 atomic_long_inc(&dev->rx_dropped);
1771                 kfree_skb(skb);
1772                 return NET_RX_DROP;
1773         }
1774
1775         skb_scrub_packet(skb, true);
1776         skb->priority = 0;
1777         skb->protocol = eth_type_trans(skb, dev);
1778         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
1779
1780         return 0;
1781 }
1782 EXPORT_SYMBOL_GPL(__dev_forward_skb);
1783
1784 /**
1785  * dev_forward_skb - loopback an skb to another netif
1786  *
1787  * @dev: destination network device
1788  * @skb: buffer to forward
1789  *
1790  * return values:
1791  *      NET_RX_SUCCESS  (no congestion)
1792  *      NET_RX_DROP     (packet was dropped, but freed)
1793  *
1794  * dev_forward_skb can be used for injecting an skb from the
1795  * start_xmit function of one device into the receive queue
1796  * of another device.
1797  *
1798  * The receiving device may be in another namespace, so
1799  * we have to clear all information in the skb that could
1800  * impact namespace isolation.
1801  */
1802 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1803 {
1804         return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
1805 }
1806 EXPORT_SYMBOL_GPL(dev_forward_skb);
1807
1808 static inline int deliver_skb(struct sk_buff *skb,
1809                               struct packet_type *pt_prev,
1810                               struct net_device *orig_dev)
1811 {
1812         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1813                 return -ENOMEM;
1814         atomic_inc(&skb->users);
1815         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1816 }
1817
1818 static inline void deliver_ptype_list_skb(struct sk_buff *skb,
1819                                           struct packet_type **pt,
1820                                           struct net_device *orig_dev,
1821                                           __be16 type,
1822                                           struct list_head *ptype_list)
1823 {
1824         struct packet_type *ptype, *pt_prev = *pt;
1825
1826         list_for_each_entry_rcu(ptype, ptype_list, list) {
1827                 if (ptype->type != type)
1828                         continue;
1829                 if (pt_prev)
1830                         deliver_skb(skb, pt_prev, orig_dev);
1831                 pt_prev = ptype;
1832         }
1833         *pt = pt_prev;
1834 }
1835
1836 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1837 {
1838         if (!ptype->af_packet_priv || !skb->sk)
1839                 return false;
1840
1841         if (ptype->id_match)
1842                 return ptype->id_match(ptype, skb->sk);
1843         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1844                 return true;
1845
1846         return false;
1847 }
1848
1849 /*
1850  *      Support routine. Sends outgoing frames to any network
1851  *      taps currently in use.
1852  */
1853
1854 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1855 {
1856         struct packet_type *ptype;
1857         struct sk_buff *skb2 = NULL;
1858         struct packet_type *pt_prev = NULL;
1859         struct list_head *ptype_list = &ptype_all;
1860
1861         rcu_read_lock();
1862 again:
1863         list_for_each_entry_rcu(ptype, ptype_list, list) {
1864                 /* Never send packets back to the socket
1865                  * they originated from - MvS (miquels@drinkel.ow.org)
1866                  */
1867                 if (skb_loop_sk(ptype, skb))
1868                         continue;
1869
1870                 if (pt_prev) {
1871                         deliver_skb(skb2, pt_prev, skb->dev);
1872                         pt_prev = ptype;
1873                         continue;
1874                 }
1875
1876                 /* need to clone skb, done only once */
1877                 skb2 = skb_clone(skb, GFP_ATOMIC);
1878                 if (!skb2)
1879                         goto out_unlock;
1880
1881                 net_timestamp_set(skb2);
1882
1883                 /* skb->nh should be correctly
1884                  * set by sender, so that the second statement is
1885                  * just protection against buggy protocols.
1886                  */
1887                 skb_reset_mac_header(skb2);
1888
1889                 if (skb_network_header(skb2) < skb2->data ||
1890                     skb_network_header(skb2) > skb_tail_pointer(skb2)) {
1891                         net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1892                                              ntohs(skb2->protocol),
1893                                              dev->name);
1894                         skb_reset_network_header(skb2);
1895                 }
1896
1897                 skb2->transport_header = skb2->network_header;
1898                 skb2->pkt_type = PACKET_OUTGOING;
1899                 pt_prev = ptype;
1900         }
1901
1902         if (ptype_list == &ptype_all) {
1903                 ptype_list = &dev->ptype_all;
1904                 goto again;
1905         }
1906 out_unlock:
1907         if (pt_prev)
1908                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1909         rcu_read_unlock();
1910 }
1911 EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
1912
1913 /**
1914  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1915  * @dev: Network device
1916  * @txq: number of queues available
1917  *
1918  * If real_num_tx_queues is changed the tc mappings may no longer be
1919  * valid. To resolve this verify the tc mapping remains valid and if
1920  * not NULL the mapping. With no priorities mapping to this
1921  * offset/count pair it will no longer be used. In the worst case TC0
1922  * is invalid nothing can be done so disable priority mappings. If is
1923  * expected that drivers will fix this mapping if they can before
1924  * calling netif_set_real_num_tx_queues.
1925  */
1926 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1927 {
1928         int i;
1929         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1930
1931         /* If TC0 is invalidated disable TC mapping */
1932         if (tc->offset + tc->count > txq) {
1933                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1934                 dev->num_tc = 0;
1935                 return;
1936         }
1937
1938         /* Invalidated prio to tc mappings set to TC0 */
1939         for (i = 1; i < TC_BITMASK + 1; i++) {
1940                 int q = netdev_get_prio_tc_map(dev, i);
1941
1942                 tc = &dev->tc_to_txq[q];
1943                 if (tc->offset + tc->count > txq) {
1944                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1945                                 i, q);
1946                         netdev_set_prio_tc_map(dev, i, 0);
1947                 }
1948         }
1949 }
1950
1951 #ifdef CONFIG_XPS
1952 static DEFINE_MUTEX(xps_map_mutex);
1953 #define xmap_dereference(P)             \
1954         rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1955
1956 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1957                                         int cpu, u16 index)
1958 {
1959         struct xps_map *map = NULL;
1960         int pos;
1961
1962         if (dev_maps)
1963                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1964
1965         for (pos = 0; map && pos < map->len; pos++) {
1966                 if (map->queues[pos] == index) {
1967                         if (map->len > 1) {
1968                                 map->queues[pos] = map->queues[--map->len];
1969                         } else {
1970                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1971                                 kfree_rcu(map, rcu);
1972                                 map = NULL;
1973                         }
1974                         break;
1975                 }
1976         }
1977
1978         return map;
1979 }
1980
1981 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1982 {
1983         struct xps_dev_maps *dev_maps;
1984         int cpu, i;
1985         bool active = false;
1986
1987         mutex_lock(&xps_map_mutex);
1988         dev_maps = xmap_dereference(dev->xps_maps);
1989
1990         if (!dev_maps)
1991                 goto out_no_maps;
1992
1993         for_each_possible_cpu(cpu) {
1994                 for (i = index; i < dev->num_tx_queues; i++) {
1995                         if (!remove_xps_queue(dev_maps, cpu, i))
1996                                 break;
1997                 }
1998                 if (i == dev->num_tx_queues)
1999                         active = true;
2000         }
2001
2002         if (!active) {
2003                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2004                 kfree_rcu(dev_maps, rcu);
2005         }
2006
2007         for (i = index; i < dev->num_tx_queues; i++)
2008                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
2009                                              NUMA_NO_NODE);
2010
2011 out_no_maps:
2012         mutex_unlock(&xps_map_mutex);
2013 }
2014
2015 static struct xps_map *expand_xps_map(struct xps_map *map,
2016                                       int cpu, u16 index)
2017 {
2018         struct xps_map *new_map;
2019         int alloc_len = XPS_MIN_MAP_ALLOC;
2020         int i, pos;
2021
2022         for (pos = 0; map && pos < map->len; pos++) {
2023                 if (map->queues[pos] != index)
2024                         continue;
2025                 return map;
2026         }
2027
2028         /* Need to add queue to this CPU's existing map */
2029         if (map) {
2030                 if (pos < map->alloc_len)
2031                         return map;
2032
2033                 alloc_len = map->alloc_len * 2;
2034         }
2035
2036         /* Need to allocate new map to store queue on this CPU's map */
2037         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2038                                cpu_to_node(cpu));
2039         if (!new_map)
2040                 return NULL;
2041
2042         for (i = 0; i < pos; i++)
2043                 new_map->queues[i] = map->queues[i];
2044         new_map->alloc_len = alloc_len;
2045         new_map->len = pos;
2046
2047         return new_map;
2048 }
2049
2050 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2051                         u16 index)
2052 {
2053         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2054         struct xps_map *map, *new_map;
2055         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
2056         int cpu, numa_node_id = -2;
2057         bool active = false;
2058
2059         mutex_lock(&xps_map_mutex);
2060
2061         dev_maps = xmap_dereference(dev->xps_maps);
2062
2063         /* allocate memory for queue storage */
2064         for_each_online_cpu(cpu) {
2065                 if (!cpumask_test_cpu(cpu, mask))
2066                         continue;
2067
2068                 if (!new_dev_maps)
2069                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2070                 if (!new_dev_maps) {
2071                         mutex_unlock(&xps_map_mutex);
2072                         return -ENOMEM;
2073                 }
2074
2075                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2076                                  NULL;
2077
2078                 map = expand_xps_map(map, cpu, index);
2079                 if (!map)
2080                         goto error;
2081
2082                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2083         }
2084
2085         if (!new_dev_maps)
2086                 goto out_no_new_maps;
2087
2088         for_each_possible_cpu(cpu) {
2089                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
2090                         /* add queue to CPU maps */
2091                         int pos = 0;
2092
2093                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2094                         while ((pos < map->len) && (map->queues[pos] != index))
2095                                 pos++;
2096
2097                         if (pos == map->len)
2098                                 map->queues[map->len++] = index;
2099 #ifdef CONFIG_NUMA
2100                         if (numa_node_id == -2)
2101                                 numa_node_id = cpu_to_node(cpu);
2102                         else if (numa_node_id != cpu_to_node(cpu))
2103                                 numa_node_id = -1;
2104 #endif
2105                 } else if (dev_maps) {
2106                         /* fill in the new device map from the old device map */
2107                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2108                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
2109                 }
2110
2111         }
2112
2113         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
2114
2115         /* Cleanup old maps */
2116         if (dev_maps) {
2117                 for_each_possible_cpu(cpu) {
2118                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2119                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
2120                         if (map && map != new_map)
2121                                 kfree_rcu(map, rcu);
2122                 }
2123
2124                 kfree_rcu(dev_maps, rcu);
2125         }
2126
2127         dev_maps = new_dev_maps;
2128         active = true;
2129
2130 out_no_new_maps:
2131         /* update Tx queue numa node */
2132         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2133                                      (numa_node_id >= 0) ? numa_node_id :
2134                                      NUMA_NO_NODE);
2135
2136         if (!dev_maps)
2137                 goto out_no_maps;
2138
2139         /* removes queue from unused CPUs */
2140         for_each_possible_cpu(cpu) {
2141                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2142                         continue;
2143
2144                 if (remove_xps_queue(dev_maps, cpu, index))
2145                         active = true;
2146         }
2147
2148         /* free map if not active */
2149         if (!active) {
2150                 RCU_INIT_POINTER(dev->xps_maps, NULL);
2151                 kfree_rcu(dev_maps, rcu);
2152         }
2153
2154 out_no_maps:
2155         mutex_unlock(&xps_map_mutex);
2156
2157         return 0;
2158 error:
2159         /* remove any maps that we added */
2160         for_each_possible_cpu(cpu) {
2161                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2162                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2163                                  NULL;
2164                 if (new_map && new_map != map)
2165                         kfree(new_map);
2166         }
2167
2168         mutex_unlock(&xps_map_mutex);
2169
2170         kfree(new_dev_maps);
2171         return -ENOMEM;
2172 }
2173 EXPORT_SYMBOL(netif_set_xps_queue);
2174
2175 #endif
2176 /*
2177  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2178  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2179  */
2180 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2181 {
2182         int rc;
2183
2184         if (txq < 1 || txq > dev->num_tx_queues)
2185                 return -EINVAL;
2186
2187         if (dev->reg_state == NETREG_REGISTERED ||
2188             dev->reg_state == NETREG_UNREGISTERING) {
2189                 ASSERT_RTNL();
2190
2191                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2192                                                   txq);
2193                 if (rc)
2194                         return rc;
2195
2196                 if (dev->num_tc)
2197                         netif_setup_tc(dev, txq);
2198
2199                 if (txq < dev->real_num_tx_queues) {
2200                         qdisc_reset_all_tx_gt(dev, txq);
2201 #ifdef CONFIG_XPS
2202                         netif_reset_xps_queues_gt(dev, txq);
2203 #endif
2204                 }
2205         }
2206
2207         dev->real_num_tx_queues = txq;
2208         return 0;
2209 }
2210 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2211
2212 #ifdef CONFIG_SYSFS
2213 /**
2214  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2215  *      @dev: Network device
2216  *      @rxq: Actual number of RX queues
2217  *
2218  *      This must be called either with the rtnl_lock held or before
2219  *      registration of the net device.  Returns 0 on success, or a
2220  *      negative error code.  If called before registration, it always
2221  *      succeeds.
2222  */
2223 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2224 {
2225         int rc;
2226
2227         if (rxq < 1 || rxq > dev->num_rx_queues)
2228                 return -EINVAL;
2229
2230         if (dev->reg_state == NETREG_REGISTERED) {
2231                 ASSERT_RTNL();
2232
2233                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2234                                                   rxq);
2235                 if (rc)
2236                         return rc;
2237         }
2238
2239         dev->real_num_rx_queues = rxq;
2240         return 0;
2241 }
2242 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2243 #endif
2244
2245 /**
2246  * netif_get_num_default_rss_queues - default number of RSS queues
2247  *
2248  * This routine should set an upper limit on the number of RSS queues
2249  * used by default by multiqueue devices.
2250  */
2251 int netif_get_num_default_rss_queues(void)
2252 {
2253         return is_kdump_kernel() ?
2254                 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2255 }
2256 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2257
2258 static void __netif_reschedule(struct Qdisc *q)
2259 {
2260         struct softnet_data *sd;
2261         unsigned long flags;
2262
2263         local_irq_save(flags);
2264         sd = this_cpu_ptr(&softnet_data);
2265         q->next_sched = NULL;
2266         *sd->output_queue_tailp = q;
2267         sd->output_queue_tailp = &q->next_sched;
2268         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2269         local_irq_restore(flags);
2270 }
2271
2272 void __netif_schedule(struct Qdisc *q)
2273 {
2274         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2275                 __netif_reschedule(q);
2276 }
2277 EXPORT_SYMBOL(__netif_schedule);
2278
2279 struct dev_kfree_skb_cb {
2280         enum skb_free_reason reason;
2281 };
2282
2283 static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
2284 {
2285         return (struct dev_kfree_skb_cb *)skb->cb;
2286 }
2287
2288 void netif_schedule_queue(struct netdev_queue *txq)
2289 {
2290         rcu_read_lock();
2291         if (!(txq->state & QUEUE_STATE_ANY_XOFF)) {
2292                 struct Qdisc *q = rcu_dereference(txq->qdisc);
2293
2294                 __netif_schedule(q);
2295         }
2296         rcu_read_unlock();
2297 }
2298 EXPORT_SYMBOL(netif_schedule_queue);
2299
2300 /**
2301  *      netif_wake_subqueue - allow sending packets on subqueue
2302  *      @dev: network device
2303  *      @queue_index: sub queue index
2304  *
2305  * Resume individual transmit queue of a device with multiple transmit queues.
2306  */
2307 void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
2308 {
2309         struct netdev_queue *txq = netdev_get_tx_queue(dev, queue_index);
2310
2311         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &txq->state)) {
2312                 struct Qdisc *q;
2313
2314                 rcu_read_lock();
2315                 q = rcu_dereference(txq->qdisc);
2316                 __netif_schedule(q);
2317                 rcu_read_unlock();
2318         }
2319 }
2320 EXPORT_SYMBOL(netif_wake_subqueue);
2321
2322 void netif_tx_wake_queue(struct netdev_queue *dev_queue)
2323 {
2324         if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
2325                 struct Qdisc *q;
2326
2327                 rcu_read_lock();
2328                 q = rcu_dereference(dev_queue->qdisc);
2329                 __netif_schedule(q);
2330                 rcu_read_unlock();
2331         }
2332 }
2333 EXPORT_SYMBOL(netif_tx_wake_queue);
2334
2335 void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
2336 {
2337         unsigned long flags;
2338
2339         if (likely(atomic_read(&skb->users) == 1)) {
2340                 smp_rmb();
2341                 atomic_set(&skb->users, 0);
2342         } else if (likely(!atomic_dec_and_test(&skb->users))) {
2343                 return;
2344         }
2345         get_kfree_skb_cb(skb)->reason = reason;
2346         local_irq_save(flags);
2347         skb->next = __this_cpu_read(softnet_data.completion_queue);
2348         __this_cpu_write(softnet_data.completion_queue, skb);
2349         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2350         local_irq_restore(flags);
2351 }
2352 EXPORT_SYMBOL(__dev_kfree_skb_irq);
2353
2354 void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
2355 {
2356         if (in_irq() || irqs_disabled())
2357                 __dev_kfree_skb_irq(skb, reason);
2358         else
2359                 dev_kfree_skb(skb);
2360 }
2361 EXPORT_SYMBOL(__dev_kfree_skb_any);
2362
2363
2364 /**
2365  * netif_device_detach - mark device as removed
2366  * @dev: network device
2367  *
2368  * Mark device as removed from system and therefore no longer available.
2369  */
2370 void netif_device_detach(struct net_device *dev)
2371 {
2372         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2373             netif_running(dev)) {
2374                 netif_tx_stop_all_queues(dev);
2375         }
2376 }
2377 EXPORT_SYMBOL(netif_device_detach);
2378
2379 /**
2380  * netif_device_attach - mark device as attached
2381  * @dev: network device
2382  *
2383  * Mark device as attached from system and restart if needed.
2384  */
2385 void netif_device_attach(struct net_device *dev)
2386 {
2387         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2388             netif_running(dev)) {
2389                 netif_tx_wake_all_queues(dev);
2390                 __netdev_watchdog_up(dev);
2391         }
2392 }
2393 EXPORT_SYMBOL(netif_device_attach);
2394
2395 /*
2396  * Returns a Tx hash based on the given packet descriptor a Tx queues' number
2397  * to be used as a distribution range.
2398  */
2399 u16 __skb_tx_hash(const struct net_device *dev, struct sk_buff *skb,
2400                   unsigned int num_tx_queues)
2401 {
2402         u32 hash;
2403         u16 qoffset = 0;
2404         u16 qcount = num_tx_queues;
2405
2406         if (skb_rx_queue_recorded(skb)) {
2407                 hash = skb_get_rx_queue(skb);
2408                 while (unlikely(hash >= num_tx_queues))
2409                         hash -= num_tx_queues;
2410                 return hash;
2411         }
2412
2413         if (dev->num_tc) {
2414                 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2415                 qoffset = dev->tc_to_txq[tc].offset;
2416                 qcount = dev->tc_to_txq[tc].count;
2417         }
2418
2419         return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
2420 }
2421 EXPORT_SYMBOL(__skb_tx_hash);
2422
2423 static void skb_warn_bad_offload(const struct sk_buff *skb)
2424 {
2425         static const netdev_features_t null_features;
2426         struct net_device *dev = skb->dev;
2427         const char *name = "";
2428
2429         if (!net_ratelimit())
2430                 return;
2431
2432         if (dev) {
2433                 if (dev->dev.parent)
2434                         name = dev_driver_string(dev->dev.parent);
2435                 else
2436                         name = netdev_name(dev);
2437         }
2438         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2439              "gso_type=%d ip_summed=%d\n",
2440              name, dev ? &dev->features : &null_features,
2441              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2442              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2443              skb_shinfo(skb)->gso_type, skb->ip_summed);
2444 }
2445
2446 /*
2447  * Invalidate hardware checksum when packet is to be mangled, and
2448  * complete checksum manually on outgoing path.
2449  */
2450 int skb_checksum_help(struct sk_buff *skb)
2451 {
2452         __wsum csum;
2453         int ret = 0, offset;
2454
2455         if (skb->ip_summed == CHECKSUM_COMPLETE)
2456                 goto out_set_summed;
2457
2458         if (unlikely(skb_shinfo(skb)->gso_size)) {
2459                 skb_warn_bad_offload(skb);
2460                 return -EINVAL;
2461         }
2462
2463         /* Before computing a checksum, we should make sure no frag could
2464          * be modified by an external entity : checksum could be wrong.
2465          */
2466         if (skb_has_shared_frag(skb)) {
2467                 ret = __skb_linearize(skb);
2468                 if (ret)
2469                         goto out;
2470         }
2471
2472         offset = skb_checksum_start_offset(skb);
2473         BUG_ON(offset >= skb_headlen(skb));
2474         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2475
2476         offset += skb->csum_offset;
2477         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2478
2479         if (skb_cloned(skb) &&
2480             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2481                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2482                 if (ret)
2483                         goto out;
2484         }
2485
2486         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2487 out_set_summed:
2488         skb->ip_summed = CHECKSUM_NONE;
2489 out:
2490         return ret;
2491 }
2492 EXPORT_SYMBOL(skb_checksum_help);
2493
2494 __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
2495 {
2496         __be16 type = skb->protocol;
2497
2498         /* Tunnel gso handlers can set protocol to ethernet. */
2499         if (type == htons(ETH_P_TEB)) {
2500                 struct ethhdr *eth;
2501
2502                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2503                         return 0;
2504
2505                 eth = (struct ethhdr *)skb_mac_header(skb);
2506                 type = eth->h_proto;
2507         }
2508
2509         return __vlan_get_protocol(skb, type, depth);
2510 }
2511
2512 /**
2513  *      skb_mac_gso_segment - mac layer segmentation handler.
2514  *      @skb: buffer to segment
2515  *      @features: features for the output path (see dev->features)
2516  */
2517 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2518                                     netdev_features_t features)
2519 {
2520         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2521         struct packet_offload *ptype;
2522         int vlan_depth = skb->mac_len;
2523         __be16 type = skb_network_protocol(skb, &vlan_depth);
2524
2525         if (unlikely(!type))
2526                 return ERR_PTR(-EINVAL);
2527
2528         __skb_pull(skb, vlan_depth);
2529
2530         rcu_read_lock();
2531         list_for_each_entry_rcu(ptype, &offload_base, list) {
2532                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2533                         segs = ptype->callbacks.gso_segment(skb, features);
2534                         break;
2535                 }
2536         }
2537         rcu_read_unlock();
2538
2539         __skb_push(skb, skb->data - skb_mac_header(skb));
2540
2541         return segs;
2542 }
2543 EXPORT_SYMBOL(skb_mac_gso_segment);
2544
2545
2546 /* openvswitch calls this on rx path, so we need a different check.
2547  */
2548 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2549 {
2550         if (tx_path)
2551                 return skb->ip_summed != CHECKSUM_PARTIAL;
2552         else
2553                 return skb->ip_summed == CHECKSUM_NONE;
2554 }
2555
2556 /**
2557  *      __skb_gso_segment - Perform segmentation on skb.
2558  *      @skb: buffer to segment
2559  *      @features: features for the output path (see dev->features)
2560  *      @tx_path: whether it is called in TX path
2561  *
2562  *      This function segments the given skb and returns a list of segments.
2563  *
2564  *      It may return NULL if the skb requires no segmentation.  This is
2565  *      only possible when GSO is used for verifying header integrity.
2566  *
2567  *      Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
2568  */
2569 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2570                                   netdev_features_t features, bool tx_path)
2571 {
2572         if (unlikely(skb_needs_check(skb, tx_path))) {
2573                 int err;
2574
2575                 skb_warn_bad_offload(skb);
2576
2577                 err = skb_cow_head(skb, 0);
2578                 if (err < 0)
2579                         return ERR_PTR(err);
2580         }
2581
2582         /* Only report GSO partial support if it will enable us to
2583          * support segmentation on this frame without needing additional
2584          * work.
2585          */
2586         if (features & NETIF_F_GSO_PARTIAL) {
2587                 netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
2588                 struct net_device *dev = skb->dev;
2589
2590                 partial_features |= dev->features & dev->gso_partial_features;
2591                 if (!skb_gso_ok(skb, features | partial_features))
2592                         features &= ~NETIF_F_GSO_PARTIAL;
2593         }
2594
2595         BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
2596                      sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
2597
2598         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2599         SKB_GSO_CB(skb)->encap_level = 0;
2600
2601         skb_reset_mac_header(skb);
2602         skb_reset_mac_len(skb);
2603
2604         return skb_mac_gso_segment(skb, features);
2605 }
2606 EXPORT_SYMBOL(__skb_gso_segment);
2607
2608 /* Take action when hardware reception checksum errors are detected. */
2609 #ifdef CONFIG_BUG
2610 void netdev_rx_csum_fault(struct net_device *dev)
2611 {
2612         if (net_ratelimit()) {
2613                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2614                 dump_stack();
2615         }
2616 }
2617 EXPORT_SYMBOL(netdev_rx_csum_fault);
2618 #endif
2619
2620 /* Actually, we should eliminate this check as soon as we know, that:
2621  * 1. IOMMU is present and allows to map all the memory.
2622  * 2. No high memory really exists on this machine.
2623  */
2624
2625 static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
2626 {
2627 #ifdef CONFIG_HIGHMEM
2628         int i;
2629         if (!(dev->features & NETIF_F_HIGHDMA)) {
2630                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2631                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2632                         if (PageHighMem(skb_frag_page(frag)))
2633                                 return 1;
2634                 }
2635         }
2636
2637         if (PCI_DMA_BUS_IS_PHYS) {
2638                 struct device *pdev = dev->dev.parent;
2639
2640                 if (!pdev)
2641                         return 0;
2642                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2643                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2644                         dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2645                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2646                                 return 1;
2647                 }
2648         }
2649 #endif
2650         return 0;
2651 }
2652
2653 /* If MPLS offload request, verify we are testing hardware MPLS features
2654  * instead of standard features for the netdev.
2655  */
2656 #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
2657 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2658                                            netdev_features_t features,
2659                                            __be16 type)
2660 {
2661         if (eth_p_mpls(type))
2662                 features &= skb->dev->mpls_features;
2663
2664         return features;
2665 }
2666 #else
2667 static netdev_features_t net_mpls_features(struct sk_buff *skb,
2668                                            netdev_features_t features,
2669                                            __be16 type)
2670 {
2671         return features;
2672 }
2673 #endif
2674
2675 static netdev_features_t harmonize_features(struct sk_buff *skb,
2676         netdev_features_t features)
2677 {
2678         int tmp;
2679         __be16 type;
2680
2681         type = skb_network_protocol(skb, &tmp);
2682         features = net_mpls_features(skb, features, type);
2683
2684         if (skb->ip_summed != CHECKSUM_NONE &&
2685             !can_checksum_protocol(features, type)) {
2686                 features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
2687         } else if (illegal_highdma(skb->dev, skb)) {
2688                 features &= ~NETIF_F_SG;
2689         }
2690
2691         return features;
2692 }
2693
2694 netdev_features_t passthru_features_check(struct sk_buff *skb,
2695                                           struct net_device *dev,
2696                                           netdev_features_t features)
2697 {
2698         return features;
2699 }
2700 EXPORT_SYMBOL(passthru_features_check);
2701
2702 static netdev_features_t dflt_features_check(const struct sk_buff *skb,
2703                                              struct net_device *dev,
2704                                              netdev_features_t features)
2705 {
2706         return vlan_features_check(skb, features);
2707 }
2708
2709 static netdev_features_t gso_features_check(const struct sk_buff *skb,
2710                                             struct net_device *dev,
2711                                             netdev_features_t features)
2712 {
2713         u16 gso_segs = skb_shinfo(skb)->gso_segs;
2714
2715         if (gso_segs > dev->gso_max_segs)
2716                 return features & ~NETIF_F_GSO_MASK;
2717
2718         /* Support for GSO partial features requires software
2719          * intervention before we can actually process the packets
2720          * so we need to strip support for any partial features now
2721          * and we can pull them back in after we have partially
2722          * segmented the frame.
2723          */
2724         if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
2725                 features &= ~dev->gso_partial_features;
2726
2727         /* Make sure to clear the IPv4 ID mangling feature if the
2728          * IPv4 header has the potential to be fragmented.
2729          */
2730         if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
2731                 struct iphdr *iph = skb->encapsulation ?
2732                                     inner_ip_hdr(skb) : ip_hdr(skb);
2733
2734                 if (!(iph->frag_off & htons(IP_DF)))
2735                         features &= ~NETIF_F_TSO_MANGLEID;
2736         }
2737
2738         return features;
2739 }
2740
2741 netdev_features_t netif_skb_features(struct sk_buff *skb)
2742 {
2743         struct net_device *dev = skb->dev;
2744         netdev_features_t features = dev->features;
2745
2746         if (skb_is_gso(skb))
2747                 features = gso_features_check(skb, dev, features);
2748
2749         /* If encapsulation offload request, verify we are testing
2750          * hardware encapsulation features instead of standard
2751          * features for the netdev
2752          */
2753         if (skb->encapsulation)
2754                 features &= dev->hw_enc_features;
2755
2756         if (skb_vlan_tagged(skb))
2757                 features = netdev_intersect_features(features,
2758                                                      dev->vlan_features |
2759                                                      NETIF_F_HW_VLAN_CTAG_TX |
2760                                                      NETIF_F_HW_VLAN_STAG_TX);
2761
2762         if (dev->netdev_ops->ndo_features_check)
2763                 features &= dev->netdev_ops->ndo_features_check(skb, dev,
2764                                                                 features);
2765         else
2766                 features &= dflt_features_check(skb, dev, features);
2767
2768         return harmonize_features(skb, features);
2769 }
2770 EXPORT_SYMBOL(netif_skb_features);
2771
2772 static int xmit_one(struct sk_buff *skb, struct net_device *dev,
2773                     struct netdev_queue *txq, bool more)
2774 {
2775         unsigned int len;
2776         int rc;
2777
2778         if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all))
2779                 dev_queue_xmit_nit(skb, dev);
2780
2781         len = skb->len;
2782         trace_net_dev_start_xmit(skb, dev);
2783         rc = netdev_start_xmit(skb, dev, txq, more);
2784         trace_net_dev_xmit(skb, rc, dev, len);
2785
2786         return rc;
2787 }
2788
2789 struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
2790                                     struct netdev_queue *txq, int *ret)
2791 {
2792         struct sk_buff *skb = first;
2793         int rc = NETDEV_TX_OK;
2794
2795         while (skb) {
2796                 struct sk_buff *next = skb->next;
2797
2798                 skb->next = NULL;
2799                 rc = xmit_one(skb, dev, txq, next != NULL);
2800                 if (unlikely(!dev_xmit_complete(rc))) {
2801                         skb->next = next;
2802                         goto out;
2803                 }
2804
2805                 skb = next;
2806                 if (netif_xmit_stopped(txq) && skb) {
2807                         rc = NETDEV_TX_BUSY;
2808                         break;
2809                 }
2810         }
2811
2812 out:
2813         *ret = rc;
2814         return skb;
2815 }
2816
2817 static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
2818                                           netdev_features_t features)
2819 {
2820         if (skb_vlan_tag_present(skb) &&
2821             !vlan_hw_offload_capable(features, skb->vlan_proto))
2822                 skb = __vlan_hwaccel_push_inside(skb);
2823         return skb;
2824 }
2825
2826 static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev)
2827 {
2828         netdev_features_t features;
2829
2830         features = netif_skb_features(skb);
2831         skb = validate_xmit_vlan(skb, features);
2832         if (unlikely(!skb))
2833                 goto out_null;
2834
2835         if (netif_needs_gso(skb, features)) {
2836                 struct sk_buff *segs;
2837
2838                 segs = skb_gso_segment(skb, features);
2839                 if (IS_ERR(segs)) {
2840                         goto out_kfree_skb;
2841                 } else if (segs) {
2842                         consume_skb(skb);
2843                         skb = segs;
2844                 }
2845         } else {
2846                 if (skb_needs_linearize(skb, features) &&
2847                     __skb_linearize(skb))
2848                         goto out_kfree_skb;
2849
2850                 /* If packet is not checksummed and device does not
2851                  * support checksumming for this protocol, complete
2852                  * checksumming here.
2853                  */
2854                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2855                         if (skb->encapsulation)
2856                                 skb_set_inner_transport_header(skb,
2857                                                                skb_checksum_start_offset(skb));
2858                         else
2859                                 skb_set_transport_header(skb,
2860                                                          skb_checksum_start_offset(skb));
2861                         if (!(features & NETIF_F_CSUM_MASK) &&
2862                             skb_checksum_help(skb))
2863                                 goto out_kfree_skb;
2864                 }
2865         }
2866
2867         return skb;
2868
2869 out_kfree_skb:
2870         kfree_skb(skb);
2871 out_null:
2872         atomic_long_inc(&dev->tx_dropped);
2873         return NULL;
2874 }
2875
2876 struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev)
2877 {
2878         struct sk_buff *next, *head = NULL, *tail;
2879
2880         for (; skb != NULL; skb = next) {
2881                 next = skb->next;
2882                 skb->next = NULL;
2883
2884                 /* in case skb wont be segmented, point to itself */
2885                 skb->prev = skb;
2886
2887                 skb = validate_xmit_skb(skb, dev);
2888                 if (!skb)
2889                         continue;
2890
2891                 if (!head)
2892                         head = skb;
2893                 else
2894                         tail->next = skb;
2895                 /* If skb was segmented, skb->prev points to
2896                  * the last segment. If not, it still contains skb.
2897                  */
2898                 tail = skb->prev;
2899         }
2900         return head;
2901 }
2902
2903 static void qdisc_pkt_len_init(struct sk_buff *skb)
2904 {
2905         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2906
2907         qdisc_skb_cb(skb)->pkt_len = skb->len;
2908
2909         /* To get more precise estimation of bytes sent on wire,
2910          * we add to pkt_len the headers size of all segments
2911          */
2912         if (shinfo->gso_size)  {
2913                 unsigned int hdr_len;
2914                 u16 gso_segs = shinfo->gso_segs;
2915
2916                 /* mac layer + network layer */
2917                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2918
2919                 /* + transport layer */
2920                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2921                         hdr_len += tcp_hdrlen(skb);
2922                 else
2923                         hdr_len += sizeof(struct udphdr);
2924
2925                 if (shinfo->gso_type & SKB_GSO_DODGY)
2926                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2927                                                 shinfo->gso_size);
2928
2929                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2930         }
2931 }
2932
2933 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2934                                  struct net_device *dev,
2935                                  struct netdev_queue *txq)
2936 {
2937         spinlock_t *root_lock = qdisc_lock(q);
2938         struct sk_buff *to_free = NULL;
2939         bool contended;
2940         int rc;
2941
2942         qdisc_calculate_pkt_len(skb, q);
2943         /*
2944          * Heuristic to force contended enqueues to serialize on a
2945          * separate lock before trying to get qdisc main lock.
2946          * This permits qdisc->running owner to get the lock more
2947          * often and dequeue packets faster.
2948          */
2949         contended = qdisc_is_running(q);
2950         if (unlikely(contended))
2951                 spin_lock(&q->busylock);
2952
2953         spin_lock(root_lock);
2954         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2955                 __qdisc_drop(skb, &to_free);
2956                 rc = NET_XMIT_DROP;
2957         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2958                    qdisc_run_begin(q)) {
2959                 /*
2960                  * This is a work-conserving queue; there are no old skbs
2961                  * waiting to be sent out; and the qdisc is not running -
2962                  * xmit the skb directly.
2963                  */
2964
2965                 qdisc_bstats_update(q, skb);
2966
2967                 if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
2968                         if (unlikely(contended)) {
2969                                 spin_unlock(&q->busylock);
2970                                 contended = false;
2971                         }
2972                         __qdisc_run(q);
2973                 } else
2974                         qdisc_run_end(q);
2975
2976                 rc = NET_XMIT_SUCCESS;
2977         } else {
2978                 rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
2979                 if (qdisc_run_begin(q)) {
2980                         if (unlikely(contended)) {
2981                                 spin_unlock(&q->busylock);
2982                                 contended = false;
2983                         }
2984                         __qdisc_run(q);
2985                 }
2986         }
2987         spin_unlock(root_lock);
2988         if (unlikely(to_free))
2989                 kfree_skb_list(to_free);
2990         if (unlikely(contended))
2991                 spin_unlock(&q->busylock);
2992         return rc;
2993 }
2994
2995 #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
2996 static void skb_update_prio(struct sk_buff *skb)
2997 {
2998         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2999
3000         if (!skb->priority && skb->sk && map) {
3001                 unsigned int prioidx =
3002                         sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
3003
3004                 if (prioidx < map->priomap_len)
3005                         skb->priority = map->priomap[prioidx];
3006         }
3007 }
3008 #else
3009 #define skb_update_prio(skb)
3010 #endif
3011
3012 DEFINE_PER_CPU(int, xmit_recursion);
3013 EXPORT_SYMBOL(xmit_recursion);
3014
3015 /**
3016  *      dev_loopback_xmit - loop back @skb
3017  *      @net: network namespace this loopback is happening in
3018  *      @sk:  sk needed to be a netfilter okfn
3019  *      @skb: buffer to transmit
3020  */
3021 int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3022 {
3023         skb_reset_mac_header(skb);
3024         __skb_pull(skb, skb_network_offset(skb));
3025         skb->pkt_type = PACKET_LOOPBACK;
3026         skb->ip_summed = CHECKSUM_UNNECESSARY;
3027         WARN_ON(!skb_dst(skb));
3028         skb_dst_force(skb);
3029         netif_rx_ni(skb);
3030         return 0;
3031 }
3032 EXPORT_SYMBOL(dev_loopback_xmit);
3033
3034 #ifdef CONFIG_NET_EGRESS
3035 static struct sk_buff *
3036 sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3037 {
3038         struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
3039         struct tcf_result cl_res;
3040
3041         if (!cl)
3042                 return skb;
3043
3044         /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
3045          * earlier by the caller.
3046          */
3047         qdisc_bstats_cpu_update(cl->q, skb);
3048
3049         switch (tc_classify(skb, cl, &cl_res, false)) {
3050         case TC_ACT_OK:
3051         case TC_ACT_RECLASSIFY:
3052                 skb->tc_index = TC_H_MIN(cl_res.classid);
3053                 break;
3054         case TC_ACT_SHOT:
3055                 qdisc_qstats_cpu_drop(cl->q);
3056                 *ret = NET_XMIT_DROP;
3057                 kfree_skb(skb);
3058                 return NULL;
3059         case TC_ACT_STOLEN:
3060         case TC_ACT_QUEUED:
3061                 *ret = NET_XMIT_SUCCESS;
3062                 consume_skb(skb);
3063                 return NULL;
3064         case TC_ACT_REDIRECT:
3065                 /* No need to push/pop skb's mac_header here on egress! */
3066                 skb_do_redirect(skb);
3067                 *ret = NET_XMIT_SUCCESS;
3068                 return NULL;
3069         default:
3070                 break;
3071         }
3072
3073         return skb;
3074 }
3075 #endif /* CONFIG_NET_EGRESS */
3076
3077 static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
3078 {
3079 #ifdef CONFIG_XPS
3080         struct xps_dev_maps *dev_maps;
3081         struct xps_map *map;
3082         int queue_index = -1;
3083
3084         rcu_read_lock();
3085         dev_maps = rcu_dereference(dev->xps_maps);
3086         if (dev_maps) {
3087                 map = rcu_dereference(
3088                     dev_maps->cpu_map[skb->sender_cpu - 1]);
3089                 if (map) {
3090                         if (map->len == 1)
3091                                 queue_index = map->queues[0];
3092                         else
3093                                 queue_index = map->queues[reciprocal_scale(skb_get_hash(skb),
3094                                                                            map->len)];
3095                         if (unlikely(queue_index >= dev->real_num_tx_queues))
3096                                 queue_index = -1;
3097                 }
3098         }
3099         rcu_read_unlock();
3100
3101         return queue_index;
3102 #else
3103         return -1;
3104 #endif
3105 }
3106
3107 static u16 __netdev_pick_tx(struct net_device *dev, struct sk_buff *skb)
3108 {
3109         struct sock *sk = skb->sk;
3110         int queue_index = sk_tx_queue_get(sk);
3111
3112         if (queue_index < 0 || skb->ooo_okay ||
3113             queue_index >= dev->real_num_tx_queues) {
3114                 int new_index = get_xps_queue(dev, skb);
3115                 if (new_index < 0)
3116                         new_index = skb_tx_hash(dev, skb);
3117
3118                 if (queue_index != new_index && sk &&
3119                     sk_fullsock(sk) &&
3120                     rcu_access_pointer(sk->sk_dst_cache))
3121                         sk_tx_queue_set(sk, new_index);
3122
3123                 queue_index = new_index;
3124         }
3125
3126         return queue_index;
3127 }
3128
3129 struct netdev_queue *netdev_pick_tx(struct net_device *dev,
3130                                     struct sk_buff *skb,
3131                                     void *accel_priv)
3132 {
3133         int queue_index = 0;
3134
3135 #ifdef CONFIG_XPS
3136         u32 sender_cpu = skb->sender_cpu - 1;
3137
3138         if (sender_cpu >= (u32)NR_CPUS)
3139                 skb->sender_cpu = raw_smp_processor_id() + 1;
3140 #endif
3141
3142         if (dev->real_num_tx_queues != 1) {
3143                 const struct net_device_ops *ops = dev->netdev_ops;
3144                 if (ops->ndo_select_queue)
3145                         queue_index = ops->ndo_select_queue(dev, skb, accel_priv,
3146                                                             __netdev_pick_tx);
3147                 else
3148                         queue_index = __netdev_pick_tx(dev, skb);
3149
3150                 if (!accel_priv)
3151                         queue_index = netdev_cap_txqueue(dev, queue_index);
3152         }
3153
3154         skb_set_queue_mapping(skb, queue_index);
3155         return netdev_get_tx_queue(dev, queue_index);
3156 }
3157
3158 /**
3159  *      __dev_queue_xmit - transmit a buffer
3160  *      @skb: buffer to transmit
3161  *      @accel_priv: private data used for L2 forwarding offload
3162  *
3163  *      Queue a buffer for transmission to a network device. The caller must
3164  *      have set the device and priority and built the buffer before calling
3165  *      this function. The function can be called from an interrupt.
3166  *
3167  *      A negative errno code is returned on a failure. A success does not
3168  *      guarantee the frame will be transmitted as it may be dropped due
3169  *      to congestion or traffic shaping.
3170  *
3171  * -----------------------------------------------------------------------------------
3172  *      I notice this method can also return errors from the queue disciplines,
3173  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
3174  *      be positive.
3175  *
3176  *      Regardless of the return value, the skb is consumed, so it is currently
3177  *      difficult to retry a send to this method.  (You can bump the ref count
3178  *      before sending to hold a reference for retry if you are careful.)
3179  *
3180  *      When calling this method, interrupts MUST be enabled.  This is because
3181  *      the BH enable code must have IRQs enabled so that it will not deadlock.
3182  *          --BLG
3183  */
3184 static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
3185 {
3186         struct net_device *dev = skb->dev;
3187         struct netdev_queue *txq;
3188         struct Qdisc *q;
3189         int rc = -ENOMEM;
3190
3191         skb_reset_mac_header(skb);
3192
3193         if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
3194                 __skb_tstamp_tx(skb, NULL, skb->sk, SCM_TSTAMP_SCHED);
3195
3196         /* Disable soft irqs for various locks below. Also
3197          * stops preemption for RCU.
3198          */
3199         rcu_read_lock_bh();
3200
3201         skb_update_prio(skb);
3202
3203         qdisc_pkt_len_init(skb);
3204 #ifdef CONFIG_NET_CLS_ACT
3205         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
3206 # ifdef CONFIG_NET_EGRESS
3207         if (static_key_false(&egress_needed)) {
3208                 skb = sch_handle_egress(skb, &rc, dev);
3209                 if (!skb)
3210                         goto out;
3211         }
3212 # endif
3213 #endif
3214         /* If device/qdisc don't need skb->dst, release it right now while
3215          * its hot in this cpu cache.
3216          */
3217         if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
3218                 skb_dst_drop(skb);
3219         else
3220                 skb_dst_force(skb);
3221
3222         txq = netdev_pick_tx(dev, skb, accel_priv);
3223         q = rcu_dereference_bh(txq->qdisc);
3224
3225         trace_net_dev_queue(skb);
3226         if (q->enqueue) {
3227                 rc = __dev_xmit_skb(skb, q, dev, txq);
3228                 goto out;
3229         }
3230
3231         /* The device has no queue. Common case for software devices:
3232            loopback, all the sorts of tunnels...
3233
3234            Really, it is unlikely that netif_tx_lock protection is necessary
3235            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
3236            counters.)
3237            However, it is possible, that they rely on protection
3238            made by us here.
3239
3240            Check this and shot the lock. It is not prone from deadlocks.
3241            Either shot noqueue qdisc, it is even simpler 8)
3242          */
3243         if (dev->flags & IFF_UP) {
3244                 int cpu = smp_processor_id(); /* ok because BHs are off */
3245
3246                 if (txq->xmit_lock_owner != cpu) {
3247                         if (unlikely(__this_cpu_read(xmit_recursion) >
3248                                      XMIT_RECURSION_LIMIT))
3249                                 goto recursion_alert;
3250
3251                         skb = validate_xmit_skb(skb, dev);
3252                         if (!skb)
3253                                 goto out;
3254
3255                         HARD_TX_LOCK(dev, txq, cpu);
3256
3257                         if (!netif_xmit_stopped(txq)) {
3258                                 __this_cpu_inc(xmit_recursion);
3259                                 skb = dev_hard_start_xmit(skb, dev, txq, &rc);
3260                                 __this_cpu_dec(xmit_recursion);
3261                                 if (dev_xmit_complete(rc)) {
3262                                         HARD_TX_UNLOCK(dev, txq);
3263                                         goto out;
3264                                 }
3265                         }
3266                         HARD_TX_UNLOCK(dev, txq);
3267                         net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
3268                                              dev->name);
3269                 } else {
3270                         /* Recursion is detected! It is possible,
3271                          * unfortunately
3272                          */
3273 recursion_alert:
3274                         net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
3275                                              dev->name);
3276                 }
3277         }
3278
3279         rc = -ENETDOWN;
3280         rcu_read_unlock_bh();
3281
3282         atomic_long_inc(&dev->tx_dropped);
3283         kfree_skb_list(skb);
3284         return rc;
3285 out:
3286         rcu_read_unlock_bh();
3287         return rc;
3288 }
3289
3290 int dev_queue_xmit(struct sk_buff *skb)
3291 {
3292         return __dev_queue_xmit(skb, NULL);
3293 }
3294 EXPORT_SYMBOL(dev_queue_xmit);
3295
3296 int dev_queue_xmit_accel(struct sk_buff *skb, void *accel_priv)
3297 {
3298         return __dev_queue_xmit(skb, accel_priv);
3299 }
3300 EXPORT_SYMBOL(dev_queue_xmit_accel);
3301
3302
3303 /*=======================================================================
3304                         Receiver routines
3305   =======================================================================*/
3306
3307 int netdev_max_backlog __read_mostly = 1000;
3308 EXPORT_SYMBOL(netdev_max_backlog);
3309
3310 int netdev_tstamp_prequeue __read_mostly = 1;
3311 int netdev_budget __read_mostly = 300;
3312 int weight_p __read_mostly = 64;            /* old backlog weight */
3313
3314 /* Called with irq disabled */
3315 static inline void ____napi_schedule(struct softnet_data *sd,
3316                                      struct napi_struct *napi)
3317 {
3318         list_add_tail(&napi->poll_list, &sd->poll_list);
3319         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3320 }
3321
3322 #ifdef CONFIG_RPS
3323
3324 /* One global table that all flow-based protocols share. */
3325 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
3326 EXPORT_SYMBOL(rps_sock_flow_table);
3327 u32 rps_cpu_mask __read_mostly;
3328 EXPORT_SYMBOL(rps_cpu_mask);
3329
3330 struct static_key rps_needed __read_mostly;
3331 EXPORT_SYMBOL(rps_needed);
3332
3333 static struct rps_dev_flow *
3334 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3335             struct rps_dev_flow *rflow, u16 next_cpu)
3336 {
3337         if (next_cpu < nr_cpu_ids) {
3338 #ifdef CONFIG_RFS_ACCEL
3339                 struct netdev_rx_queue *rxqueue;
3340                 struct rps_dev_flow_table *flow_table;
3341                 struct rps_dev_flow *old_rflow;
3342                 u32 flow_id;
3343                 u16 rxq_index;
3344                 int rc;
3345
3346                 /* Should we steer this flow to a different hardware queue? */
3347                 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
3348                     !(dev->features & NETIF_F_NTUPLE))
3349                         goto out;
3350                 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
3351                 if (rxq_index == skb_get_rx_queue(skb))
3352                         goto out;
3353
3354                 rxqueue = dev->_rx + rxq_index;
3355                 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3356                 if (!flow_table)
3357                         goto out;
3358                 flow_id = skb_get_hash(skb) & flow_table->mask;
3359                 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
3360                                                         rxq_index, flow_id);
3361                 if (rc < 0)
3362                         goto out;
3363                 old_rflow = rflow;
3364                 rflow = &flow_table->flows[flow_id];
3365                 rflow->filter = rc;
3366                 if (old_rflow->filter == rflow->filter)
3367                         old_rflow->filter = RPS_NO_FILTER;
3368         out:
3369 #endif
3370                 rflow->last_qtail =
3371                         per_cpu(softnet_data, next_cpu).input_queue_head;
3372         }
3373
3374         rflow->cpu = next_cpu;
3375         return rflow;
3376 }
3377
3378 /*
3379  * get_rps_cpu is called from netif_receive_skb and returns the target
3380  * CPU from the RPS map of the receiving queue for a given skb.
3381  * rcu_read_lock must be held on entry.
3382  */
3383 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
3384                        struct rps_dev_flow **rflowp)
3385 {
3386         const struct rps_sock_flow_table *sock_flow_table;
3387         struct netdev_rx_queue *rxqueue = dev->_rx;
3388         struct rps_dev_flow_table *flow_table;
3389         struct rps_map *map;
3390         int cpu = -1;
3391         u32 tcpu;
3392         u32 hash;
3393
3394         if (skb_rx_queue_recorded(skb)) {
3395                 u16 index = skb_get_rx_queue(skb);
3396
3397                 if (unlikely(index >= dev->real_num_rx_queues)) {
3398                         WARN_ONCE(dev->real_num_rx_queues > 1,
3399                                   "%s received packet on queue %u, but number "
3400                                   "of RX queues is %u\n",
3401                                   dev->name, index, dev->real_num_rx_queues);
3402                         goto done;
3403                 }
3404                 rxqueue += index;
3405         }
3406
3407         /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
3408
3409         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3410         map = rcu_dereference(rxqueue->rps_map);
3411         if (!flow_table && !map)
3412                 goto done;
3413
3414         skb_reset_network_header(skb);
3415         hash = skb_get_hash(skb);
3416         if (!hash)
3417                 goto done;
3418
3419         sock_flow_table = rcu_dereference(rps_sock_flow_table);
3420         if (flow_table && sock_flow_table) {
3421                 struct rps_dev_flow *rflow;
3422                 u32 next_cpu;
3423                 u32 ident;
3424
3425                 /* First check into global flow table if there is a match */
3426                 ident = sock_flow_table->ents[hash & sock_flow_table->mask];
3427                 if ((ident ^ hash) & ~rps_cpu_mask)
3428                         goto try_rps;
3429
3430                 next_cpu = ident & rps_cpu_mask;
3431
3432                 /* OK, now we know there is a match,
3433                  * we can look at the local (per receive queue) flow table
3434                  */
3435                 rflow = &flow_table->flows[hash & flow_table->mask];
3436                 tcpu = rflow->cpu;
3437
3438                 /*
3439                  * If the desired CPU (where last recvmsg was done) is
3440                  * different from current CPU (one in the rx-queue flow
3441                  * table entry), switch if one of the following holds:
3442                  *   - Current CPU is unset (>= nr_cpu_ids).
3443                  *   - Current CPU is offline.
3444                  *   - The current CPU's queue tail has advanced beyond the
3445                  *     last packet that was enqueued using this table entry.
3446                  *     This guarantees that all previous packets for the flow
3447                  *     have been dequeued, thus preserving in order delivery.
3448                  */
3449                 if (unlikely(tcpu != next_cpu) &&
3450                     (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
3451                      ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3452                       rflow->last_qtail)) >= 0)) {
3453                         tcpu = next_cpu;
3454                         rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3455                 }
3456
3457                 if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
3458                         *rflowp = rflow;
3459                         cpu = tcpu;
3460                         goto done;
3461                 }
3462         }
3463
3464 try_rps:
3465
3466         if (map) {
3467                 tcpu = map->cpus[reciprocal_scale(hash, map->len)];
3468                 if (cpu_online(tcpu)) {
3469                         cpu = tcpu;
3470                         goto done;
3471                 }
3472         }
3473
3474 done:
3475         return cpu;
3476 }
3477
3478 #ifdef CONFIG_RFS_ACCEL
3479
3480 /**
3481  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3482  * @dev: Device on which the filter was set
3483  * @rxq_index: RX queue index
3484  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3485  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3486  *
3487  * Drivers that implement ndo_rx_flow_steer() should periodically call
3488  * this function for each installed filter and remove the filters for
3489  * which it returns %true.
3490  */
3491 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3492                          u32 flow_id, u16 filter_id)
3493 {
3494         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3495         struct rps_dev_flow_table *flow_table;
3496         struct rps_dev_flow *rflow;
3497         bool expire = true;
3498         unsigned int cpu;
3499
3500         rcu_read_lock();
3501         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3502         if (flow_table && flow_id <= flow_table->mask) {
3503                 rflow = &flow_table->flows[flow_id];
3504                 cpu = ACCESS_ONCE(rflow->cpu);
3505                 if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
3506                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3507                            rflow->last_qtail) <
3508                      (int)(10 * flow_table->mask)))
3509                         expire = false;
3510         }
3511         rcu_read_unlock();
3512         return expire;
3513 }
3514 EXPORT_SYMBOL(rps_may_expire_flow);
3515
3516 #endif /* CONFIG_RFS_ACCEL */
3517
3518 /* Called from hardirq (IPI) context */
3519 static void rps_trigger_softirq(void *data)
3520 {
3521         struct softnet_data *sd = data;
3522
3523         ____napi_schedule(sd, &sd->backlog);
3524         sd->received_rps++;
3525 }
3526
3527 #endif /* CONFIG_RPS */
3528
3529 /*
3530  * Check if this softnet_data structure is another cpu one
3531  * If yes, queue it to our IPI list and return 1
3532  * If no, return 0
3533  */
3534 static int rps_ipi_queued(struct softnet_data *sd)
3535 {
3536 #ifdef CONFIG_RPS
3537         struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
3538
3539         if (sd != mysd) {
3540                 sd->rps_ipi_next = mysd->rps_ipi_list;
3541                 mysd->rps_ipi_list = sd;
3542
3543                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3544                 return 1;
3545         }
3546 #endif /* CONFIG_RPS */
3547         return 0;
3548 }
3549
3550 #ifdef CONFIG_NET_FLOW_LIMIT
3551 int netdev_flow_limit_table_len __read_mostly = (1 << 12);
3552 #endif
3553
3554 static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
3555 {
3556 #ifdef CONFIG_NET_FLOW_LIMIT
3557         struct sd_flow_limit *fl;
3558         struct softnet_data *sd;
3559         unsigned int old_flow, new_flow;
3560
3561         if (qlen < (netdev_max_backlog >> 1))
3562                 return false;
3563
3564         sd = this_cpu_ptr(&softnet_data);
3565
3566         rcu_read_lock();
3567         fl = rcu_dereference(sd->flow_limit);
3568         if (fl) {
3569                 new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
3570                 old_flow = fl->history[fl->history_head];
3571                 fl->history[fl->history_head] = new_flow;
3572
3573                 fl->history_head++;
3574                 fl->history_head &= FLOW_LIMIT_HISTORY - 1;
3575
3576                 if (likely(fl->buckets[old_flow]))
3577                         fl->buckets[old_flow]--;
3578
3579                 if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
3580                         fl->count++;
3581                         rcu_read_unlock();
3582                         return true;
3583                 }
3584         }
3585         rcu_read_unlock();
3586 #endif
3587         return false;
3588 }
3589
3590 /*
3591  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3592  * queue (may be a remote CPU queue).
3593  */
3594 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3595                               unsigned int *qtail)
3596 {
3597         struct softnet_data *sd;
3598         unsigned long flags;
3599         unsigned int qlen;
3600
3601         sd = &per_cpu(softnet_data, cpu);
3602
3603         local_irq_save(flags);
3604
3605         rps_lock(sd);
3606         if (!netif_running(skb->dev))
3607                 goto drop;
3608         qlen = skb_queue_len(&sd->input_pkt_queue);
3609         if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
3610                 if (qlen) {
3611 enqueue:
3612                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3613                         input_queue_tail_incr_save(sd, qtail);
3614                         rps_unlock(sd);
3615                         local_irq_restore(flags);
3616                         return NET_RX_SUCCESS;
3617                 }
3618
3619                 /* Schedule NAPI for backlog device
3620                  * We can use non atomic operation since we own the queue lock
3621                  */
3622                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3623                         if (!rps_ipi_queued(sd))
3624                                 ____napi_schedule(sd, &sd->backlog);
3625                 }
3626                 goto enqueue;
3627         }
3628
3629 drop:
3630         sd->dropped++;
3631         rps_unlock(sd);
3632
3633         local_irq_restore(flags);
3634
3635         atomic_long_inc(&skb->dev->rx_dropped);
3636         kfree_skb(skb);
3637         return NET_RX_DROP;
3638 }
3639
3640 static int netif_rx_internal(struct sk_buff *skb)
3641 {
3642         int ret;
3643
3644         net_timestamp_check(netdev_tstamp_prequeue, skb);
3645
3646         trace_netif_rx(skb);
3647 #ifdef CONFIG_RPS
3648         if (static_key_false(&rps_needed)) {
3649                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3650                 int cpu;
3651
3652                 preempt_disable();
3653                 rcu_read_lock();
3654
3655                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3656                 if (cpu < 0)
3657                         cpu = smp_processor_id();
3658
3659                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3660
3661                 rcu_read_unlock();
3662                 preempt_enable();
3663         } else
3664 #endif
3665         {
3666                 unsigned int qtail;
3667                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3668                 put_cpu();
3669         }
3670         return ret;
3671 }
3672
3673 /**
3674  *      netif_rx        -       post buffer to the network code
3675  *      @skb: buffer to post
3676  *
3677  *      This function receives a packet from a device driver and queues it for
3678  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3679  *      may be dropped during processing for congestion control or by the
3680  *      protocol layers.
3681  *
3682  *      return values:
3683  *      NET_RX_SUCCESS  (no congestion)
3684  *      NET_RX_DROP     (packet was dropped)
3685  *
3686  */
3687
3688 int netif_rx(struct sk_buff *skb)
3689 {
3690         trace_netif_rx_entry(skb);
3691
3692         return netif_rx_internal(skb);
3693 }
3694 EXPORT_SYMBOL(netif_rx);
3695
3696 int netif_rx_ni(struct sk_buff *skb)
3697 {
3698         int err;
3699
3700         trace_netif_rx_ni_entry(skb);
3701
3702         preempt_disable();
3703         err = netif_rx_internal(skb);
3704         if (local_softirq_pending())
3705                 do_softirq();
3706         preempt_enable();
3707
3708         return err;
3709 }
3710 EXPORT_SYMBOL(netif_rx_ni);
3711
3712 static void net_tx_action(struct softirq_action *h)
3713 {
3714         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
3715
3716         if (sd->completion_queue) {
3717                 struct sk_buff *clist;
3718
3719                 local_irq_disable();
3720                 clist = sd->completion_queue;
3721                 sd->completion_queue = NULL;
3722                 local_irq_enable();
3723
3724                 while (clist) {
3725                         struct sk_buff *skb = clist;
3726                         clist = clist->next;
3727
3728                         WARN_ON(atomic_read(&skb->users));
3729                         if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
3730                                 trace_consume_skb(skb);
3731                         else
3732                                 trace_kfree_skb(skb, net_tx_action);
3733
3734                         if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
3735                                 __kfree_skb(skb);
3736                         else
3737                                 __kfree_skb_defer(skb);
3738                 }
3739
3740                 __kfree_skb_flush();
3741         }
3742
3743         if (sd->output_queue) {
3744                 struct Qdisc *head;
3745
3746                 local_irq_disable();
3747                 head = sd->output_queue;
3748                 sd->output_queue = NULL;
3749                 sd->output_queue_tailp = &sd->output_queue;
3750                 local_irq_enable();
3751
3752                 while (head) {
3753                         struct Qdisc *q = head;
3754                         spinlock_t *root_lock;
3755
3756                         head = head->next_sched;
3757
3758                         root_lock = qdisc_lock(q);
3759                         spin_lock(root_lock);
3760                         /* We need to make sure head->next_sched is read
3761                          * before clearing __QDISC_STATE_SCHED
3762                          */
3763                         smp_mb__before_atomic();
3764                         clear_bit(__QDISC_STATE_SCHED, &q->state);
3765                         qdisc_run(q);
3766                         spin_unlock(root_lock);
3767                 }
3768         }
3769 }
3770
3771 #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
3772 /* This hook is defined here for ATM LANE */
3773 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3774                              unsigned char *addr) __read_mostly;
3775 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3776 #endif
3777
3778 static inline struct sk_buff *
3779 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
3780                    struct net_device *orig_dev)
3781 {
3782 #ifdef CONFIG_NET_CLS_ACT
3783         struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
3784         struct tcf_result cl_res;
3785
3786         /* If there's at least one ingress present somewhere (so
3787          * we get here via enabled static key), remaining devices
3788          * that are not configured with an ingress qdisc will bail
3789          * out here.
3790          */
3791         if (!cl)
3792                 return skb;
3793         if (*pt_prev) {
3794                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3795                 *pt_prev = NULL;
3796         }
3797
3798         qdisc_skb_cb(skb)->pkt_len = skb->len;
3799         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3800         qdisc_bstats_cpu_update(cl->q, skb);
3801
3802         switch (tc_classify(skb, cl, &cl_res, false)) {
3803         case TC_ACT_OK:
3804         case TC_ACT_RECLASSIFY:
3805                 skb->tc_index = TC_H_MIN(cl_res.classid);
3806                 break;
3807         case TC_ACT_SHOT:
3808                 qdisc_qstats_cpu_drop(cl->q);
3809                 kfree_skb(skb);
3810                 return NULL;
3811         case TC_ACT_STOLEN:
3812         case TC_ACT_QUEUED:
3813                 consume_skb(skb);
3814                 return NULL;
3815         case TC_ACT_REDIRECT:
3816                 /* skb_mac_header check was done by cls/act_bpf, so
3817                  * we can safely push the L2 header back before
3818                  * redirecting to another netdev
3819                  */
3820                 __skb_push(skb, skb->mac_len);
3821                 skb_do_redirect(skb);
3822                 return NULL;
3823         default:
3824                 break;
3825         }
3826 #endif /* CONFIG_NET_CLS_ACT */
3827         return skb;
3828 }
3829
3830 /**
3831  *      netdev_is_rx_handler_busy - check if receive handler is registered
3832  *      @dev: device to check
3833  *
3834  *      Check if a receive handler is already registered for a given device.
3835  *      Return true if there one.
3836  *
3837  *      The caller must hold the rtnl_mutex.
3838  */
3839 bool netdev_is_rx_handler_busy(struct net_device *dev)
3840 {
3841         ASSERT_RTNL();
3842         return dev && rtnl_dereference(dev->rx_handler);
3843 }
3844 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3845
3846 /**
3847  *      netdev_rx_handler_register - register receive handler
3848  *      @dev: device to register a handler for
3849  *      @rx_handler: receive handler to register
3850  *      @rx_handler_data: data pointer that is used by rx handler
3851  *
3852  *      Register a receive handler for a device. This handler will then be
3853  *      called from __netif_receive_skb. A negative errno code is returned
3854  *      on a failure.
3855  *
3856  *      The caller must hold the rtnl_mutex.
3857  *
3858  *      For a general description of rx_handler, see enum rx_handler_result.
3859  */
3860 int netdev_rx_handler_register(struct net_device *dev,
3861                                rx_handler_func_t *rx_handler,
3862                                void *rx_handler_data)
3863 {
3864         ASSERT_RTNL();
3865
3866         if (dev->rx_handler)
3867                 return -EBUSY;
3868
3869         /* Note: rx_handler_data must be set before rx_handler */
3870         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3871         rcu_assign_pointer(dev->rx_handler, rx_handler);
3872
3873         return 0;
3874 }
3875 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3876
3877 /**
3878  *      netdev_rx_handler_unregister - unregister receive handler
3879  *      @dev: device to unregister a handler from
3880  *
3881  *      Unregister a receive handler from a device.
3882  *
3883  *      The caller must hold the rtnl_mutex.
3884  */
3885 void netdev_rx_handler_unregister(struct net_device *dev)
3886 {
3887
3888         ASSERT_RTNL();
3889         RCU_INIT_POINTER(dev->rx_handler, NULL);
3890         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3891          * section has a guarantee to see a non NULL rx_handler_data
3892          * as well.
3893          */
3894         synchronize_net();
3895         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3896 }
3897 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3898
3899 /*
3900  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3901  * the special handling of PFMEMALLOC skbs.
3902  */
3903 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3904 {
3905         switch (skb->protocol) {
3906         case htons(ETH_P_ARP):
3907         case htons(ETH_P_IP):
3908         case htons(ETH_P_IPV6):
3909         case htons(ETH_P_8021Q):
3910         case htons(ETH_P_8021AD):
3911                 return true;
3912         default:
3913                 return false;
3914         }
3915 }
3916
3917 static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
3918                              int *ret, struct net_device *orig_dev)
3919 {
3920 #ifdef CONFIG_NETFILTER_INGRESS
3921         if (nf_hook_ingress_active(skb)) {
3922                 int ingress_retval;
3923
3924                 if (*pt_prev) {
3925                         *ret = deliver_skb(skb, *pt_prev, orig_dev);
3926                         *pt_prev = NULL;
3927                 }
3928
3929                 rcu_read_lock();
3930                 ingress_retval = nf_hook_ingress(skb);
3931                 rcu_read_unlock();
3932                 return ingress_retval;
3933         }
3934 #endif /* CONFIG_NETFILTER_INGRESS */
3935         return 0;
3936 }
3937
3938 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3939 {
3940         struct packet_type *ptype, *pt_prev;
3941         rx_handler_func_t *rx_handler;
3942         struct net_device *orig_dev;
3943         bool deliver_exact = false;
3944         int ret = NET_RX_DROP;
3945         __be16 type;
3946
3947         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3948
3949         trace_netif_receive_skb(skb);
3950
3951         orig_dev = skb->dev;
3952
3953         skb_reset_network_header(skb);
3954         if (!skb_transport_header_was_set(skb))
3955                 skb_reset_transport_header(skb);
3956         skb_reset_mac_len(skb);
3957
3958         pt_prev = NULL;
3959
3960 another_round:
3961         skb->skb_iif = skb->dev->ifindex;
3962
3963         __this_cpu_inc(softnet_data.processed);
3964
3965         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3966             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3967                 skb = skb_vlan_untag(skb);
3968                 if (unlikely(!skb))
3969                         goto out;
3970         }
3971
3972 #ifdef CONFIG_NET_CLS_ACT
3973         if (skb->tc_verd & TC_NCLS) {
3974                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3975                 goto ncls;
3976         }
3977 #endif
3978
3979         if (pfmemalloc)
3980                 goto skip_taps;
3981
3982         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3983                 if (pt_prev)
3984                         ret = deliver_skb(skb, pt_prev, orig_dev);
3985                 pt_prev = ptype;
3986         }
3987
3988         list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
3989                 if (pt_prev)
3990                         ret = deliver_skb(skb, pt_prev, orig_dev);
3991                 pt_prev = ptype;
3992         }
3993
3994 skip_taps:
3995 #ifdef CONFIG_NET_INGRESS
3996         if (static_key_false(&ingress_needed)) {
3997                 skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
3998                 if (!skb)
3999                         goto out;
4000
4001                 if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
4002                         goto out;
4003         }
4004 #endif
4005 #ifdef CONFIG_NET_CLS_ACT
4006         skb->tc_verd = 0;
4007 ncls:
4008 #endif
4009         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
4010                 goto drop;
4011
4012         if (skb_vlan_tag_present(skb)) {
4013                 if (pt_prev) {
4014                         ret = deliver_skb(skb, pt_prev, orig_dev);
4015                         pt_prev = NULL;
4016                 }
4017                 if (vlan_do_receive(&skb))
4018                         goto another_round;
4019                 else if (unlikely(!skb))
4020                         goto out;
4021         }
4022
4023         rx_handler = rcu_dereference(skb->dev->rx_handler);
4024         if (rx_handler) {
4025                 if (pt_prev) {
4026                         ret = deliver_skb(skb, pt_prev, orig_dev);
4027                         pt_prev = NULL;
4028                 }
4029                 switch (rx_handler(&skb)) {
4030                 case RX_HANDLER_CONSUMED:
4031                         ret = NET_RX_SUCCESS;
4032                         goto out;
4033                 case RX_HANDLER_ANOTHER:
4034                         goto another_round;
4035                 case RX_HANDLER_EXACT:
4036                         deliver_exact = true;
4037                 case RX_HANDLER_PASS:
4038                         break;
4039                 default:
4040                         BUG();
4041                 }
4042         }
4043
4044         if (unlikely(skb_vlan_tag_present(skb))) {
4045                 if (skb_vlan_tag_get_id(skb))
4046                         skb->pkt_type = PACKET_OTHERHOST;
4047                 /* Note: we might in the future use prio bits
4048                  * and set skb->priority like in vlan_do_receive()
4049                  * For the time being, just ignore Priority Code Point
4050                  */
4051                 skb->vlan_tci = 0;
4052         }
4053
4054         type = skb->protocol;
4055
4056         /* deliver only exact match when indicated */
4057         if (likely(!deliver_exact)) {
4058                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4059                                        &ptype_base[ntohs(type) &
4060                                                    PTYPE_HASH_MASK]);
4061         }
4062
4063         deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4064                                &orig_dev->ptype_specific);
4065
4066         if (unlikely(skb->dev != orig_dev)) {
4067                 deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
4068                                        &skb->dev->ptype_specific);
4069         }
4070
4071         if (pt_prev) {
4072                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
4073                         goto drop;
4074                 else
4075                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
4076         } else {
4077 drop:
4078                 if (!deliver_exact)
4079                         atomic_long_inc(&skb->dev->rx_dropped);
4080                 else
4081                         atomic_long_inc(&skb->dev->rx_nohandler);
4082                 kfree_skb(skb);
4083                 /* Jamal, now you will not able to escape explaining
4084                  * me how you were going to use this. :-)
4085                  */
4086                 ret = NET_RX_DROP;
4087         }
4088
4089 out:
4090         return ret;
4091 }
4092
4093 static int __netif_receive_skb(struct sk_buff *skb)
4094 {
4095         int ret;
4096
4097         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
4098                 unsigned long pflags = current->flags;
4099
4100                 /*
4101                  * PFMEMALLOC skbs are special, they should
4102                  * - be delivered to SOCK_MEMALLOC sockets only
4103                  * - stay away from userspace
4104                  * - have bounded memory usage
4105                  *
4106                  * Use PF_MEMALLOC as this saves us from propagating the allocation
4107                  * context down to all allocation sites.
4108                  */
4109                 current->flags |= PF_MEMALLOC;
4110                 ret = __netif_receive_skb_core(skb, true);
4111                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
4112         } else
4113                 ret = __netif_receive_skb_core(skb, false);
4114
4115         return ret;
4116 }
4117
4118 static int netif_receive_skb_internal(struct sk_buff *skb)
4119 {
4120         int ret;
4121
4122         net_timestamp_check(netdev_tstamp_prequeue, skb);
4123
4124         if (skb_defer_rx_timestamp(skb))
4125                 return NET_RX_SUCCESS;
4126
4127         rcu_read_lock();
4128
4129 #ifdef CONFIG_RPS
4130         if (static_key_false(&rps_needed)) {
4131                 struct rps_dev_flow voidflow, *rflow = &voidflow;
4132                 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
4133
4134                 if (cpu >= 0) {
4135                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4136                         rcu_read_unlock();
4137                         return ret;
4138                 }
4139         }
4140 #endif
4141         ret = __netif_receive_skb(skb);
4142         rcu_read_unlock();
4143         return ret;
4144 }
4145
4146 /**
4147  *      netif_receive_skb - process receive buffer from network
4148  *      @skb: buffer to process
4149  *
4150  *      netif_receive_skb() is the main receive data processing function.
4151  *      It always succeeds. The buffer may be dropped during processing
4152  *      for congestion control or by the protocol layers.
4153  *
4154  *      This function may only be called from softirq context and interrupts
4155  *      should be enabled.
4156  *
4157  *      Return values (usually ignored):
4158  *      NET_RX_SUCCESS: no congestion
4159  *      NET_RX_DROP: packet was dropped
4160  */
4161 int netif_receive_skb(struct sk_buff *skb)
4162 {
4163         trace_netif_receive_skb_entry(skb);
4164
4165         return netif_receive_skb_internal(skb);
4166 }
4167 EXPORT_SYMBOL(netif_receive_skb);
4168
4169 DEFINE_PER_CPU(struct work_struct, flush_works);
4170
4171 /* Network device is going away, flush any packets still pending */
4172 static void flush_backlog(struct work_struct *work)
4173 {
4174         struct sk_buff *skb, *tmp;
4175         struct softnet_data *sd;
4176
4177         local_bh_disable();
4178         sd = this_cpu_ptr(&softnet_data);
4179
4180         local_irq_disable();
4181         rps_lock(sd);
4182         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
4183                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4184                         __skb_unlink(skb, &sd->input_pkt_queue);
4185                         kfree_skb(skb);
4186                         input_queue_head_incr(sd);
4187                 }
4188         }
4189         rps_unlock(sd);
4190         local_irq_enable();
4191
4192         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
4193                 if (skb->dev->reg_state == NETREG_UNREGISTERING) {
4194                         __skb_unlink(skb, &sd->process_queue);
4195                         kfree_skb(skb);
4196                         input_queue_head_incr(sd);
4197                 }
4198         }
4199         local_bh_enable();
4200 }
4201
4202 static void flush_all_backlogs(void)
4203 {
4204         unsigned int cpu;
4205
4206         get_online_cpus();
4207
4208         for_each_online_cpu(cpu)
4209                 queue_work_on(cpu, system_highpri_wq,
4210                               per_cpu_ptr(&flush_works, cpu));
4211
4212         for_each_online_cpu(cpu)
4213                 flush_work(per_cpu_ptr(&flush_works, cpu));
4214
4215         put_online_cpus();
4216 }
4217
4218 static int napi_gro_complete(struct sk_buff *skb)
4219 {
4220         struct packet_offload *ptype;
4221         __be16 type = skb->protocol;
4222         struct list_head *head = &offload_base;
4223         int err = -ENOENT;
4224
4225         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
4226
4227         if (NAPI_GRO_CB(skb)->count == 1) {
4228                 skb_shinfo(skb)->gso_size = 0;
4229                 goto out;
4230         }
4231
4232         rcu_read_lock();
4233         list_for_each_entry_rcu(ptype, head, list) {
4234                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4235                         continue;
4236
4237                 err = ptype->callbacks.gro_complete(skb, 0);
4238                 break;
4239         }
4240         rcu_read_unlock();
4241
4242         if (err) {
4243                 WARN_ON(&ptype->list == head);
4244                 kfree_skb(skb);
4245                 return NET_RX_SUCCESS;
4246         }
4247
4248 out:
4249         return netif_receive_skb_internal(skb);
4250 }
4251
4252 /* napi->gro_list contains packets ordered by age.
4253  * youngest packets at the head of it.
4254  * Complete skbs in reverse order to reduce latencies.
4255  */
4256 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
4257 {
4258         struct sk_buff *skb, *prev = NULL;
4259
4260         /* scan list and build reverse chain */
4261         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
4262                 skb->prev = prev;
4263                 prev = skb;
4264         }
4265
4266         for (skb = prev; skb; skb = prev) {
4267                 skb->next = NULL;
4268
4269                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
4270                         return;
4271
4272                 prev = skb->prev;
4273                 napi_gro_complete(skb);
4274                 napi->gro_count--;
4275         }
4276
4277         napi->gro_list = NULL;
4278 }
4279 EXPORT_SYMBOL(napi_gro_flush);
4280
4281 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
4282 {
4283         struct sk_buff *p;
4284         unsigned int maclen = skb->dev->hard_header_len;
4285         u32 hash = skb_get_hash_raw(skb);
4286
4287         for (p = napi->gro_list; p; p = p->next) {
4288                 unsigned long diffs;
4289
4290                 NAPI_GRO_CB(p)->flush = 0;
4291
4292                 if (hash != skb_get_hash_raw(p)) {
4293                         NAPI_GRO_CB(p)->same_flow = 0;
4294                         continue;
4295                 }
4296
4297                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
4298                 diffs |= p->vlan_tci ^ skb->vlan_tci;
4299                 diffs |= skb_metadata_dst_cmp(p, skb);
4300                 if (maclen == ETH_HLEN)
4301                         diffs |= compare_ether_header(skb_mac_header(p),
4302                                                       skb_mac_header(skb));
4303                 else if (!diffs)
4304                         diffs = memcmp(skb_mac_header(p),
4305                                        skb_mac_header(skb),
4306                                        maclen);
4307                 NAPI_GRO_CB(p)->same_flow = !diffs;
4308         }
4309 }
4310
4311 static void skb_gro_reset_offset(struct sk_buff *skb)
4312 {
4313         const struct skb_shared_info *pinfo = skb_shinfo(skb);
4314         const skb_frag_t *frag0 = &pinfo->frags[0];
4315
4316         NAPI_GRO_CB(skb)->data_offset = 0;
4317         NAPI_GRO_CB(skb)->frag0 = NULL;
4318         NAPI_GRO_CB(skb)->frag0_len = 0;
4319
4320         if (skb_mac_header(skb) == skb_tail_pointer(skb) &&
4321             pinfo->nr_frags &&
4322             !PageHighMem(skb_frag_page(frag0))) {
4323                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
4324                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
4325         }
4326 }
4327
4328 static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
4329 {
4330         struct skb_shared_info *pinfo = skb_shinfo(skb);
4331
4332         BUG_ON(skb->end - skb->tail < grow);
4333
4334         memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
4335
4336         skb->data_len -= grow;
4337         skb->tail += grow;
4338
4339         pinfo->frags[0].page_offset += grow;
4340         skb_frag_size_sub(&pinfo->frags[0], grow);
4341
4342         if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
4343                 skb_frag_unref(skb, 0);
4344                 memmove(pinfo->frags, pinfo->frags + 1,
4345                         --pinfo->nr_frags * sizeof(pinfo->frags[0]));
4346         }
4347 }
4348
4349 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4350 {
4351         struct sk_buff **pp = NULL;
4352         struct packet_offload *ptype;
4353         __be16 type = skb->protocol;
4354         struct list_head *head = &offload_base;
4355         int same_flow;
4356         enum gro_result ret;
4357         int grow;
4358
4359         if (!(skb->dev->features & NETIF_F_GRO))
4360                 goto normal;
4361
4362         if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
4363                 goto normal;
4364
4365         gro_list_prepare(napi, skb);
4366
4367         rcu_read_lock();
4368         list_for_each_entry_rcu(ptype, head, list) {
4369                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4370                         continue;
4371
4372                 skb_set_network_header(skb, skb_gro_offset(skb));
4373                 skb_reset_mac_len(skb);
4374                 NAPI_GRO_CB(skb)->same_flow = 0;
4375                 NAPI_GRO_CB(skb)->flush = 0;
4376                 NAPI_GRO_CB(skb)->free = 0;
4377                 NAPI_GRO_CB(skb)->encap_mark = 0;
4378                 NAPI_GRO_CB(skb)->is_fou = 0;
4379                 NAPI_GRO_CB(skb)->is_atomic = 1;
4380                 NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
4381
4382                 /* Setup for GRO checksum validation */
4383                 switch (skb->ip_summed) {
4384                 case CHECKSUM_COMPLETE:
4385                         NAPI_GRO_CB(skb)->csum = skb->csum;
4386                         NAPI_GRO_CB(skb)->csum_valid = 1;
4387                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4388                         break;
4389                 case CHECKSUM_UNNECESSARY:
4390                         NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
4391                         NAPI_GRO_CB(skb)->csum_valid = 0;
4392                         break;
4393                 default:
4394                         NAPI_GRO_CB(skb)->csum_cnt = 0;
4395                         NAPI_GRO_CB(skb)->csum_valid = 0;
4396                 }
4397
4398                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
4399                 break;
4400         }
4401         rcu_read_unlock();
4402
4403         if (&ptype->list == head)
4404                 goto normal;
4405
4406         same_flow = NAPI_GRO_CB(skb)->same_flow;
4407         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
4408
4409         if (pp) {
4410                 struct sk_buff *nskb = *pp;
4411
4412                 *pp = nskb->next;
4413                 nskb->next = NULL;
4414                 napi_gro_complete(nskb);
4415                 napi->gro_count--;
4416         }
4417
4418         if (same_flow)
4419                 goto ok;
4420
4421         if (NAPI_GRO_CB(skb)->flush)
4422                 goto normal;
4423
4424         if (unlikely(napi->gro_count >= MAX_GRO_SKBS)) {
4425                 struct sk_buff *nskb = napi->gro_list;
4426
4427                 /* locate the end of the list to select the 'oldest' flow */
4428                 while (nskb->next) {
4429                         pp = &nskb->next;
4430                         nskb = *pp;
4431                 }
4432                 *pp = NULL;
4433                 nskb->next = NULL;
4434                 napi_gro_complete(nskb);
4435         } else {
4436                 napi->gro_count++;
4437         }
4438         NAPI_GRO_CB(skb)->count = 1;
4439         NAPI_GRO_CB(skb)->age = jiffies;
4440         NAPI_GRO_CB(skb)->last = skb;
4441         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
4442         skb->next = napi->gro_list;
4443         napi->gro_list = skb;
4444         ret = GRO_HELD;
4445
4446 pull:
4447         grow = skb_gro_offset(skb) - skb_headlen(skb);
4448         if (grow > 0)
4449                 gro_pull_from_frag0(skb, grow);
4450 ok:
4451         return ret;
4452
4453 normal:
4454         ret = GRO_NORMAL;
4455         goto pull;
4456 }
4457
4458 struct packet_offload *gro_find_receive_by_type(__be16 type)
4459 {
4460         struct list_head *offload_head = &offload_base;
4461         struct packet_offload *ptype;
4462
4463         list_for_each_entry_rcu(ptype, offload_head, list) {
4464                 if (ptype->type != type || !ptype->callbacks.gro_receive)
4465                         continue;
4466                 return ptype;
4467         }
4468         return NULL;
4469 }
4470 EXPORT_SYMBOL(gro_find_receive_by_type);
4471
4472 struct packet_offload *gro_find_complete_by_type(__be16 type)
4473 {
4474         struct list_head *offload_head = &offload_base;
4475         struct packet_offload *ptype;
4476
4477         list_for_each_entry_rcu(ptype, offload_head, list) {
4478                 if (ptype->type != type || !ptype->callbacks.gro_complete)
4479                         continue;
4480                 return ptype;
4481         }
4482         return NULL;
4483 }
4484 EXPORT_SYMBOL(gro_find_complete_by_type);
4485
4486 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
4487 {
4488         switch (ret) {
4489         case GRO_NORMAL:
4490                 if (netif_receive_skb_internal(skb))
4491                         ret = GRO_DROP;
4492                 break;
4493
4494         case GRO_DROP:
4495                 kfree_skb(skb);
4496                 break;
4497
4498         case GRO_MERGED_FREE:
4499                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
4500                         skb_dst_drop(skb);
4501                         kmem_cache_free(skbuff_head_cache, skb);
4502                 } else {
4503                         __kfree_skb(skb);
4504                 }
4505                 break;
4506
4507         case GRO_HELD:
4508         case GRO_MERGED:
4509                 break;
4510         }
4511
4512         return ret;
4513 }
4514
4515 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
4516 {
4517         skb_mark_napi_id(skb, napi);
4518         trace_napi_gro_receive_entry(skb);
4519
4520         skb_gro_reset_offset(skb);
4521
4522         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
4523 }
4524 EXPORT_SYMBOL(napi_gro_receive);
4525
4526 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
4527 {
4528         if (unlikely(skb->pfmemalloc)) {
4529                 consume_skb(skb);
4530                 return;
4531         }
4532         __skb_pull(skb, skb_headlen(skb));
4533         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
4534         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
4535         skb->vlan_tci = 0;
4536         skb->dev = napi->dev;
4537         skb->skb_iif = 0;
4538         skb->encapsulation = 0;
4539         skb_shinfo(skb)->gso_type = 0;
4540         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
4541
4542         napi->skb = skb;
4543 }
4544
4545 struct sk_buff *napi_get_frags(struct napi_struct *napi)
4546 {
4547         struct sk_buff *skb = napi->skb;
4548
4549         if (!skb) {
4550                 skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
4551                 if (skb) {
4552                         napi->skb = skb;
4553                         skb_mark_napi_id(skb, napi);
4554                 }
4555         }
4556         return skb;
4557 }
4558 EXPORT_SYMBOL(napi_get_frags);
4559
4560 static gro_result_t napi_frags_finish(struct napi_struct *napi,
4561                                       struct sk_buff *skb,
4562                                       gro_result_t ret)
4563 {
4564         switch (ret) {
4565         case GRO_NORMAL:
4566         case GRO_HELD:
4567                 __skb_push(skb, ETH_HLEN);
4568                 skb->protocol = eth_type_trans(skb, skb->dev);
4569                 if (ret == GRO_NORMAL && netif_receive_skb_internal(skb))
4570                         ret = GRO_DROP;
4571                 break;
4572
4573         case GRO_DROP:
4574         case GRO_MERGED_FREE:
4575                 napi_reuse_skb(napi, skb);
4576                 break;
4577
4578         case GRO_MERGED:
4579                 break;
4580         }
4581
4582         return ret;
4583 }
4584
4585 /* Upper GRO stack assumes network header starts at gro_offset=0
4586  * Drivers could call both napi_gro_frags() and napi_gro_receive()
4587  * We copy ethernet header into skb->data to have a common layout.
4588  */
4589 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4590 {
4591         struct sk_buff *skb = napi->skb;
4592         const struct ethhdr *eth;
4593         unsigned int hlen = sizeof(*eth);
4594
4595         napi->skb = NULL;
4596
4597         skb_reset_mac_header(skb);
4598         skb_gro_reset_offset(skb);
4599
4600         eth = skb_gro_header_fast(skb, 0);
4601         if (unlikely(skb_gro_header_hard(skb, hlen))) {
4602                 eth = skb_gro_header_slow(skb, hlen, 0);
4603                 if (unlikely(!eth)) {
4604                         net_warn_ratelimited("%s: dropping impossible skb from %s\n",
4605                                              __func__, napi->dev->name);
4606                         napi_reuse_skb(napi, skb);
4607                         return NULL;
4608                 }
4609         } else {
4610                 gro_pull_from_frag0(skb, hlen);
4611                 NAPI_GRO_CB(skb)->frag0 += hlen;
4612                 NAPI_GRO_CB(skb)->frag0_len -= hlen;
4613         }
4614         __skb_pull(skb, hlen);
4615
4616         /*
4617          * This works because the only protocols we care about don't require
4618          * special handling.
4619          * We'll fix it up properly in napi_frags_finish()
4620          */
4621         skb->protocol = eth->h_proto;
4622
4623         return skb;
4624 }
4625
4626 gro_result_t napi_gro_frags(struct napi_struct *napi)
4627 {
4628         struct sk_buff *skb = napi_frags_skb(napi);
4629
4630         if (!skb)
4631                 return GRO_DROP;
4632
4633         trace_napi_gro_frags_entry(skb);
4634
4635         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4636 }
4637 EXPORT_SYMBOL(napi_gro_frags);
4638
4639 /* Compute the checksum from gro_offset and return the folded value
4640  * after adding in any pseudo checksum.
4641  */
4642 __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
4643 {
4644         __wsum wsum;
4645         __sum16 sum;
4646
4647         wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
4648
4649         /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
4650         sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
4651         if (likely(!sum)) {
4652                 if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
4653                     !skb->csum_complete_sw)
4654                         netdev_rx_csum_fault(skb->dev);
4655         }
4656
4657         NAPI_GRO_CB(skb)->csum = wsum;
4658         NAPI_GRO_CB(skb)->csum_valid = 1;
4659
4660         return sum;
4661 }
4662 EXPORT_SYMBOL(__skb_gro_checksum_complete);
4663
4664 /*
4665  * net_rps_action_and_irq_enable sends any pending IPI's for rps.
4666  * Note: called with local irq disabled, but exits with local irq enabled.
4667  */
4668 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4669 {
4670 #ifdef CONFIG_RPS
4671         struct softnet_data *remsd = sd->rps_ipi_list;
4672
4673         if (remsd) {
4674                 sd->rps_ipi_list = NULL;
4675
4676                 local_irq_enable();
4677
4678                 /* Send pending IPI's to kick RPS processing on remote cpus. */
4679                 while (remsd) {
4680                         struct softnet_data *next = remsd->rps_ipi_next;
4681
4682                         if (cpu_online(remsd->cpu))
4683                                 smp_call_function_single_async(remsd->cpu,
4684                                                            &remsd->csd);
4685                         remsd = next;
4686                 }
4687         } else
4688 #endif
4689                 local_irq_enable();
4690 }
4691
4692 static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
4693 {
4694 #ifdef CONFIG_RPS
4695         return sd->rps_ipi_list != NULL;
4696 #else
4697         return false;
4698 #endif
4699 }
4700
4701 static int process_backlog(struct napi_struct *napi, int quota)
4702 {
4703         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4704         bool again = true;
4705         int work = 0;
4706
4707         /* Check if we have pending ipi, its better to send them now,
4708          * not waiting net_rx_action() end.
4709          */
4710         if (sd_has_rps_ipi_waiting(sd)) {
4711                 local_irq_disable();
4712                 net_rps_action_and_irq_enable(sd);
4713         }
4714
4715         napi->weight = weight_p;
4716         while (again) {
4717                 struct sk_buff *skb;
4718
4719                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4720                         rcu_read_lock();
4721                         __netif_receive_skb(skb);
4722                         rcu_read_unlock();
4723                         input_queue_head_incr(sd);
4724                         if (++work >= quota)
4725                                 return work;
4726
4727                 }
4728
4729                 local_irq_disable();
4730                 rps_lock(sd);
4731                 if (skb_queue_empty(&sd->input_pkt_queue)) {
4732                         /*
4733                          * Inline a custom version of __napi_complete().
4734                          * only current cpu owns and manipulates this napi,
4735                          * and NAPI_STATE_SCHED is the only possible flag set
4736                          * on backlog.
4737                          * We can use a plain write instead of clear_bit(),
4738                          * and we dont need an smp_mb() memory barrier.
4739                          */
4740                         napi->state = 0;
4741                         again = false;
4742                 } else {
4743                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4744                                                    &sd->process_queue);
4745                 }
4746                 rps_unlock(sd);
4747                 local_irq_enable();
4748         }
4749
4750         return work;
4751 }
4752
4753 /**
4754  * __napi_schedule - schedule for receive
4755  * @n: entry to schedule
4756  *
4757  * The entry's receive function will be scheduled to run.
4758  * Consider using __napi_schedule_irqoff() if hard irqs are masked.
4759  */
4760 void __napi_schedule(struct napi_struct *n)
4761 {
4762         unsigned long flags;
4763
4764         local_irq_save(flags);
4765         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4766         local_irq_restore(flags);
4767 }
4768 EXPORT_SYMBOL(__napi_schedule);
4769
4770 /**
4771  * __napi_schedule_irqoff - schedule for receive
4772  * @n: entry to schedule
4773  *
4774  * Variant of __napi_schedule() assuming hard irqs are masked
4775  */
4776 void __napi_schedule_irqoff(struct napi_struct *n)
4777 {
4778         ____napi_schedule(this_cpu_ptr(&softnet_data), n);
4779 }
4780 EXPORT_SYMBOL(__napi_schedule_irqoff);
4781
4782 void __napi_complete(struct napi_struct *n)
4783 {
4784         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4785
4786         list_del_init(&n->poll_list);
4787         smp_mb__before_atomic();
4788         clear_bit(NAPI_STATE_SCHED, &n->state);
4789 }
4790 EXPORT_SYMBOL(__napi_complete);
4791
4792 void napi_complete_done(struct napi_struct *n, int work_done)
4793 {
4794         unsigned long flags;
4795
4796         /*
4797          * don't let napi dequeue from the cpu poll list
4798          * just in case its running on a different cpu
4799          */
4800         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4801                 return;
4802
4803         if (n->gro_list) {
4804                 unsigned long timeout = 0;
4805
4806                 if (work_done)
4807                         timeout = n->dev->gro_flush_timeout;
4808
4809                 if (timeout)
4810                         hrtimer_start(&n->timer, ns_to_ktime(timeout),
4811                                       HRTIMER_MODE_REL_PINNED);
4812                 else
4813                         napi_gro_flush(n, false);
4814         }
4815         if (likely(list_empty(&n->poll_list))) {
4816                 WARN_ON_ONCE(!test_and_clear_bit(NAPI_STATE_SCHED, &n->state));
4817         } else {
4818                 /* If n->poll_list is not empty, we need to mask irqs */
4819                 local_irq_save(flags);
4820                 __napi_complete(n);
4821                 local_irq_restore(flags);
4822         }
4823 }
4824 EXPORT_SYMBOL(napi_complete_done);
4825
4826 /* must be called under rcu_read_lock(), as we dont take a reference */
4827 static struct napi_struct *napi_by_id(unsigned int napi_id)
4828 {
4829         unsigned int hash = napi_id % HASH_SIZE(napi_hash);
4830         struct napi_struct *napi;
4831
4832         hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
4833                 if (napi->napi_id == napi_id)
4834                         return napi;
4835
4836         return NULL;
4837 }
4838
4839 #if defined(CONFIG_NET_RX_BUSY_POLL)
4840 #define BUSY_POLL_BUDGET 8
4841 bool sk_busy_loop(struct sock *sk, int nonblock)
4842 {
4843         unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
4844         int (*busy_poll)(struct napi_struct *dev);
4845         struct napi_struct *napi;
4846         int rc = false;
4847
4848         rcu_read_lock();
4849
4850         napi = napi_by_id(sk->sk_napi_id);
4851         if (!napi)
4852                 goto out;
4853
4854         /* Note: ndo_busy_poll method is optional in linux-4.5 */
4855         busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
4856
4857         do {
4858                 rc = 0;
4859                 local_bh_disable();
4860                 if (busy_poll) {
4861                         rc = busy_poll(napi);
4862                 } else if (napi_schedule_prep(napi)) {
4863                         void *have = netpoll_poll_lock(napi);
4864
4865                         if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
4866                                 rc = napi->poll(napi, BUSY_POLL_BUDGET);
4867                                 trace_napi_poll(napi, rc, BUSY_POLL_BUDGET);
4868                                 if (rc == BUSY_POLL_BUDGET) {
4869                                         napi_complete_done(napi, rc);
4870                                         napi_schedule(napi);
4871                                 }
4872                         }
4873                         netpoll_poll_unlock(have);
4874                 }
4875                 if (rc > 0)
4876                         __NET_ADD_STATS(sock_net(sk),
4877                                         LINUX_MIB_BUSYPOLLRXPACKETS, rc);
4878                 local_bh_enable();
4879
4880                 if (rc == LL_FLUSH_FAILED)
4881                         break; /* permanent failure */
4882
4883                 cpu_relax();
4884         } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
4885                  !need_resched() && !busy_loop_timeout(end_time));
4886
4887         rc = !skb_queue_empty(&sk->sk_receive_queue);
4888 out:
4889         rcu_read_unlock();
4890         return rc;
4891 }
4892 EXPORT_SYMBOL(sk_busy_loop);
4893
4894 #endif /* CONFIG_NET_RX_BUSY_POLL */
4895
4896 void napi_hash_add(struct napi_struct *napi)
4897 {
4898         if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
4899             test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
4900                 return;
4901
4902         spin_lock(&napi_hash_lock);
4903
4904         /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
4905         do {
4906                 if (unlikely(++napi_gen_id < NR_CPUS + 1))
4907                         napi_gen_id = NR_CPUS + 1;
4908         } while (napi_by_id(napi_gen_id));
4909         napi->napi_id = napi_gen_id;
4910
4911         hlist_add_head_rcu(&napi->napi_hash_node,
4912                            &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
4913
4914         spin_unlock(&napi_hash_lock);
4915 }
4916 EXPORT_SYMBOL_GPL(napi_hash_add);
4917
4918 /* Warning : caller is responsible to make sure rcu grace period
4919  * is respected before freeing memory containing @napi
4920  */
4921 bool napi_hash_del(struct napi_struct *napi)
4922 {
4923         bool rcu_sync_needed = false;
4924
4925         spin_lock(&napi_hash_lock);
4926
4927         if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
4928                 rcu_sync_needed = true;
4929                 hlist_del_rcu(&napi->napi_hash_node);
4930         }
4931         spin_unlock(&napi_hash_lock);
4932         return rcu_sync_needed;
4933 }
4934 EXPORT_SYMBOL_GPL(napi_hash_del);
4935
4936 static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
4937 {
4938         struct napi_struct *napi;
4939
4940         napi = container_of(timer, struct napi_struct, timer);
4941         if (napi->gro_list)
4942                 napi_schedule(napi);
4943
4944         return HRTIMER_NORESTART;
4945 }
4946
4947 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4948                     int (*poll)(struct napi_struct *, int), int weight)
4949 {
4950         INIT_LIST_HEAD(&napi->poll_list);
4951         hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
4952         napi->timer.function = napi_watchdog;
4953         napi->gro_count = 0;
4954         napi->gro_list = NULL;
4955         napi->skb = NULL;
4956         napi->poll = poll;
4957         if (weight > NAPI_POLL_WEIGHT)
4958                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4959                             weight, dev->name);
4960         napi->weight = weight;
4961         list_add(&napi->dev_list, &dev->napi_list);
4962         napi->dev = dev;
4963 #ifdef CONFIG_NETPOLL
4964         spin_lock_init(&napi->poll_lock);
4965         napi->poll_owner = -1;
4966 #endif
4967         set_bit(NAPI_STATE_SCHED, &napi->state);
4968         napi_hash_add(napi);
4969 }
4970 EXPORT_SYMBOL(netif_napi_add);
4971
4972 void napi_disable(struct napi_struct *n)
4973 {
4974         might_sleep();
4975         set_bit(NAPI_STATE_DISABLE, &n->state);
4976
4977         while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
4978                 msleep(1);
4979         while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
4980                 msleep(1);
4981
4982         hrtimer_cancel(&n->timer);
4983
4984         clear_bit(NAPI_STATE_DISABLE, &n->state);
4985 }
4986 EXPORT_SYMBOL(napi_disable);
4987
4988 /* Must be called in process context */
4989 void netif_napi_del(struct napi_struct *napi)
4990 {
4991         might_sleep();
4992         if (napi_hash_del(napi))
4993                 synchronize_net();
4994         list_del_init(&napi->dev_list);
4995         napi_free_frags(napi);
4996
4997         kfree_skb_list(napi->gro_list);
4998         napi->gro_list = NULL;
4999         napi->gro_count = 0;
5000 }
5001 EXPORT_SYMBOL(netif_napi_del);
5002
5003 static int napi_poll(struct napi_struct *n, struct list_head *repoll)
5004 {
5005         void *have;
5006         int work, weight;
5007
5008         list_del_init(&n->poll_list);
5009
5010         have = netpoll_poll_lock(n);
5011
5012         weight = n->weight;
5013
5014         /* This NAPI_STATE_SCHED test is for avoiding a race
5015          * with netpoll's poll_napi().  Only the entity which
5016          * obtains the lock and sees NAPI_STATE_SCHED set will
5017          * actually make the ->poll() call.  Therefore we avoid
5018          * accidentally calling ->poll() when NAPI is not scheduled.
5019          */
5020         work = 0;
5021         if (test_bit(NAPI_STATE_SCHED, &n->state)) {
5022                 work = n->poll(n, weight);
5023                 trace_napi_poll(n, work, weight);
5024         }
5025
5026         WARN_ON_ONCE(work > weight);
5027
5028         if (likely(work < weight))
5029                 goto out_unlock;
5030
5031         /* Drivers must not modify the NAPI state if they
5032          * consume the entire weight.  In such cases this code
5033          * still "owns" the NAPI instance and therefore can
5034          * move the instance around on the list at-will.
5035          */
5036         if (unlikely(napi_disable_pending(n))) {
5037                 napi_complete(n);
5038                 goto out_unlock;
5039         }
5040
5041         if (n->gro_list) {
5042                 /* flush too old packets
5043                  * If HZ < 1000, flush all packets.
5044                  */
5045                 napi_gro_flush(n, HZ >= 1000);
5046         }
5047
5048         /* Some drivers may have called napi_schedule
5049          * prior to exhausting their budget.
5050          */
5051         if (unlikely(!list_empty(&n->poll_list))) {
5052                 pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
5053                              n->dev ? n->dev->name : "backlog");
5054                 goto out_unlock;
5055         }
5056
5057         list_add_tail(&n->poll_list, repoll);
5058
5059 out_unlock:
5060         netpoll_poll_unlock(have);
5061
5062         return work;
5063 }
5064
5065 static void net_rx_action(struct softirq_action *h)
5066 {
5067         struct softnet_data *sd = this_cpu_ptr(&softnet_data);
5068         unsigned long time_limit = jiffies + 2;
5069         int budget = netdev_budget;
5070         LIST_HEAD(list);
5071         LIST_HEAD(repoll);
5072
5073         local_irq_disable();
5074         list_splice_init(&sd->poll_list, &list);
5075         local_irq_enable();
5076
5077         for (;;) {
5078                 struct napi_struct *n;
5079
5080                 if (list_empty(&list)) {
5081                         if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
5082                                 return;
5083                         break;
5084                 }
5085
5086                 n = list_first_entry(&list, struct napi_struct, poll_list);
5087                 budget -= napi_poll(n, &repoll);
5088
5089                 /* If softirq window is exhausted then punt.
5090                  * Allow this to run for 2 jiffies since which will allow
5091                  * an average latency of 1.5/HZ.
5092                  */
5093                 if (unlikely(budget <= 0 ||
5094                              time_after_eq(jiffies, time_limit))) {
5095                         sd->time_squeeze++;
5096                         break;
5097                 }
5098         }
5099
5100         __kfree_skb_flush();
5101         local_irq_disable();
5102
5103         list_splice_tail_init(&sd->poll_list, &list);
5104         list_splice_tail(&repoll, &list);
5105         list_splice(&list, &sd->poll_list);
5106         if (!list_empty(&sd->poll_list))
5107                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
5108
5109         net_rps_action_and_irq_enable(sd);
5110 }
5111
5112 struct netdev_adjacent {
5113         struct net_device *dev;
5114
5115         /* upper master flag, there can only be one master device per list */
5116         bool master;
5117
5118         /* counter for the number of times this device was added to us */
5119         u16 ref_nr;
5120
5121         /* private field for the users */
5122         void *private;
5123
5124         struct list_head list;
5125         struct rcu_head rcu;
5126 };
5127
5128 static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
5129                                                  struct list_head *adj_list)
5130 {
5131         struct netdev_adjacent *adj;
5132
5133         list_for_each_entry(adj, adj_list, list) {
5134                 if (adj->dev == adj_dev)
5135                         return adj;
5136         }
5137         return NULL;
5138 }
5139
5140 static int __netdev_has_upper_dev(struct net_device *upper_dev, void *data)
5141 {
5142         struct net_device *dev = data;
5143
5144         return upper_dev == dev;
5145 }
5146
5147 /**
5148  * netdev_has_upper_dev - Check if device is linked to an upper device
5149  * @dev: device
5150  * @upper_dev: upper device to check
5151  *
5152  * Find out if a device is linked to specified upper device and return true
5153  * in case it is. Note that this checks only immediate upper device,
5154  * not through a complete stack of devices. The caller must hold the RTNL lock.
5155  */
5156 bool netdev_has_upper_dev(struct net_device *dev,
5157                           struct net_device *upper_dev)
5158 {
5159         ASSERT_RTNL();
5160
5161         return netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5162                                              upper_dev);
5163 }
5164 EXPORT_SYMBOL(netdev_has_upper_dev);
5165
5166 /**
5167  * netdev_has_upper_dev_all - Check if device is linked to an upper device
5168  * @dev: device
5169  * @upper_dev: upper device to check
5170  *
5171  * Find out if a device is linked to specified upper device and return true
5172  * in case it is. Note that this checks the entire upper device chain.
5173  * The caller must hold rcu lock.
5174  */
5175
5176 bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
5177                                   struct net_device *upper_dev)
5178 {
5179         return !!netdev_walk_all_upper_dev_rcu(dev, __netdev_has_upper_dev,
5180                                                upper_dev);
5181 }
5182 EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
5183
5184 /**
5185  * netdev_has_any_upper_dev - Check if device is linked to some device
5186  * @dev: device
5187  *
5188  * Find out if a device is linked to an upper device and return true in case
5189  * it is. The caller must hold the RTNL lock.
5190  */
5191 static bool netdev_has_any_upper_dev(struct net_device *dev)
5192 {
5193         ASSERT_RTNL();
5194
5195         return !list_empty(&dev->adj_list.upper);
5196 }
5197
5198 /**
5199  * netdev_master_upper_dev_get - Get master upper device
5200  * @dev: device
5201  *
5202  * Find a master upper device and return pointer to it or NULL in case
5203  * it's not there. The caller must hold the RTNL lock.
5204  */
5205 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
5206 {
5207         struct netdev_adjacent *upper;
5208
5209         ASSERT_RTNL();
5210
5211         if (list_empty(&dev->adj_list.upper))
5212                 return NULL;
5213
5214         upper = list_first_entry(&dev->adj_list.upper,
5215                                  struct netdev_adjacent, list);
5216         if (likely(upper->master))
5217                 return upper->dev;
5218         return NULL;
5219 }
5220 EXPORT_SYMBOL(netdev_master_upper_dev_get);
5221
5222 /**
5223  * netdev_has_any_lower_dev - Check if device is linked to some device
5224  * @dev: device
5225  *
5226  * Find out if a device is linked to a lower device and return true in case
5227  * it is. The caller must hold the RTNL lock.
5228  */
5229 static bool netdev_has_any_lower_dev(struct net_device *dev)
5230 {
5231         ASSERT_RTNL();
5232
5233         return !list_empty(&dev->adj_list.lower);
5234 }
5235
5236 void *netdev_adjacent_get_private(struct list_head *adj_list)
5237 {
5238         struct netdev_adjacent *adj;
5239
5240         adj = list_entry(adj_list, struct netdev_adjacent, list);
5241
5242         return adj->private;
5243 }
5244 EXPORT_SYMBOL(netdev_adjacent_get_private);
5245
5246 /**
5247  * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
5248  * @dev: device
5249  * @iter: list_head ** of the current position
5250  *
5251  * Gets the next device from the dev's upper list, starting from iter
5252  * position. The caller must hold RCU read lock.
5253  */
5254 struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
5255                                                  struct list_head **iter)
5256 {
5257         struct netdev_adjacent *upper;
5258
5259         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5260
5261         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5262
5263         if (&upper->list == &dev->adj_list.upper)
5264                 return NULL;
5265
5266         *iter = &upper->list;
5267
5268         return upper->dev;
5269 }
5270 EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
5271
5272 static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
5273                                                     struct list_head **iter)
5274 {
5275         struct netdev_adjacent *upper;
5276
5277         WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
5278
5279         upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5280
5281         if (&upper->list == &dev->adj_list.upper)
5282                 return NULL;
5283
5284         *iter = &upper->list;
5285
5286         return upper->dev;
5287 }
5288
5289 int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
5290                                   int (*fn)(struct net_device *dev,
5291                                             void *data),
5292                                   void *data)
5293 {
5294         struct net_device *udev;
5295         struct list_head *iter;
5296         int ret;
5297
5298         for (iter = &dev->adj_list.upper,
5299              udev = netdev_next_upper_dev_rcu(dev, &iter);
5300              udev;
5301              udev = netdev_next_upper_dev_rcu(dev, &iter)) {
5302                 /* first is the upper device itself */
5303                 ret = fn(udev, data);
5304                 if (ret)
5305                         return ret;
5306
5307                 /* then look at all of its upper devices */
5308                 ret = netdev_walk_all_upper_dev_rcu(udev, fn, data);
5309                 if (ret)
5310                         return ret;
5311         }
5312
5313         return 0;
5314 }
5315 EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
5316
5317 /**
5318  * netdev_lower_get_next_private - Get the next ->private from the
5319  *                                 lower neighbour list
5320  * @dev: device
5321  * @iter: list_head ** of the current position
5322  *
5323  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5324  * list, starting from iter position. The caller must hold either hold the
5325  * RTNL lock or its own locking that guarantees that the neighbour lower
5326  * list will remain unchanged.
5327  */
5328 void *netdev_lower_get_next_private(struct net_device *dev,
5329                                     struct list_head **iter)
5330 {
5331         struct netdev_adjacent *lower;
5332
5333         lower = list_entry(*iter, struct netdev_adjacent, list);
5334
5335         if (&lower->list == &dev->adj_list.lower)
5336                 return NULL;
5337
5338         *iter = lower->list.next;
5339
5340         return lower->private;
5341 }
5342 EXPORT_SYMBOL(netdev_lower_get_next_private);
5343
5344 /**
5345  * netdev_lower_get_next_private_rcu - Get the next ->private from the
5346  *                                     lower neighbour list, RCU
5347  *                                     variant
5348  * @dev: device
5349  * @iter: list_head ** of the current position
5350  *
5351  * Gets the next netdev_adjacent->private from the dev's lower neighbour
5352  * list, starting from iter position. The caller must hold RCU read lock.
5353  */
5354 void *netdev_lower_get_next_private_rcu(struct net_device *dev,
5355                                         struct list_head **iter)
5356 {
5357         struct netdev_adjacent *lower;
5358
5359         WARN_ON_ONCE(!rcu_read_lock_held());
5360
5361         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5362
5363         if (&lower->list == &dev->adj_list.lower)
5364                 return NULL;
5365
5366         *iter = &lower->list;
5367
5368         return lower->private;
5369 }
5370 EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
5371
5372 /**
5373  * netdev_lower_get_next - Get the next device from the lower neighbour
5374  *                         list
5375  * @dev: device
5376  * @iter: list_head ** of the current position
5377  *
5378  * Gets the next netdev_adjacent from the dev's lower neighbour
5379  * list, starting from iter position. The caller must hold RTNL lock or
5380  * its own locking that guarantees that the neighbour lower
5381  * list will remain unchanged.
5382  */
5383 void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
5384 {
5385         struct netdev_adjacent *lower;
5386
5387         lower = list_entry(*iter, struct netdev_adjacent, list);
5388
5389         if (&lower->list == &dev->adj_list.lower)
5390                 return NULL;
5391
5392         *iter = lower->list.next;
5393
5394         return lower->dev;
5395 }
5396 EXPORT_SYMBOL(netdev_lower_get_next);
5397
5398 static struct net_device *netdev_next_lower_dev(struct net_device *dev,
5399                                                 struct list_head **iter)
5400 {
5401         struct netdev_adjacent *lower;
5402
5403         lower = list_entry((*iter)->next, struct netdev_adjacent, list);
5404
5405         if (&lower->list == &dev->adj_list.lower)
5406                 return NULL;
5407
5408         *iter = &lower->list;
5409
5410         return lower->dev;
5411 }
5412
5413 int netdev_walk_all_lower_dev(struct net_device *dev,
5414                               int (*fn)(struct net_device *dev,
5415                                         void *data),
5416                               void *data)
5417 {
5418         struct net_device *ldev;
5419         struct list_head *iter;
5420         int ret;
5421
5422         for (iter = &dev->adj_list.lower,
5423              ldev = netdev_next_lower_dev(dev, &iter);
5424              ldev;
5425              ldev = netdev_next_lower_dev(dev, &iter)) {
5426                 /* first is the lower device itself */
5427                 ret = fn(ldev, data);
5428                 if (ret)
5429                         return ret;
5430
5431                 /* then look at all of its lower devices */
5432                 ret = netdev_walk_all_lower_dev(ldev, fn, data);
5433                 if (ret)
5434                         return ret;
5435         }
5436
5437         return 0;
5438 }
5439 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
5440
5441 static struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
5442                                                     struct list_head **iter)
5443 {
5444         struct netdev_adjacent *lower;
5445
5446         lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
5447         if (&lower->list == &dev->adj_list.lower)
5448                 return NULL;
5449
5450         *iter = &lower->list;
5451
5452         return lower->dev;
5453 }
5454
5455 int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
5456                                   int (*fn)(struct net_device *dev,
5457                                             void *data),
5458                                   void *data)
5459 {
5460         struct net_device *ldev;
5461         struct list_head *iter;
5462         int ret;
5463
5464         for (iter = &dev->adj_list.lower,
5465              ldev = netdev_next_lower_dev_rcu(dev, &iter);
5466              ldev;
5467              ldev = netdev_next_lower_dev_rcu(dev, &iter)) {
5468                 /* first is the lower device itself */
5469                 ret = fn(ldev, data);
5470                 if (ret)
5471                         return ret;
5472
5473                 /* then look at all of its lower devices */
5474                 ret = netdev_walk_all_lower_dev_rcu(ldev, fn, data);
5475                 if (ret)
5476                         return ret;
5477         }
5478
5479         return 0;
5480 }
5481 EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
5482
5483 /**
5484  * netdev_lower_get_first_private_rcu - Get the first ->private from the
5485  *                                     lower neighbour list, RCU
5486  *                                     variant
5487  * @dev: device
5488  *
5489  * Gets the first netdev_adjacent->private from the dev's lower neighbour
5490  * list. The caller must hold RCU read lock.
5491  */
5492 void *netdev_lower_get_first_private_rcu(struct net_device *dev)
5493 {
5494         struct netdev_adjacent *lower;
5495
5496         lower = list_first_or_null_rcu(&dev->adj_list.lower,
5497                         struct netdev_adjacent, list);
5498         if (lower)
5499                 return lower->private;
5500         return NULL;
5501 }
5502 EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
5503
5504 /**
5505  * netdev_master_upper_dev_get_rcu - Get master upper device
5506  * @dev: device
5507  *
5508  * Find a master upper device and return pointer to it or NULL in case
5509  * it's not there. The caller must hold the RCU read lock.
5510  */
5511 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
5512 {
5513         struct netdev_adjacent *upper;
5514
5515         upper = list_first_or_null_rcu(&dev->adj_list.upper,
5516                                        struct netdev_adjacent, list);
5517         if (upper && likely(upper->master))
5518                 return upper->dev;
5519         return NULL;
5520 }
5521 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
5522
5523 static int netdev_adjacent_sysfs_add(struct net_device *dev,
5524                               struct net_device *adj_dev,
5525                               struct list_head *dev_list)
5526 {
5527         char linkname[IFNAMSIZ+7];
5528         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5529                 "upper_%s" : "lower_%s", adj_dev->name);
5530         return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
5531                                  linkname);
5532 }
5533 static void netdev_adjacent_sysfs_del(struct net_device *dev,
5534                                char *name,
5535                                struct list_head *dev_list)
5536 {
5537         char linkname[IFNAMSIZ+7];
5538         sprintf(linkname, dev_list == &dev->adj_list.upper ?
5539                 "upper_%s" : "lower_%s", name);
5540         sysfs_remove_link(&(dev->dev.kobj), linkname);
5541 }
5542
5543 static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
5544                                                  struct net_device *adj_dev,
5545                                                  struct list_head *dev_list)
5546 {
5547         return (dev_list == &dev->adj_list.upper ||
5548                 dev_list == &dev->adj_list.lower) &&
5549                 net_eq(dev_net(dev), dev_net(adj_dev));
5550 }
5551
5552 static int __netdev_adjacent_dev_insert(struct net_device *dev,
5553                                         struct net_device *adj_dev,
5554                                         struct list_head *dev_list,
5555                                         void *private, bool master)
5556 {
5557         struct netdev_adjacent *adj;
5558         int ret;
5559
5560         adj = __netdev_find_adj(adj_dev, dev_list);
5561
5562         if (adj) {
5563                 adj->ref_nr += 1;
5564                 pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
5565                          dev->name, adj_dev->name, adj->ref_nr);
5566
5567                 return 0;
5568         }
5569
5570         adj = kmalloc(sizeof(*adj), GFP_KERNEL);
5571         if (!adj)
5572                 return -ENOMEM;
5573
5574         adj->dev = adj_dev;
5575         adj->master = master;
5576         adj->ref_nr = 1;
5577         adj->private = private;
5578         dev_hold(adj_dev);
5579
5580         pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
5581                  dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
5582
5583         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
5584                 ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
5585                 if (ret)
5586                         goto free_adj;
5587         }
5588
5589         /* Ensure that master link is always the first item in list. */
5590         if (master) {
5591                 ret = sysfs_create_link(&(dev->dev.kobj),
5592                                         &(adj_dev->dev.kobj), "master");
5593                 if (ret)
5594                         goto remove_symlinks;
5595
5596                 list_add_rcu(&adj->list, dev_list);
5597         } else {
5598                 list_add_tail_rcu(&adj->list, dev_list);
5599         }
5600
5601         return 0;
5602
5603 remove_symlinks:
5604         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5605                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5606 free_adj:
5607         kfree(adj);
5608         dev_put(adj_dev);
5609
5610         return ret;
5611 }
5612
5613 static void __netdev_adjacent_dev_remove(struct net_device *dev,
5614                                          struct net_device *adj_dev,
5615                                          u16 ref_nr,
5616                                          struct list_head *dev_list)
5617 {
5618         struct netdev_adjacent *adj;
5619
5620         pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
5621                  dev->name, adj_dev->name, ref_nr);
5622
5623         adj = __netdev_find_adj(adj_dev, dev_list);
5624
5625         if (!adj) {
5626                 pr_err("Adjacency does not exist for device %s from %s\n",
5627                        dev->name, adj_dev->name);
5628                 WARN_ON(1);
5629                 return;
5630         }
5631
5632         if (adj->ref_nr > ref_nr) {
5633                 pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
5634                          dev->name, adj_dev->name, ref_nr,
5635                          adj->ref_nr - ref_nr);
5636                 adj->ref_nr -= ref_nr;
5637                 return;
5638         }
5639
5640         if (adj->master)
5641                 sysfs_remove_link(&(dev->dev.kobj), "master");
5642
5643         if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
5644                 netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
5645
5646         list_del_rcu(&adj->list);
5647         pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
5648                  adj_dev->name, dev->name, adj_dev->name);
5649         dev_put(adj_dev);
5650         kfree_rcu(adj, rcu);
5651 }
5652
5653 static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
5654                                             struct net_device *upper_dev,
5655                                             struct list_head *up_list,
5656                                             struct list_head *down_list,
5657                                             void *private, bool master)
5658 {
5659         int ret;
5660
5661         ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
5662                                            private, master);
5663         if (ret)
5664                 return ret;
5665
5666         ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
5667                                            private, false);
5668         if (ret) {
5669                 __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
5670                 return ret;
5671         }
5672
5673         return 0;
5674 }
5675
5676 static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
5677                                                struct net_device *upper_dev,
5678                                                u16 ref_nr,
5679                                                struct list_head *up_list,
5680                                                struct list_head *down_list)
5681 {
5682         __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
5683         __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
5684 }
5685
5686 static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
5687                                                 struct net_device *upper_dev,
5688                                                 void *private, bool master)
5689 {
5690         return __netdev_adjacent_dev_link_lists(dev, upper_dev,
5691                                                 &dev->adj_list.upper,
5692                                                 &upper_dev->adj_list.lower,
5693                                                 private, master);
5694 }
5695
5696 static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
5697                                                    struct net_device *upper_dev)
5698 {
5699         __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
5700                                            &dev->adj_list.upper,
5701                                            &upper_dev->adj_list.lower);
5702 }
5703
5704 static int __netdev_upper_dev_link(struct net_device *dev,
5705                                    struct net_device *upper_dev, bool master,
5706                                    void *upper_priv, void *upper_info)
5707 {
5708         struct netdev_notifier_changeupper_info changeupper_info;
5709         int ret = 0;
5710
5711         ASSERT_RTNL();
5712
5713         if (dev == upper_dev)
5714                 return -EBUSY;
5715
5716         /* To prevent loops, check if dev is not upper device to upper_dev. */
5717         if (netdev_has_upper_dev(upper_dev, dev))
5718                 return -EBUSY;
5719
5720         if (netdev_has_upper_dev(dev, upper_dev))
5721                 return -EEXIST;
5722
5723         if (master && netdev_master_upper_dev_get(dev))
5724                 return -EBUSY;
5725
5726         changeupper_info.upper_dev = upper_dev;
5727         changeupper_info.master = master;
5728         changeupper_info.linking = true;
5729         changeupper_info.upper_info = upper_info;
5730
5731         ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5732                                             &changeupper_info.info);
5733         ret = notifier_to_errno(ret);
5734         if (ret)
5735                 return ret;
5736
5737         ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
5738                                                    master);
5739         if (ret)
5740                 return ret;
5741
5742         ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5743                                             &changeupper_info.info);
5744         ret = notifier_to_errno(ret);
5745         if (ret)
5746                 goto rollback;
5747
5748         return 0;
5749
5750 rollback:
5751         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5752
5753         return ret;
5754 }
5755
5756 /**
5757  * netdev_upper_dev_link - Add a link to the upper device
5758  * @dev: device
5759  * @upper_dev: new upper device
5760  *
5761  * Adds a link to device which is upper to this one. The caller must hold
5762  * the RTNL lock. On a failure a negative errno code is returned.
5763  * On success the reference counts are adjusted and the function
5764  * returns zero.
5765  */
5766 int netdev_upper_dev_link(struct net_device *dev,
5767                           struct net_device *upper_dev)
5768 {
5769         return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
5770 }
5771 EXPORT_SYMBOL(netdev_upper_dev_link);
5772
5773 /**
5774  * netdev_master_upper_dev_link - Add a master link to the upper device
5775  * @dev: device
5776  * @upper_dev: new upper device
5777  * @upper_priv: upper device private
5778  * @upper_info: upper info to be passed down via notifier
5779  *
5780  * Adds a link to device which is upper to this one. In this case, only
5781  * one master upper device can be linked, although other non-master devices
5782  * might be linked as well. The caller must hold the RTNL lock.
5783  * On a failure a negative errno code is returned. On success the reference
5784  * counts are adjusted and the function returns zero.
5785  */
5786 int netdev_master_upper_dev_link(struct net_device *dev,
5787                                  struct net_device *upper_dev,
5788                                  void *upper_priv, void *upper_info)
5789 {
5790         return __netdev_upper_dev_link(dev, upper_dev, true,
5791                                        upper_priv, upper_info);
5792 }
5793 EXPORT_SYMBOL(netdev_master_upper_dev_link);
5794
5795 /**
5796  * netdev_upper_dev_unlink - Removes a link to upper device
5797  * @dev: device
5798  * @upper_dev: new upper device
5799  *
5800  * Removes a link to device which is upper to this one. The caller must hold
5801  * the RTNL lock.
5802  */
5803 void netdev_upper_dev_unlink(struct net_device *dev,
5804                              struct net_device *upper_dev)
5805 {
5806         struct netdev_notifier_changeupper_info changeupper_info;
5807         ASSERT_RTNL();
5808
5809         changeupper_info.upper_dev = upper_dev;
5810         changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
5811         changeupper_info.linking = false;
5812
5813         call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
5814                                       &changeupper_info.info);
5815
5816         __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
5817
5818         call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
5819                                       &changeupper_info.info);
5820 }
5821 EXPORT_SYMBOL(netdev_upper_dev_unlink);
5822
5823 /**
5824  * netdev_bonding_info_change - Dispatch event about slave change
5825  * @dev: device
5826  * @bonding_info: info to dispatch
5827  *
5828  * Send NETDEV_BONDING_INFO to netdev notifiers with info.
5829  * The caller must hold the RTNL lock.
5830  */
5831 void netdev_bonding_info_change(struct net_device *dev,
5832                                 struct netdev_bonding_info *bonding_info)
5833 {
5834         struct netdev_notifier_bonding_info     info;
5835
5836         memcpy(&info.bonding_info, bonding_info,
5837                sizeof(struct netdev_bonding_info));
5838         call_netdevice_notifiers_info(NETDEV_BONDING_INFO, dev,
5839                                       &info.info);
5840 }
5841 EXPORT_SYMBOL(netdev_bonding_info_change);
5842
5843 static void netdev_adjacent_add_links(struct net_device *dev)
5844 {
5845         struct netdev_adjacent *iter;
5846
5847         struct net *net = dev_net(dev);
5848
5849         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5850                 if (!net_eq(net, dev_net(iter->dev)))
5851                         continue;
5852                 netdev_adjacent_sysfs_add(iter->dev, dev,
5853                                           &iter->dev->adj_list.lower);
5854                 netdev_adjacent_sysfs_add(dev, iter->dev,
5855                                           &dev->adj_list.upper);
5856         }
5857
5858         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5859                 if (!net_eq(net, dev_net(iter->dev)))
5860                         continue;
5861                 netdev_adjacent_sysfs_add(iter->dev, dev,
5862                                           &iter->dev->adj_list.upper);
5863                 netdev_adjacent_sysfs_add(dev, iter->dev,
5864                                           &dev->adj_list.lower);
5865         }
5866 }
5867
5868 static void netdev_adjacent_del_links(struct net_device *dev)
5869 {
5870         struct netdev_adjacent *iter;
5871
5872         struct net *net = dev_net(dev);
5873
5874         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5875                 if (!net_eq(net, dev_net(iter->dev)))
5876                         continue;
5877                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5878                                           &iter->dev->adj_list.lower);
5879                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5880                                           &dev->adj_list.upper);
5881         }
5882
5883         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5884                 if (!net_eq(net, dev_net(iter->dev)))
5885                         continue;
5886                 netdev_adjacent_sysfs_del(iter->dev, dev->name,
5887                                           &iter->dev->adj_list.upper);
5888                 netdev_adjacent_sysfs_del(dev, iter->dev->name,
5889                                           &dev->adj_list.lower);
5890         }
5891 }
5892
5893 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
5894 {
5895         struct netdev_adjacent *iter;
5896
5897         struct net *net = dev_net(dev);
5898
5899         list_for_each_entry(iter, &dev->adj_list.upper, list) {
5900                 if (!net_eq(net, dev_net(iter->dev)))
5901                         continue;
5902                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5903                                           &iter->dev->adj_list.lower);
5904                 netdev_adjacent_sysfs_add(iter->dev, dev,
5905                                           &iter->dev->adj_list.lower);
5906         }
5907
5908         list_for_each_entry(iter, &dev->adj_list.lower, list) {
5909                 if (!net_eq(net, dev_net(iter->dev)))
5910                         continue;
5911                 netdev_adjacent_sysfs_del(iter->dev, oldname,
5912                                           &iter->dev->adj_list.upper);
5913                 netdev_adjacent_sysfs_add(iter->dev, dev,
5914                                           &iter->dev->adj_list.upper);
5915         }
5916 }
5917
5918 void *netdev_lower_dev_get_private(struct net_device *dev,
5919                                    struct net_device *lower_dev)
5920 {
5921         struct netdev_adjacent *lower;
5922
5923         if (!lower_dev)
5924                 return NULL;
5925         lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
5926         if (!lower)
5927                 return NULL;
5928
5929         return lower->private;
5930 }
5931 EXPORT_SYMBOL(netdev_lower_dev_get_private);
5932
5933
5934 int dev_get_nest_level(struct net_device *dev)
5935 {
5936         struct net_device *lower = NULL;
5937         struct list_head *iter;
5938         int max_nest = -1;
5939         int nest;
5940
5941         ASSERT_RTNL();
5942
5943         netdev_for_each_lower_dev(dev, lower, iter) {
5944                 nest = dev_get_nest_level(lower);
5945                 if (max_nest < nest)
5946                         max_nest = nest;
5947         }
5948
5949         return max_nest + 1;
5950 }
5951 EXPORT_SYMBOL(dev_get_nest_level);
5952
5953 /**
5954  * netdev_lower_change - Dispatch event about lower device state change
5955  * @lower_dev: device
5956  * @lower_state_info: state to dispatch
5957  *
5958  * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
5959  * The caller must hold the RTNL lock.
5960  */
5961 void netdev_lower_state_changed(struct net_device *lower_dev,
5962                                 void *lower_state_info)
5963 {
5964         struct netdev_notifier_changelowerstate_info changelowerstate_info;
5965
5966         ASSERT_RTNL();
5967         changelowerstate_info.lower_state_info = lower_state_info;
5968         call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
5969                                       &changelowerstate_info.info);
5970 }
5971 EXPORT_SYMBOL(netdev_lower_state_changed);
5972
5973 int netdev_default_l2upper_neigh_construct(struct net_device *dev,
5974                                            struct neighbour *n)
5975 {
5976         struct net_device *lower_dev, *stop_dev;
5977         struct list_head *iter;
5978         int err;
5979
5980         netdev_for_each_lower_dev(dev, lower_dev, iter) {
5981                 if (!lower_dev->netdev_ops->ndo_neigh_construct)
5982                         continue;
5983                 err = lower_dev->netdev_ops->ndo_neigh_construct(lower_dev, n);
5984                 if (err) {
5985                         stop_dev = lower_dev;
5986                         goto rollback;
5987                 }
5988         }
5989         return 0;
5990
5991 rollback:
5992         netdev_for_each_lower_dev(dev, lower_dev, iter) {
5993                 if (lower_dev == stop_dev)
5994                         break;
5995                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
5996                         continue;
5997                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
5998         }
5999         return err;
6000 }
6001 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_construct);
6002
6003 void netdev_default_l2upper_neigh_destroy(struct net_device *dev,
6004                                           struct neighbour *n)
6005 {
6006         struct net_device *lower_dev;
6007         struct list_head *iter;
6008
6009         netdev_for_each_lower_dev(dev, lower_dev, iter) {
6010                 if (!lower_dev->netdev_ops->ndo_neigh_destroy)
6011                         continue;
6012                 lower_dev->netdev_ops->ndo_neigh_destroy(lower_dev, n);
6013         }
6014 }
6015 EXPORT_SYMBOL_GPL(netdev_default_l2upper_neigh_destroy);
6016
6017 static void dev_change_rx_flags(struct net_device *dev, int flags)
6018 {
6019         const struct net_device_ops *ops = dev->netdev_ops;
6020
6021         if (ops->ndo_change_rx_flags)
6022                 ops->ndo_change_rx_flags(dev, flags);
6023 }
6024
6025 static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
6026 {
6027         unsigned int old_flags = dev->flags;
6028         kuid_t uid;
6029         kgid_t gid;
6030
6031         ASSERT_RTNL();
6032
6033         dev->flags |= IFF_PROMISC;
6034         dev->promiscuity += inc;
6035         if (dev->promiscuity == 0) {
6036                 /*
6037                  * Avoid overflow.
6038                  * If inc causes overflow, untouch promisc and return error.
6039                  */
6040                 if (inc < 0)
6041                         dev->flags &= ~IFF_PROMISC;
6042                 else {
6043                         dev->promiscuity -= inc;
6044                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
6045                                 dev->name);
6046                         return -EOVERFLOW;
6047                 }
6048         }
6049         if (dev->flags != old_flags) {
6050                 pr_info("device %s %s promiscuous mode\n",
6051                         dev->name,
6052                         dev->flags & IFF_PROMISC ? "entered" : "left");
6053                 if (audit_enabled) {
6054                         current_uid_gid(&uid, &gid);
6055                         audit_log(current->audit_context, GFP_ATOMIC,
6056                                 AUDIT_ANOM_PROMISCUOUS,
6057                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
6058                                 dev->name, (dev->flags & IFF_PROMISC),
6059                                 (old_flags & IFF_PROMISC),
6060                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
6061                                 from_kuid(&init_user_ns, uid),
6062                                 from_kgid(&init_user_ns, gid),
6063                                 audit_get_sessionid(current));
6064                 }
6065
6066                 dev_change_rx_flags(dev, IFF_PROMISC);
6067         }
6068         if (notify)
6069                 __dev_notify_flags(dev, old_flags, IFF_PROMISC);
6070         return 0;
6071 }
6072
6073 /**
6074  *      dev_set_promiscuity     - update promiscuity count on a device
6075  *      @dev: device
6076  *      @inc: modifier
6077  *
6078  *      Add or remove promiscuity from a device. While the count in the device
6079  *      remains above zero the interface remains promiscuous. Once it hits zero
6080  *      the device reverts back to normal filtering operation. A negative inc
6081  *      value is used to drop promiscuity on the device.
6082  *      Return 0 if successful or a negative errno code on error.
6083  */
6084 int dev_set_promiscuity(struct net_device *dev, int inc)
6085 {
6086         unsigned int old_flags = dev->flags;
6087         int err;
6088
6089         err = __dev_set_promiscuity(dev, inc, true);
6090         if (err < 0)
6091                 return err;
6092         if (dev->flags != old_flags)
6093                 dev_set_rx_mode(dev);
6094         return err;
6095 }
6096 EXPORT_SYMBOL(dev_set_promiscuity);
6097
6098 static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
6099 {
6100         unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
6101
6102         ASSERT_RTNL();
6103
6104         dev->flags |= IFF_ALLMULTI;
6105         dev->allmulti += inc;
6106         if (dev->allmulti == 0) {
6107                 /*
6108                  * Avoid overflow.
6109                  * If inc causes overflow, untouch allmulti and return error.
6110                  */
6111                 if (inc < 0)
6112                         dev->flags &= ~IFF_ALLMULTI;
6113                 else {
6114                         dev->allmulti -= inc;
6115                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
6116                                 dev->name);
6117                         return -EOVERFLOW;
6118                 }
6119         }
6120         if (dev->flags ^ old_flags) {
6121                 dev_change_rx_flags(dev, IFF_ALLMULTI);
6122                 dev_set_rx_mode(dev);
6123                 if (notify)
6124                         __dev_notify_flags(dev, old_flags,
6125                                            dev->gflags ^ old_gflags);
6126         }
6127         return 0;
6128 }
6129
6130 /**
6131  *      dev_set_allmulti        - update allmulti count on a device
6132  *      @dev: device
6133  *      @inc: modifier
6134  *
6135  *      Add or remove reception of all multicast frames to a device. While the
6136  *      count in the device remains above zero the interface remains listening
6137  *      to all interfaces. Once it hits zero the device reverts back to normal
6138  *      filtering operation. A negative @inc value is used to drop the counter
6139  *      when releasing a resource needing all multicasts.
6140  *      Return 0 if successful or a negative errno code on error.
6141  */
6142
6143 int dev_set_allmulti(struct net_device *dev, int inc)
6144 {
6145         return __dev_set_allmulti(dev, inc, true);
6146 }
6147 EXPORT_SYMBOL(dev_set_allmulti);
6148
6149 /*
6150  *      Upload unicast and multicast address lists to device and
6151  *      configure RX filtering. When the device doesn't support unicast
6152  *      filtering it is put in promiscuous mode while unicast addresses
6153  *      are present.
6154  */
6155 void __dev_set_rx_mode(struct net_device *dev)
6156 {
6157         const struct net_device_ops *ops = dev->netdev_ops;
6158
6159         /* dev_open will call this function so the list will stay sane. */
6160         if (!(dev->flags&IFF_UP))
6161                 return;
6162
6163         if (!netif_device_present(dev))
6164                 return;
6165
6166         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
6167                 /* Unicast addresses changes may only happen under the rtnl,
6168                  * therefore calling __dev_set_promiscuity here is safe.
6169                  */
6170                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
6171                         __dev_set_promiscuity(dev, 1, false);
6172                         dev->uc_promisc = true;
6173                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
6174                         __dev_set_promiscuity(dev, -1, false);
6175                         dev->uc_promisc = false;
6176                 }
6177         }
6178
6179         if (ops->ndo_set_rx_mode)
6180                 ops->ndo_set_rx_mode(dev);
6181 }
6182
6183 void dev_set_rx_mode(struct net_device *dev)
6184 {
6185         netif_addr_lock_bh(dev);
6186         __dev_set_rx_mode(dev);
6187         netif_addr_unlock_bh(dev);
6188 }
6189
6190 /**
6191  *      dev_get_flags - get flags reported to userspace
6192  *      @dev: device
6193  *
6194  *      Get the combination of flag bits exported through APIs to userspace.
6195  */
6196 unsigned int dev_get_flags(const struct net_device *dev)
6197 {
6198         unsigned int flags;
6199
6200         flags = (dev->flags & ~(IFF_PROMISC |
6201                                 IFF_ALLMULTI |
6202                                 IFF_RUNNING |
6203                                 IFF_LOWER_UP |
6204                                 IFF_DORMANT)) |
6205                 (dev->gflags & (IFF_PROMISC |
6206                                 IFF_ALLMULTI));
6207
6208         if (netif_running(dev)) {
6209                 if (netif_oper_up(dev))
6210                         flags |= IFF_RUNNING;
6211                 if (netif_carrier_ok(dev))
6212                         flags |= IFF_LOWER_UP;
6213                 if (netif_dormant(dev))
6214                         flags |= IFF_DORMANT;
6215         }
6216
6217         return flags;
6218 }
6219 EXPORT_SYMBOL(dev_get_flags);
6220
6221 int __dev_change_flags(struct net_device *dev, unsigned int flags)
6222 {
6223         unsigned int old_flags = dev->flags;
6224         int ret;
6225
6226         ASSERT_RTNL();
6227
6228         /*
6229          *      Set the flags on our device.
6230          */
6231
6232         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
6233                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
6234                                IFF_AUTOMEDIA)) |
6235                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
6236                                     IFF_ALLMULTI));
6237
6238         /*
6239          *      Load in the correct multicast list now the flags have changed.
6240          */
6241
6242         if ((old_flags ^ flags) & IFF_MULTICAST)
6243                 dev_change_rx_flags(dev, IFF_MULTICAST);
6244
6245         dev_set_rx_mode(dev);
6246
6247         /*
6248          *      Have we downed the interface. We handle IFF_UP ourselves
6249          *      according to user attempts to set it, rather than blindly
6250          *      setting it.
6251          */
6252
6253         ret = 0;
6254         if ((old_flags ^ flags) & IFF_UP)
6255                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
6256
6257         if ((flags ^ dev->gflags) & IFF_PROMISC) {
6258                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
6259                 unsigned int old_flags = dev->flags;
6260
6261                 dev->gflags ^= IFF_PROMISC;
6262
6263                 if (__dev_set_promiscuity(dev, inc, false) >= 0)
6264                         if (dev->flags != old_flags)
6265                                 dev_set_rx_mode(dev);
6266         }
6267
6268         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
6269            is important. Some (broken) drivers set IFF_PROMISC, when
6270            IFF_ALLMULTI is requested not asking us and not reporting.
6271          */
6272         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
6273                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
6274
6275                 dev->gflags ^= IFF_ALLMULTI;
6276                 __dev_set_allmulti(dev, inc, false);
6277         }
6278
6279         return ret;
6280 }
6281
6282 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
6283                         unsigned int gchanges)
6284 {
6285         unsigned int changes = dev->flags ^ old_flags;
6286
6287         if (gchanges)
6288                 rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
6289
6290         if (changes & IFF_UP) {
6291                 if (dev->flags & IFF_UP)
6292                         call_netdevice_notifiers(NETDEV_UP, dev);
6293                 else
6294                         call_netdevice_notifiers(NETDEV_DOWN, dev);
6295         }
6296
6297         if (dev->flags & IFF_UP &&
6298             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
6299                 struct netdev_notifier_change_info change_info;
6300
6301                 change_info.flags_changed = changes;
6302                 call_netdevice_notifiers_info(NETDEV_CHANGE, dev,
6303                                               &change_info.info);
6304         }
6305 }
6306
6307 /**
6308  *      dev_change_flags - change device settings
6309  *      @dev: device
6310  *      @flags: device state flags
6311  *
6312  *      Change settings on device based state flags. The flags are
6313  *      in the userspace exported format.
6314  */
6315 int dev_change_flags(struct net_device *dev, unsigned int flags)
6316 {
6317         int ret;
6318         unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
6319
6320         ret = __dev_change_flags(dev, flags);
6321         if (ret < 0)
6322                 return ret;
6323
6324         changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
6325         __dev_notify_flags(dev, old_flags, changes);
6326         return ret;
6327 }
6328 EXPORT_SYMBOL(dev_change_flags);
6329
6330 static int __dev_set_mtu(struct net_device *dev, int new_mtu)
6331 {
6332         const struct net_device_ops *ops = dev->netdev_ops;
6333
6334         if (ops->ndo_change_mtu)
6335                 return ops->ndo_change_mtu(dev, new_mtu);
6336
6337         dev->mtu = new_mtu;
6338         return 0;
6339 }
6340
6341 /**
6342  *      dev_set_mtu - Change maximum transfer unit
6343  *      @dev: device
6344  *      @new_mtu: new transfer unit
6345  *
6346  *      Change the maximum transfer size of the network device.
6347  */
6348 int dev_set_mtu(struct net_device *dev, int new_mtu)
6349 {
6350         int err, orig_mtu;
6351
6352         if (new_mtu == dev->mtu)
6353                 return 0;
6354
6355         /* MTU must be positive, and in range */
6356         if (new_mtu < 0 || new_mtu < dev->min_mtu) {
6357                 net_err_ratelimited("%s: Invalid MTU %d requested, hw min %d\n",
6358                                     dev->name, new_mtu, dev->min_mtu);
6359                 return -EINVAL;
6360         }
6361
6362         if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
6363                 net_err_ratelimited("%s: Invalid MTU %d requested, hw max %d\n",
6364                                     dev->name, new_mtu, dev->max_mtu);
6365                 return -EINVAL;
6366         }
6367
6368         if (!netif_device_present(dev))
6369                 return -ENODEV;
6370
6371         err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
6372         err = notifier_to_errno(err);
6373         if (err)
6374                 return err;
6375
6376         orig_mtu = dev->mtu;
6377         err = __dev_set_mtu(dev, new_mtu);
6378
6379         if (!err) {
6380                 err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6381                 err = notifier_to_errno(err);
6382                 if (err) {
6383                         /* setting mtu back and notifying everyone again,
6384                          * so that they have a chance to revert changes.
6385                          */
6386                         __dev_set_mtu(dev, orig_mtu);
6387                         call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
6388                 }
6389         }
6390         return err;
6391 }
6392 EXPORT_SYMBOL(dev_set_mtu);
6393
6394 /**
6395  *      dev_set_group - Change group this device belongs to
6396  *      @dev: device
6397  *      @new_group: group this device should belong to
6398  */
6399 void dev_set_group(struct net_device *dev, int new_group)
6400 {
6401         dev->group = new_group;
6402 }
6403 EXPORT_SYMBOL(dev_set_group);
6404
6405 /**
6406  *      dev_set_mac_address - Change Media Access Control Address
6407  *      @dev: device
6408  *      @sa: new address
6409  *
6410  *      Change the hardware (MAC) address of the device
6411  */
6412 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
6413 {
6414         const struct net_device_ops *ops = dev->netdev_ops;
6415         int err;
6416
6417         if (!ops->ndo_set_mac_address)
6418                 return -EOPNOTSUPP;
6419         if (sa->sa_family != dev->type)
6420                 return -EINVAL;
6421         if (!netif_device_present(dev))
6422                 return -ENODEV;
6423         err = ops->ndo_set_mac_address(dev, sa);
6424         if (err)
6425                 return err;
6426         dev->addr_assign_type = NET_ADDR_SET;
6427         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
6428         add_device_randomness(dev->dev_addr, dev->addr_len);
6429         return 0;
6430 }
6431 EXPORT_SYMBOL(dev_set_mac_address);
6432
6433 /**
6434  *      dev_change_carrier - Change device carrier
6435  *      @dev: device
6436  *      @new_carrier: new value
6437  *
6438  *      Change device carrier
6439  */
6440 int dev_change_carrier(struct net_device *dev, bool new_carrier)
6441 {
6442         const struct net_device_ops *ops = dev->netdev_ops;
6443
6444         if (!ops->ndo_change_carrier)
6445                 return -EOPNOTSUPP;
6446         if (!netif_device_present(dev))
6447                 return -ENODEV;
6448         return ops->ndo_change_carrier(dev, new_carrier);
6449 }
6450 EXPORT_SYMBOL(dev_change_carrier);
6451
6452 /**
6453  *      dev_get_phys_port_id - Get device physical port ID
6454  *      @dev: device
6455  *      @ppid: port ID
6456  *
6457  *      Get device physical port ID
6458  */
6459 int dev_get_phys_port_id(struct net_device *dev,
6460                          struct netdev_phys_item_id *ppid)
6461 {
6462         const struct net_device_ops *ops = dev->netdev_ops;
6463
6464         if (!ops->ndo_get_phys_port_id)
6465                 return -EOPNOTSUPP;
6466         return ops->ndo_get_phys_port_id(dev, ppid);
6467 }
6468 EXPORT_SYMBOL(dev_get_phys_port_id);
6469
6470 /**
6471  *      dev_get_phys_port_name - Get device physical port name
6472  *      @dev: device
6473  *      @name: port name
6474  *      @len: limit of bytes to copy to name
6475  *
6476  *      Get device physical port name
6477  */
6478 int dev_get_phys_port_name(struct net_device *dev,
6479                            char *name, size_t len)
6480 {
6481         const struct net_device_ops *ops = dev->netdev_ops;
6482
6483         if (!ops->ndo_get_phys_port_name)
6484                 return -EOPNOTSUPP;
6485         return ops->ndo_get_phys_port_name(dev, name, len);
6486 }
6487 EXPORT_SYMBOL(dev_get_phys_port_name);
6488
6489 /**
6490  *      dev_change_proto_down - update protocol port state information
6491  *      @dev: device
6492  *      @proto_down: new value
6493  *
6494  *      This info can be used by switch drivers to set the phys state of the
6495  *      port.
6496  */
6497 int dev_change_proto_down(struct net_device *dev, bool proto_down)
6498 {
6499         const struct net_device_ops *ops = dev->netdev_ops;
6500
6501         if (!ops->ndo_change_proto_down)
6502                 return -EOPNOTSUPP;
6503         if (!netif_device_present(dev))
6504                 return -ENODEV;
6505         return ops->ndo_change_proto_down(dev, proto_down);
6506 }
6507 EXPORT_SYMBOL(dev_change_proto_down);
6508
6509 /**
6510  *      dev_change_xdp_fd - set or clear a bpf program for a device rx path
6511  *      @dev: device
6512  *      @fd: new program fd or negative value to clear
6513  *
6514  *      Set or clear a bpf program for a device
6515  */
6516 int dev_change_xdp_fd(struct net_device *dev, int fd)
6517 {
6518         const struct net_device_ops *ops = dev->netdev_ops;
6519         struct bpf_prog *prog = NULL;
6520         struct netdev_xdp xdp = {};
6521         int err;
6522
6523         if (!ops->ndo_xdp)
6524                 return -EOPNOTSUPP;
6525         if (fd >= 0) {
6526                 prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_XDP);
6527                 if (IS_ERR(prog))
6528                         return PTR_ERR(prog);
6529         }
6530
6531         xdp.command = XDP_SETUP_PROG;
6532         xdp.prog = prog;
6533         err = ops->ndo_xdp(dev, &xdp);
6534         if (err < 0 && prog)
6535                 bpf_prog_put(prog);
6536
6537         return err;
6538 }
6539 EXPORT_SYMBOL(dev_change_xdp_fd);
6540
6541 /**
6542  *      dev_new_index   -       allocate an ifindex
6543  *      @net: the applicable net namespace
6544  *
6545  *      Returns a suitable unique value for a new device interface
6546  *      number.  The caller must hold the rtnl semaphore or the
6547  *      dev_base_lock to be sure it remains unique.
6548  */
6549 static int dev_new_index(struct net *net)
6550 {
6551         int ifindex = net->ifindex;
6552         for (;;) {
6553                 if (++ifindex <= 0)
6554                         ifindex = 1;
6555                 if (!__dev_get_by_index(net, ifindex))
6556                         return net->ifindex = ifindex;
6557         }
6558 }
6559
6560 /* Delayed registration/unregisteration */
6561 static LIST_HEAD(net_todo_list);
6562 DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
6563
6564 static void net_set_todo(struct net_device *dev)
6565 {
6566         list_add_tail(&dev->todo_list, &net_todo_list);
6567         dev_net(dev)->dev_unreg_count++;
6568 }
6569
6570 static void rollback_registered_many(struct list_head *head)
6571 {
6572         struct net_device *dev, *tmp;
6573         LIST_HEAD(close_head);
6574
6575         BUG_ON(dev_boot_phase);
6576         ASSERT_RTNL();
6577
6578         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
6579                 /* Some devices call without registering
6580                  * for initialization unwind. Remove those
6581                  * devices and proceed with the remaining.
6582                  */
6583                 if (dev->reg_state == NETREG_UNINITIALIZED) {
6584                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
6585                                  dev->name, dev);
6586
6587                         WARN_ON(1);
6588                         list_del(&dev->unreg_list);
6589                         continue;
6590                 }
6591                 dev->dismantle = true;
6592                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
6593         }
6594
6595         /* If device is running, close it first. */
6596         list_for_each_entry(dev, head, unreg_list)
6597                 list_add_tail(&dev->close_list, &close_head);
6598         dev_close_many(&close_head, true);
6599
6600         list_for_each_entry(dev, head, unreg_list) {
6601                 /* And unlink it from device chain. */
6602                 unlist_netdevice(dev);
6603
6604                 dev->reg_state = NETREG_UNREGISTERING;
6605         }
6606         flush_all_backlogs();
6607
6608         synchronize_net();
6609
6610         list_for_each_entry(dev, head, unreg_list) {
6611                 struct sk_buff *skb = NULL;
6612
6613                 /* Shutdown queueing discipline. */
6614                 dev_shutdown(dev);
6615
6616
6617                 /* Notify protocols, that we are about to destroy
6618                    this device. They should clean all the things.
6619                 */
6620                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6621
6622                 if (!dev->rtnl_link_ops ||
6623                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
6624                         skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U,
6625                                                      GFP_KERNEL);
6626
6627                 /*
6628                  *      Flush the unicast and multicast chains
6629                  */
6630                 dev_uc_flush(dev);
6631                 dev_mc_flush(dev);
6632
6633                 if (dev->netdev_ops->ndo_uninit)
6634                         dev->netdev_ops->ndo_uninit(dev);
6635
6636                 if (skb)
6637                         rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
6638
6639                 /* Notifier chain MUST detach us all upper devices. */
6640                 WARN_ON(netdev_has_any_upper_dev(dev));
6641                 WARN_ON(netdev_has_any_lower_dev(dev));
6642
6643                 /* Remove entries from kobject tree */
6644                 netdev_unregister_kobject(dev);
6645 #ifdef CONFIG_XPS
6646                 /* Remove XPS queueing entries */
6647                 netif_reset_xps_queues_gt(dev, 0);
6648 #endif
6649         }
6650
6651         synchronize_net();
6652
6653         list_for_each_entry(dev, head, unreg_list)
6654                 dev_put(dev);
6655 }
6656
6657 static void rollback_registered(struct net_device *dev)
6658 {
6659         LIST_HEAD(single);
6660
6661         list_add(&dev->unreg_list, &single);
6662         rollback_registered_many(&single);
6663         list_del(&single);
6664 }
6665
6666 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
6667         struct net_device *upper, netdev_features_t features)
6668 {
6669         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6670         netdev_features_t feature;
6671         int feature_bit;
6672
6673         for_each_netdev_feature(&upper_disables, feature_bit) {
6674                 feature = __NETIF_F_BIT(feature_bit);
6675                 if (!(upper->wanted_features & feature)
6676                     && (features & feature)) {
6677                         netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
6678                                    &feature, upper->name);
6679                         features &= ~feature;
6680                 }
6681         }
6682
6683         return features;
6684 }
6685
6686 static void netdev_sync_lower_features(struct net_device *upper,
6687         struct net_device *lower, netdev_features_t features)
6688 {
6689         netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
6690         netdev_features_t feature;
6691         int feature_bit;
6692
6693         for_each_netdev_feature(&upper_disables, feature_bit) {
6694                 feature = __NETIF_F_BIT(feature_bit);
6695                 if (!(features & feature) && (lower->features & feature)) {
6696                         netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
6697                                    &feature, lower->name);
6698                         lower->wanted_features &= ~feature;
6699                         netdev_update_features(lower);
6700
6701                         if (unlikely(lower->features & feature))
6702                                 netdev_WARN(upper, "failed to disable %pNF on %s!\n",
6703                                             &feature, lower->name);
6704                 }
6705         }
6706 }
6707
6708 static netdev_features_t netdev_fix_features(struct net_device *dev,
6709         netdev_features_t features)
6710 {
6711         /* Fix illegal checksum combinations */
6712         if ((features & NETIF_F_HW_CSUM) &&
6713             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6714                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
6715                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
6716         }
6717
6718         /* TSO requires that SG is present as well. */
6719         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6720                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
6721                 features &= ~NETIF_F_ALL_TSO;
6722         }
6723
6724         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
6725                                         !(features & NETIF_F_IP_CSUM)) {
6726                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
6727                 features &= ~NETIF_F_TSO;
6728                 features &= ~NETIF_F_TSO_ECN;
6729         }
6730
6731         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
6732                                          !(features & NETIF_F_IPV6_CSUM)) {
6733                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
6734                 features &= ~NETIF_F_TSO6;
6735         }
6736
6737         /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
6738         if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
6739                 features &= ~NETIF_F_TSO_MANGLEID;
6740
6741         /* TSO ECN requires that TSO is present as well. */
6742         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
6743                 features &= ~NETIF_F_TSO_ECN;
6744
6745         /* Software GSO depends on SG. */
6746         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6747                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
6748                 features &= ~NETIF_F_GSO;
6749         }
6750
6751         /* UFO needs SG and checksumming */
6752         if (features & NETIF_F_UFO) {
6753                 /* maybe split UFO into V4 and V6? */
6754                 if (!(features & NETIF_F_HW_CSUM) &&
6755                     ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
6756                      (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
6757                         netdev_dbg(dev,
6758                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
6759                         features &= ~NETIF_F_UFO;
6760                 }
6761
6762                 if (!(features & NETIF_F_SG)) {
6763                         netdev_dbg(dev,
6764                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
6765                         features &= ~NETIF_F_UFO;
6766                 }
6767         }
6768
6769         /* GSO partial features require GSO partial be set */
6770         if ((features & dev->gso_partial_features) &&
6771             !(features & NETIF_F_GSO_PARTIAL)) {
6772                 netdev_dbg(dev,
6773                            "Dropping partially supported GSO features since no GSO partial.\n");
6774                 features &= ~dev->gso_partial_features;
6775         }
6776
6777 #ifdef CONFIG_NET_RX_BUSY_POLL
6778         if (dev->netdev_ops->ndo_busy_poll)
6779                 features |= NETIF_F_BUSY_POLL;
6780         else
6781 #endif
6782                 features &= ~NETIF_F_BUSY_POLL;
6783
6784         return features;
6785 }
6786
6787 int __netdev_update_features(struct net_device *dev)
6788 {
6789         struct net_device *upper, *lower;
6790         netdev_features_t features;
6791         struct list_head *iter;
6792         int err = -1;
6793
6794         ASSERT_RTNL();
6795
6796         features = netdev_get_wanted_features(dev);
6797
6798         if (dev->netdev_ops->ndo_fix_features)
6799                 features = dev->netdev_ops->ndo_fix_features(dev, features);
6800
6801         /* driver might be less strict about feature dependencies */
6802         features = netdev_fix_features(dev, features);
6803
6804         /* some features can't be enabled if they're off an an upper device */
6805         netdev_for_each_upper_dev_rcu(dev, upper, iter)
6806                 features = netdev_sync_upper_features(dev, upper, features);
6807
6808         if (dev->features == features)
6809                 goto sync_lower;
6810
6811         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
6812                 &dev->features, &features);
6813
6814         if (dev->netdev_ops->ndo_set_features)
6815                 err = dev->netdev_ops->ndo_set_features(dev, features);
6816         else
6817                 err = 0;
6818
6819         if (unlikely(err < 0)) {
6820                 netdev_err(dev,
6821                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
6822                         err, &features, &dev->features);
6823                 /* return non-0 since some features might have changed and
6824                  * it's better to fire a spurious notification than miss it
6825                  */
6826                 return -1;
6827         }
6828
6829 sync_lower:
6830         /* some features must be disabled on lower devices when disabled
6831          * on an upper device (think: bonding master or bridge)
6832          */
6833         netdev_for_each_lower_dev(dev, lower, iter)
6834                 netdev_sync_lower_features(dev, lower, features);
6835
6836         if (!err)
6837                 dev->features = features;
6838
6839         return err < 0 ? 0 : 1;
6840 }
6841
6842 /**
6843  *      netdev_update_features - recalculate device features
6844  *      @dev: the device to check
6845  *
6846  *      Recalculate dev->features set and send notifications if it
6847  *      has changed. Should be called after driver or hardware dependent
6848  *      conditions might have changed that influence the features.
6849  */
6850 void netdev_update_features(struct net_device *dev)
6851 {
6852         if (__netdev_update_features(dev))
6853                 netdev_features_change(dev);
6854 }
6855 EXPORT_SYMBOL(netdev_update_features);
6856
6857 /**
6858  *      netdev_change_features - recalculate device features
6859  *      @dev: the device to check
6860  *
6861  *      Recalculate dev->features set and send notifications even
6862  *      if they have not changed. Should be called instead of
6863  *      netdev_update_features() if also dev->vlan_features might
6864  *      have changed to allow the changes to be propagated to stacked
6865  *      VLAN devices.
6866  */
6867 void netdev_change_features(struct net_device *dev)
6868 {
6869         __netdev_update_features(dev);
6870         netdev_features_change(dev);
6871 }
6872 EXPORT_SYMBOL(netdev_change_features);
6873
6874 /**
6875  *      netif_stacked_transfer_operstate -      transfer operstate
6876  *      @rootdev: the root or lower level device to transfer state from
6877  *      @dev: the device to transfer operstate to
6878  *
6879  *      Transfer operational state from root to device. This is normally
6880  *      called when a stacking relationship exists between the root
6881  *      device and the device(a leaf device).
6882  */
6883 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
6884                                         struct net_device *dev)
6885 {
6886         if (rootdev->operstate == IF_OPER_DORMANT)
6887                 netif_dormant_on(dev);
6888         else
6889                 netif_dormant_off(dev);
6890
6891         if (netif_carrier_ok(rootdev)) {
6892                 if (!netif_carrier_ok(dev))
6893                         netif_carrier_on(dev);
6894         } else {
6895                 if (netif_carrier_ok(dev))
6896                         netif_carrier_off(dev);
6897         }
6898 }
6899 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
6900
6901 #ifdef CONFIG_SYSFS
6902 static int netif_alloc_rx_queues(struct net_device *dev)
6903 {
6904         unsigned int i, count = dev->num_rx_queues;
6905         struct netdev_rx_queue *rx;
6906         size_t sz = count * sizeof(*rx);
6907
6908         BUG_ON(count < 1);
6909
6910         rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6911         if (!rx) {
6912                 rx = vzalloc(sz);
6913                 if (!rx)
6914                         return -ENOMEM;
6915         }
6916         dev->_rx = rx;
6917
6918         for (i = 0; i < count; i++)
6919                 rx[i].dev = dev;
6920         return 0;
6921 }
6922 #endif
6923
6924 static void netdev_init_one_queue(struct net_device *dev,
6925                                   struct netdev_queue *queue, void *_unused)
6926 {
6927         /* Initialize queue lock */
6928         spin_lock_init(&queue->_xmit_lock);
6929         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
6930         queue->xmit_lock_owner = -1;
6931         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
6932         queue->dev = dev;
6933 #ifdef CONFIG_BQL
6934         dql_init(&queue->dql, HZ);
6935 #endif
6936 }
6937
6938 static void netif_free_tx_queues(struct net_device *dev)
6939 {
6940         kvfree(dev->_tx);
6941 }
6942
6943 static int netif_alloc_netdev_queues(struct net_device *dev)
6944 {
6945         unsigned int count = dev->num_tx_queues;
6946         struct netdev_queue *tx;
6947         size_t sz = count * sizeof(*tx);
6948
6949         if (count < 1 || count > 0xffff)
6950                 return -EINVAL;
6951
6952         tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
6953         if (!tx) {
6954                 tx = vzalloc(sz);
6955                 if (!tx)
6956                         return -ENOMEM;
6957         }
6958         dev->_tx = tx;
6959
6960         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
6961         spin_lock_init(&dev->tx_global_lock);
6962
6963         return 0;
6964 }
6965
6966 void netif_tx_stop_all_queues(struct net_device *dev)
6967 {
6968         unsigned int i;
6969
6970         for (i = 0; i < dev->num_tx_queues; i++) {
6971                 struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
6972                 netif_tx_stop_queue(txq);
6973         }
6974 }
6975 EXPORT_SYMBOL(netif_tx_stop_all_queues);
6976
6977 /**
6978  *      register_netdevice      - register a network device
6979  *      @dev: device to register
6980  *
6981  *      Take a completed network device structure and add it to the kernel
6982  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
6983  *      chain. 0 is returned on success. A negative errno code is returned
6984  *      on a failure to set up the device, or if the name is a duplicate.
6985  *
6986  *      Callers must hold the rtnl semaphore. You may want
6987  *      register_netdev() instead of this.
6988  *
6989  *      BUGS:
6990  *      The locking appears insufficient to guarantee two parallel registers
6991  *      will not get the same name.
6992  */
6993
6994 int register_netdevice(struct net_device *dev)
6995 {
6996         int ret;
6997         struct net *net = dev_net(dev);
6998
6999         BUG_ON(dev_boot_phase);
7000         ASSERT_RTNL();
7001
7002         might_sleep();
7003
7004         /* When net_device's are persistent, this will be fatal. */
7005         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
7006         BUG_ON(!net);
7007
7008         spin_lock_init(&dev->addr_list_lock);
7009         netdev_set_addr_lockdep_class(dev);
7010
7011         ret = dev_get_valid_name(net, dev, dev->name);
7012         if (ret < 0)
7013                 goto out;
7014
7015         /* Init, if this function is available */
7016         if (dev->netdev_ops->ndo_init) {
7017                 ret = dev->netdev_ops->ndo_init(dev);
7018                 if (ret) {
7019                         if (ret > 0)
7020                                 ret = -EIO;
7021                         goto out;
7022                 }
7023         }
7024
7025         if (((dev->hw_features | dev->features) &
7026              NETIF_F_HW_VLAN_CTAG_FILTER) &&
7027             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
7028              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
7029                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
7030                 ret = -EINVAL;
7031                 goto err_uninit;
7032         }
7033
7034         ret = -EBUSY;
7035         if (!dev->ifindex)
7036                 dev->ifindex = dev_new_index(net);
7037         else if (__dev_get_by_index(net, dev->ifindex))
7038                 goto err_uninit;
7039
7040         /* Transfer changeable features to wanted_features and enable
7041          * software offloads (GSO and GRO).
7042          */
7043         dev->hw_features |= NETIF_F_SOFT_FEATURES;
7044         dev->features |= NETIF_F_SOFT_FEATURES;
7045         dev->wanted_features = dev->features & dev->hw_features;
7046
7047         if (!(dev->flags & IFF_LOOPBACK))
7048                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
7049
7050         /* If IPv4 TCP segmentation offload is supported we should also
7051          * allow the device to enable segmenting the frame with the option
7052          * of ignoring a static IP ID value.  This doesn't enable the
7053          * feature itself but allows the user to enable it later.
7054          */
7055         if (dev->hw_features & NETIF_F_TSO)
7056                 dev->hw_features |= NETIF_F_TSO_MANGLEID;
7057         if (dev->vlan_features & NETIF_F_TSO)
7058                 dev->vlan_features |= NETIF_F_TSO_MANGLEID;
7059         if (dev->mpls_features & NETIF_F_TSO)
7060                 dev->mpls_features |= NETIF_F_TSO_MANGLEID;
7061         if (dev->hw_enc_features & NETIF_F_TSO)
7062                 dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
7063
7064         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
7065          */
7066         dev->vlan_features |= NETIF_F_HIGHDMA;
7067
7068         /* Make NETIF_F_SG inheritable to tunnel devices.
7069          */
7070         dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
7071
7072         /* Make NETIF_F_SG inheritable to MPLS.
7073          */
7074         dev->mpls_features |= NETIF_F_SG;
7075
7076         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
7077         ret = notifier_to_errno(ret);
7078         if (ret)
7079                 goto err_uninit;
7080
7081         ret = netdev_register_kobject(dev);
7082         if (ret)
7083                 goto err_uninit;
7084         dev->reg_state = NETREG_REGISTERED;
7085
7086         __netdev_update_features(dev);
7087
7088         /*
7089          *      Default initial state at registry is that the
7090          *      device is present.
7091          */
7092
7093         set_bit(__LINK_STATE_PRESENT, &dev->state);
7094
7095         linkwatch_init_dev(dev);
7096
7097         dev_init_scheduler(dev);
7098         dev_hold(dev);
7099         list_netdevice(dev);
7100         add_device_randomness(dev->dev_addr, dev->addr_len);
7101
7102         /* If the device has permanent device address, driver should
7103          * set dev_addr and also addr_assign_type should be set to
7104          * NET_ADDR_PERM (default value).
7105          */
7106         if (dev->addr_assign_type == NET_ADDR_PERM)
7107                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
7108
7109         /* Notify protocols, that a new device appeared. */
7110         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
7111         ret = notifier_to_errno(ret);
7112         if (ret) {
7113                 rollback_registered(dev);
7114                 dev->reg_state = NETREG_UNREGISTERED;
7115         }
7116         /*
7117          *      Prevent userspace races by waiting until the network
7118          *      device is fully setup before sending notifications.
7119          */
7120         if (!dev->rtnl_link_ops ||
7121             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
7122                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7123
7124 out:
7125         return ret;
7126
7127 err_uninit:
7128         if (dev->netdev_ops->ndo_uninit)
7129                 dev->netdev_ops->ndo_uninit(dev);
7130         goto out;
7131 }
7132 EXPORT_SYMBOL(register_netdevice);
7133
7134 /**
7135  *      init_dummy_netdev       - init a dummy network device for NAPI
7136  *      @dev: device to init
7137  *
7138  *      This takes a network device structure and initialize the minimum
7139  *      amount of fields so it can be used to schedule NAPI polls without
7140  *      registering a full blown interface. This is to be used by drivers
7141  *      that need to tie several hardware interfaces to a single NAPI
7142  *      poll scheduler due to HW limitations.
7143  */
7144 int init_dummy_netdev(struct net_device *dev)
7145 {
7146         /* Clear everything. Note we don't initialize spinlocks
7147          * are they aren't supposed to be taken by any of the
7148          * NAPI code and this dummy netdev is supposed to be
7149          * only ever used for NAPI polls
7150          */
7151         memset(dev, 0, sizeof(struct net_device));
7152
7153         /* make sure we BUG if trying to hit standard
7154          * register/unregister code path
7155          */
7156         dev->reg_state = NETREG_DUMMY;
7157
7158         /* NAPI wants this */
7159         INIT_LIST_HEAD(&dev->napi_list);
7160
7161         /* a dummy interface is started by default */
7162         set_bit(__LINK_STATE_PRESENT, &dev->state);
7163         set_bit(__LINK_STATE_START, &dev->state);
7164
7165         /* Note : We dont allocate pcpu_refcnt for dummy devices,
7166          * because users of this 'device' dont need to change
7167          * its refcount.
7168          */
7169
7170         return 0;
7171 }
7172 EXPORT_SYMBOL_GPL(init_dummy_netdev);
7173
7174
7175 /**
7176  *      register_netdev - register a network device
7177  *      @dev: device to register
7178  *
7179  *      Take a completed network device structure and add it to the kernel
7180  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
7181  *      chain. 0 is returned on success. A negative errno code is returned
7182  *      on a failure to set up the device, or if the name is a duplicate.
7183  *
7184  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
7185  *      and expands the device name if you passed a format string to
7186  *      alloc_netdev.
7187  */
7188 int register_netdev(struct net_device *dev)
7189 {
7190         int err;
7191
7192         rtnl_lock();
7193         err = register_netdevice(dev);
7194         rtnl_unlock();
7195         return err;
7196 }
7197 EXPORT_SYMBOL(register_netdev);
7198
7199 int netdev_refcnt_read(const struct net_device *dev)
7200 {
7201         int i, refcnt = 0;
7202
7203         for_each_possible_cpu(i)
7204                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
7205         return refcnt;
7206 }
7207 EXPORT_SYMBOL(netdev_refcnt_read);
7208
7209 /**
7210  * netdev_wait_allrefs - wait until all references are gone.
7211  * @dev: target net_device
7212  *
7213  * This is called when unregistering network devices.
7214  *
7215  * Any protocol or device that holds a reference should register
7216  * for netdevice notification, and cleanup and put back the
7217  * reference if they receive an UNREGISTER event.
7218  * We can get stuck here if buggy protocols don't correctly
7219  * call dev_put.
7220  */
7221 static void netdev_wait_allrefs(struct net_device *dev)
7222 {
7223         unsigned long rebroadcast_time, warning_time;
7224         int refcnt;
7225
7226         linkwatch_forget_dev(dev);
7227
7228         rebroadcast_time = warning_time = jiffies;
7229         refcnt = netdev_refcnt_read(dev);
7230
7231         while (refcnt != 0) {
7232                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
7233                         rtnl_lock();
7234
7235                         /* Rebroadcast unregister notification */
7236                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7237
7238                         __rtnl_unlock();
7239                         rcu_barrier();
7240                         rtnl_lock();
7241
7242                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7243                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
7244                                      &dev->state)) {
7245                                 /* We must not have linkwatch events
7246                                  * pending on unregister. If this
7247                                  * happens, we simply run the queue
7248                                  * unscheduled, resulting in a noop
7249                                  * for this device.
7250                                  */
7251                                 linkwatch_run_queue();
7252                         }
7253
7254                         __rtnl_unlock();
7255
7256                         rebroadcast_time = jiffies;
7257                 }
7258
7259                 msleep(250);
7260
7261                 refcnt = netdev_refcnt_read(dev);
7262
7263                 if (time_after(jiffies, warning_time + 10 * HZ)) {
7264                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
7265                                  dev->name, refcnt);
7266                         warning_time = jiffies;
7267                 }
7268         }
7269 }
7270
7271 /* The sequence is:
7272  *
7273  *      rtnl_lock();
7274  *      ...
7275  *      register_netdevice(x1);
7276  *      register_netdevice(x2);
7277  *      ...
7278  *      unregister_netdevice(y1);
7279  *      unregister_netdevice(y2);
7280  *      ...
7281  *      rtnl_unlock();
7282  *      free_netdev(y1);
7283  *      free_netdev(y2);
7284  *
7285  * We are invoked by rtnl_unlock().
7286  * This allows us to deal with problems:
7287  * 1) We can delete sysfs objects which invoke hotplug
7288  *    without deadlocking with linkwatch via keventd.
7289  * 2) Since we run with the RTNL semaphore not held, we can sleep
7290  *    safely in order to wait for the netdev refcnt to drop to zero.
7291  *
7292  * We must not return until all unregister events added during
7293  * the interval the lock was held have been completed.
7294  */
7295 void netdev_run_todo(void)
7296 {
7297         struct list_head list;
7298
7299         /* Snapshot list, allow later requests */
7300         list_replace_init(&net_todo_list, &list);
7301
7302         __rtnl_unlock();
7303
7304
7305         /* Wait for rcu callbacks to finish before next phase */
7306         if (!list_empty(&list))
7307                 rcu_barrier();
7308
7309         while (!list_empty(&list)) {
7310                 struct net_device *dev
7311                         = list_first_entry(&list, struct net_device, todo_list);
7312                 list_del(&dev->todo_list);
7313
7314                 rtnl_lock();
7315                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7316                 __rtnl_unlock();
7317
7318                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7319                         pr_err("network todo '%s' but state %d\n",
7320                                dev->name, dev->reg_state);
7321                         dump_stack();
7322                         continue;
7323                 }
7324
7325                 dev->reg_state = NETREG_UNREGISTERED;
7326
7327                 netdev_wait_allrefs(dev);
7328
7329                 /* paranoia */
7330                 BUG_ON(netdev_refcnt_read(dev));
7331                 BUG_ON(!list_empty(&dev->ptype_all));
7332                 BUG_ON(!list_empty(&dev->ptype_specific));
7333                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
7334                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
7335                 WARN_ON(dev->dn_ptr);
7336
7337                 if (dev->destructor)
7338                         dev->destructor(dev);
7339
7340                 /* Report a network device has been unregistered */
7341                 rtnl_lock();
7342                 dev_net(dev)->dev_unreg_count--;
7343                 __rtnl_unlock();
7344                 wake_up(&netdev_unregistering_wq);
7345
7346                 /* Free network device */
7347                 kobject_put(&dev->dev.kobj);
7348         }
7349 }
7350
7351 /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
7352  * all the same fields in the same order as net_device_stats, with only
7353  * the type differing, but rtnl_link_stats64 may have additional fields
7354  * at the end for newer counters.
7355  */
7356 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
7357                              const struct net_device_stats *netdev_stats)
7358 {
7359 #if BITS_PER_LONG == 64
7360         BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
7361         memcpy(stats64, netdev_stats, sizeof(*stats64));
7362         /* zero out counters that only exist in rtnl_link_stats64 */
7363         memset((char *)stats64 + sizeof(*netdev_stats), 0,
7364                sizeof(*stats64) - sizeof(*netdev_stats));
7365 #else
7366         size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
7367         const unsigned long *src = (const unsigned long *)netdev_stats;
7368         u64 *dst = (u64 *)stats64;
7369
7370         BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
7371         for (i = 0; i < n; i++)
7372                 dst[i] = src[i];
7373         /* zero out counters that only exist in rtnl_link_stats64 */
7374         memset((char *)stats64 + n * sizeof(u64), 0,
7375                sizeof(*stats64) - n * sizeof(u64));
7376 #endif
7377 }
7378 EXPORT_SYMBOL(netdev_stats_to_stats64);
7379
7380 /**
7381  *      dev_get_stats   - get network device statistics
7382  *      @dev: device to get statistics from
7383  *      @storage: place to store stats
7384  *
7385  *      Get network statistics from device. Return @storage.
7386  *      The device driver may provide its own method by setting
7387  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
7388  *      otherwise the internal statistics structure is used.
7389  */
7390 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
7391                                         struct rtnl_link_stats64 *storage)
7392 {
7393         const struct net_device_ops *ops = dev->netdev_ops;
7394
7395         if (ops->ndo_get_stats64) {
7396                 memset(storage, 0, sizeof(*storage));
7397                 ops->ndo_get_stats64(dev, storage);
7398         } else if (ops->ndo_get_stats) {
7399                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
7400         } else {
7401                 netdev_stats_to_stats64(storage, &dev->stats);
7402         }
7403         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
7404         storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
7405         storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
7406         return storage;
7407 }
7408 EXPORT_SYMBOL(dev_get_stats);
7409
7410 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
7411 {
7412         struct netdev_queue *queue = dev_ingress_queue(dev);
7413
7414 #ifdef CONFIG_NET_CLS_ACT
7415         if (queue)
7416                 return queue;
7417         queue = kzalloc(sizeof(*queue), GFP_KERNEL);
7418         if (!queue)
7419                 return NULL;
7420         netdev_init_one_queue(dev, queue, NULL);
7421         RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
7422         queue->qdisc_sleeping = &noop_qdisc;
7423         rcu_assign_pointer(dev->ingress_queue, queue);
7424 #endif
7425         return queue;
7426 }
7427
7428 static const struct ethtool_ops default_ethtool_ops;
7429
7430 void netdev_set_default_ethtool_ops(struct net_device *dev,
7431                                     const struct ethtool_ops *ops)
7432 {
7433         if (dev->ethtool_ops == &default_ethtool_ops)
7434                 dev->ethtool_ops = ops;
7435 }
7436 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
7437
7438 void netdev_freemem(struct net_device *dev)
7439 {
7440         char *addr = (char *)dev - dev->padded;
7441
7442         kvfree(addr);
7443 }
7444
7445 /**
7446  *      alloc_netdev_mqs - allocate network device
7447  *      @sizeof_priv:           size of private data to allocate space for
7448  *      @name:                  device name format string
7449  *      @name_assign_type:      origin of device name
7450  *      @setup:                 callback to initialize device
7451  *      @txqs:                  the number of TX subqueues to allocate
7452  *      @rxqs:                  the number of RX subqueues to allocate
7453  *
7454  *      Allocates a struct net_device with private data area for driver use
7455  *      and performs basic initialization.  Also allocates subqueue structs
7456  *      for each queue on the device.
7457  */
7458 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
7459                 unsigned char name_assign_type,
7460                 void (*setup)(struct net_device *),
7461                 unsigned int txqs, unsigned int rxqs)
7462 {
7463         struct net_device *dev;
7464         size_t alloc_size;
7465         struct net_device *p;
7466
7467         BUG_ON(strlen(name) >= sizeof(dev->name));
7468
7469         if (txqs < 1) {
7470                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
7471                 return NULL;
7472         }
7473
7474 #ifdef CONFIG_SYSFS
7475         if (rxqs < 1) {
7476                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
7477                 return NULL;
7478         }
7479 #endif
7480
7481         alloc_size = sizeof(struct net_device);
7482         if (sizeof_priv) {
7483                 /* ensure 32-byte alignment of private area */
7484                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
7485                 alloc_size += sizeof_priv;
7486         }
7487         /* ensure 32-byte alignment of whole construct */
7488         alloc_size += NETDEV_ALIGN - 1;
7489
7490         p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
7491         if (!p)
7492                 p = vzalloc(alloc_size);
7493         if (!p)
7494                 return NULL;
7495
7496         dev = PTR_ALIGN(p, NETDEV_ALIGN);
7497         dev->padded = (char *)dev - (char *)p;
7498
7499         dev->pcpu_refcnt = alloc_percpu(int);
7500         if (!dev->pcpu_refcnt)
7501                 goto free_dev;
7502
7503         if (dev_addr_init(dev))
7504                 goto free_pcpu;
7505
7506         dev_mc_init(dev);
7507         dev_uc_init(dev);
7508
7509         dev_net_set(dev, &init_net);
7510
7511         dev->gso_max_size = GSO_MAX_SIZE;
7512         dev->gso_max_segs = GSO_MAX_SEGS;
7513
7514         INIT_LIST_HEAD(&dev->napi_list);
7515         INIT_LIST_HEAD(&dev->unreg_list);
7516         INIT_LIST_HEAD(&dev->close_list);
7517         INIT_LIST_HEAD(&dev->link_watch_list);
7518         INIT_LIST_HEAD(&dev->adj_list.upper);
7519         INIT_LIST_HEAD(&dev->adj_list.lower);
7520         INIT_LIST_HEAD(&dev->ptype_all);
7521         INIT_LIST_HEAD(&dev->ptype_specific);
7522 #ifdef CONFIG_NET_SCHED
7523         hash_init(dev->qdisc_hash);
7524 #endif
7525         dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
7526         setup(dev);
7527
7528         if (!dev->tx_queue_len) {
7529                 dev->priv_flags |= IFF_NO_QUEUE;
7530                 dev->tx_queue_len = 1;
7531         }
7532
7533         dev->num_tx_queues = txqs;
7534         dev->real_num_tx_queues = txqs;
7535         if (netif_alloc_netdev_queues(dev))
7536                 goto free_all;
7537
7538 #ifdef CONFIG_SYSFS
7539         dev->num_rx_queues = rxqs;
7540         dev->real_num_rx_queues = rxqs;
7541         if (netif_alloc_rx_queues(dev))
7542                 goto free_all;
7543 #endif
7544
7545         strcpy(dev->name, name);
7546         dev->name_assign_type = name_assign_type;
7547         dev->group = INIT_NETDEV_GROUP;
7548         if (!dev->ethtool_ops)
7549                 dev->ethtool_ops = &default_ethtool_ops;
7550
7551         nf_hook_ingress_init(dev);
7552
7553         return dev;
7554
7555 free_all:
7556         free_netdev(dev);
7557         return NULL;
7558
7559 free_pcpu:
7560         free_percpu(dev->pcpu_refcnt);
7561 free_dev:
7562         netdev_freemem(dev);
7563         return NULL;
7564 }
7565 EXPORT_SYMBOL(alloc_netdev_mqs);
7566
7567 /**
7568  *      free_netdev - free network device
7569  *      @dev: device
7570  *
7571  *      This function does the last stage of destroying an allocated device
7572  *      interface. The reference to the device object is released.
7573  *      If this is the last reference then it will be freed.
7574  *      Must be called in process context.
7575  */
7576 void free_netdev(struct net_device *dev)
7577 {
7578         struct napi_struct *p, *n;
7579
7580         might_sleep();
7581         netif_free_tx_queues(dev);
7582 #ifdef CONFIG_SYSFS
7583         kvfree(dev->_rx);
7584 #endif
7585
7586         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
7587
7588         /* Flush device addresses */
7589         dev_addr_flush(dev);
7590
7591         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
7592                 netif_napi_del(p);
7593
7594         free_percpu(dev->pcpu_refcnt);
7595         dev->pcpu_refcnt = NULL;
7596
7597         /*  Compatibility with error handling in drivers */
7598         if (dev->reg_state == NETREG_UNINITIALIZED) {
7599                 netdev_freemem(dev);
7600                 return;
7601         }
7602
7603         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
7604         dev->reg_state = NETREG_RELEASED;
7605
7606         /* will free via device release */
7607         put_device(&dev->dev);
7608 }
7609 EXPORT_SYMBOL(free_netdev);
7610
7611 /**
7612  *      synchronize_net -  Synchronize with packet receive processing
7613  *
7614  *      Wait for packets currently being received to be done.
7615  *      Does not block later packets from starting.
7616  */
7617 void synchronize_net(void)
7618 {
7619         might_sleep();
7620         if (rtnl_is_locked())
7621                 synchronize_rcu_expedited();
7622         else
7623                 synchronize_rcu();
7624 }
7625 EXPORT_SYMBOL(synchronize_net);
7626
7627 /**
7628  *      unregister_netdevice_queue - remove device from the kernel
7629  *      @dev: device
7630  *      @head: list
7631  *
7632  *      This function shuts down a device interface and removes it
7633  *      from the kernel tables.
7634  *      If head not NULL, device is queued to be unregistered later.
7635  *
7636  *      Callers must hold the rtnl semaphore.  You may want
7637  *      unregister_netdev() instead of this.
7638  */
7639
7640 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
7641 {
7642         ASSERT_RTNL();
7643
7644         if (head) {
7645                 list_move_tail(&dev->unreg_list, head);
7646         } else {
7647                 rollback_registered(dev);
7648                 /* Finish processing unregister after unlock */
7649                 net_set_todo(dev);
7650         }
7651 }
7652 EXPORT_SYMBOL(unregister_netdevice_queue);
7653
7654 /**
7655  *      unregister_netdevice_many - unregister many devices
7656  *      @head: list of devices
7657  *
7658  *  Note: As most callers use a stack allocated list_head,
7659  *  we force a list_del() to make sure stack wont be corrupted later.
7660  */
7661 void unregister_netdevice_many(struct list_head *head)
7662 {
7663         struct net_device *dev;
7664
7665         if (!list_empty(head)) {
7666                 rollback_registered_many(head);
7667                 list_for_each_entry(dev, head, unreg_list)
7668                         net_set_todo(dev);
7669                 list_del(head);
7670         }
7671 }
7672 EXPORT_SYMBOL(unregister_netdevice_many);
7673
7674 /**
7675  *      unregister_netdev - remove device from the kernel
7676  *      @dev: device
7677  *
7678  *      This function shuts down a device interface and removes it
7679  *      from the kernel tables.
7680  *
7681  *      This is just a wrapper for unregister_netdevice that takes
7682  *      the rtnl semaphore.  In general you want to use this and not
7683  *      unregister_netdevice.
7684  */
7685 void unregister_netdev(struct net_device *dev)
7686 {
7687         rtnl_lock();
7688         unregister_netdevice(dev);
7689         rtnl_unlock();
7690 }
7691 EXPORT_SYMBOL(unregister_netdev);
7692
7693 /**
7694  *      dev_change_net_namespace - move device to different nethost namespace
7695  *      @dev: device
7696  *      @net: network namespace
7697  *      @pat: If not NULL name pattern to try if the current device name
7698  *            is already taken in the destination network namespace.
7699  *
7700  *      This function shuts down a device interface and moves it
7701  *      to a new network namespace. On success 0 is returned, on
7702  *      a failure a netagive errno code is returned.
7703  *
7704  *      Callers must hold the rtnl semaphore.
7705  */
7706
7707 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
7708 {
7709         int err;
7710
7711         ASSERT_RTNL();
7712
7713         /* Don't allow namespace local devices to be moved. */
7714         err = -EINVAL;
7715         if (dev->features & NETIF_F_NETNS_LOCAL)
7716                 goto out;
7717
7718         /* Ensure the device has been registrered */
7719         if (dev->reg_state != NETREG_REGISTERED)
7720                 goto out;
7721
7722         /* Get out if there is nothing todo */
7723         err = 0;
7724         if (net_eq(dev_net(dev), net))
7725                 goto out;
7726
7727         /* Pick the destination device name, and ensure
7728          * we can use it in the destination network namespace.
7729          */
7730         err = -EEXIST;
7731         if (__dev_get_by_name(net, dev->name)) {
7732                 /* We get here if we can't use the current device name */
7733                 if (!pat)
7734                         goto out;
7735                 if (dev_get_valid_name(net, dev, pat) < 0)
7736                         goto out;
7737         }
7738
7739         /*
7740          * And now a mini version of register_netdevice unregister_netdevice.
7741          */
7742
7743         /* If device is running close it first. */
7744         dev_close(dev);
7745
7746         /* And unlink it from device chain */
7747         err = -ENODEV;
7748         unlist_netdevice(dev);
7749
7750         synchronize_net();
7751
7752         /* Shutdown queueing discipline. */
7753         dev_shutdown(dev);
7754
7755         /* Notify protocols, that we are about to destroy
7756            this device. They should clean all the things.
7757
7758            Note that dev->reg_state stays at NETREG_REGISTERED.
7759            This is wanted because this way 8021q and macvlan know
7760            the device is just moving and can keep their slaves up.
7761         */
7762         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
7763         rcu_barrier();
7764         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
7765         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U, GFP_KERNEL);
7766
7767         /*
7768          *      Flush the unicast and multicast chains
7769          */
7770         dev_uc_flush(dev);
7771         dev_mc_flush(dev);
7772
7773         /* Send a netdev-removed uevent to the old namespace */
7774         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
7775         netdev_adjacent_del_links(dev);
7776
7777         /* Actually switch the network namespace */
7778         dev_net_set(dev, net);
7779
7780         /* If there is an ifindex conflict assign a new one */
7781         if (__dev_get_by_index(net, dev->ifindex))
7782                 dev->ifindex = dev_new_index(net);
7783
7784         /* Send a netdev-add uevent to the new namespace */
7785         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
7786         netdev_adjacent_add_links(dev);
7787
7788         /* Fixup kobjects */
7789         err = device_rename(&dev->dev, dev->name);
7790         WARN_ON(err);
7791
7792         /* Add the device back in the hashes */
7793         list_netdevice(dev);
7794
7795         /* Notify protocols, that a new device appeared. */
7796         call_netdevice_notifiers(NETDEV_REGISTER, dev);
7797
7798         /*
7799          *      Prevent userspace races by waiting until the network
7800          *      device is fully setup before sending notifications.
7801          */
7802         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
7803
7804         synchronize_net();
7805         err = 0;
7806 out:
7807         return err;
7808 }
7809 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
7810
7811 static int dev_cpu_callback(struct notifier_block *nfb,
7812                             unsigned long action,
7813                             void *ocpu)
7814 {
7815         struct sk_buff **list_skb;
7816         struct sk_buff *skb;
7817         unsigned int cpu, oldcpu = (unsigned long)ocpu;
7818         struct softnet_data *sd, *oldsd;
7819
7820         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
7821                 return NOTIFY_OK;
7822
7823         local_irq_disable();
7824         cpu = smp_processor_id();
7825         sd = &per_cpu(softnet_data, cpu);
7826         oldsd = &per_cpu(softnet_data, oldcpu);
7827
7828         /* Find end of our completion_queue. */
7829         list_skb = &sd->completion_queue;
7830         while (*list_skb)
7831                 list_skb = &(*list_skb)->next;
7832         /* Append completion queue from offline CPU. */
7833         *list_skb = oldsd->completion_queue;
7834         oldsd->completion_queue = NULL;
7835
7836         /* Append output queue from offline CPU. */
7837         if (oldsd->output_queue) {
7838                 *sd->output_queue_tailp = oldsd->output_queue;
7839                 sd->output_queue_tailp = oldsd->output_queue_tailp;
7840                 oldsd->output_queue = NULL;
7841                 oldsd->output_queue_tailp = &oldsd->output_queue;
7842         }
7843         /* Append NAPI poll list from offline CPU, with one exception :
7844          * process_backlog() must be called by cpu owning percpu backlog.
7845          * We properly handle process_queue & input_pkt_queue later.
7846          */
7847         while (!list_empty(&oldsd->poll_list)) {
7848                 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
7849                                                             struct napi_struct,
7850                                                             poll_list);
7851
7852                 list_del_init(&napi->poll_list);
7853                 if (napi->poll == process_backlog)
7854                         napi->state = 0;
7855                 else
7856                         ____napi_schedule(sd, napi);
7857         }
7858
7859         raise_softirq_irqoff(NET_TX_SOFTIRQ);
7860         local_irq_enable();
7861
7862         /* Process offline CPU's input_pkt_queue */
7863         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
7864                 netif_rx_ni(skb);
7865                 input_queue_head_incr(oldsd);
7866         }
7867         while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
7868                 netif_rx_ni(skb);
7869                 input_queue_head_incr(oldsd);
7870         }
7871
7872         return NOTIFY_OK;
7873 }
7874
7875
7876 /**
7877  *      netdev_increment_features - increment feature set by one
7878  *      @all: current feature set
7879  *      @one: new feature set
7880  *      @mask: mask feature set
7881  *
7882  *      Computes a new feature set after adding a device with feature set
7883  *      @one to the master device with current feature set @all.  Will not
7884  *      enable anything that is off in @mask. Returns the new feature set.
7885  */
7886 netdev_features_t netdev_increment_features(netdev_features_t all,
7887         netdev_features_t one, netdev_features_t mask)
7888 {
7889         if (mask & NETIF_F_HW_CSUM)
7890                 mask |= NETIF_F_CSUM_MASK;
7891         mask |= NETIF_F_VLAN_CHALLENGED;
7892
7893         all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
7894         all &= one | ~NETIF_F_ALL_FOR_ALL;
7895
7896         /* If one device supports hw checksumming, set for all. */
7897         if (all & NETIF_F_HW_CSUM)
7898                 all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
7899
7900         return all;
7901 }
7902 EXPORT_SYMBOL(netdev_increment_features);
7903
7904 static struct hlist_head * __net_init netdev_create_hash(void)
7905 {
7906         int i;
7907         struct hlist_head *hash;
7908
7909         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
7910         if (hash != NULL)
7911                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
7912                         INIT_HLIST_HEAD(&hash[i]);
7913
7914         return hash;
7915 }
7916
7917 /* Initialize per network namespace state */
7918 static int __net_init netdev_init(struct net *net)
7919 {
7920         if (net != &init_net)
7921                 INIT_LIST_HEAD(&net->dev_base_head);
7922
7923         net->dev_name_head = netdev_create_hash();
7924         if (net->dev_name_head == NULL)
7925                 goto err_name;
7926
7927         net->dev_index_head = netdev_create_hash();
7928         if (net->dev_index_head == NULL)
7929                 goto err_idx;
7930
7931         return 0;
7932
7933 err_idx:
7934         kfree(net->dev_name_head);
7935 err_name:
7936         return -ENOMEM;
7937 }
7938
7939 /**
7940  *      netdev_drivername - network driver for the device
7941  *      @dev: network device
7942  *
7943  *      Determine network driver for device.
7944  */
7945 const char *netdev_drivername(const struct net_device *dev)
7946 {
7947         const struct device_driver *driver;
7948         const struct device *parent;
7949         const char *empty = "";
7950
7951         parent = dev->dev.parent;
7952         if (!parent)
7953                 return empty;
7954
7955         driver = parent->driver;
7956         if (driver && driver->name)
7957                 return driver->name;
7958         return empty;
7959 }
7960
7961 static void __netdev_printk(const char *level, const struct net_device *dev,
7962                             struct va_format *vaf)
7963 {
7964         if (dev && dev->dev.parent) {
7965                 dev_printk_emit(level[1] - '0',
7966                                 dev->dev.parent,
7967                                 "%s %s %s%s: %pV",
7968                                 dev_driver_string(dev->dev.parent),
7969                                 dev_name(dev->dev.parent),
7970                                 netdev_name(dev), netdev_reg_state(dev),
7971                                 vaf);
7972         } else if (dev) {
7973                 printk("%s%s%s: %pV",
7974                        level, netdev_name(dev), netdev_reg_state(dev), vaf);
7975         } else {
7976                 printk("%s(NULL net_device): %pV", level, vaf);
7977         }
7978 }
7979
7980 void netdev_printk(const char *level, const struct net_device *dev,
7981                    const char *format, ...)
7982 {
7983         struct va_format vaf;
7984         va_list args;
7985
7986         va_start(args, format);
7987
7988         vaf.fmt = format;
7989         vaf.va = &args;
7990
7991         __netdev_printk(level, dev, &vaf);
7992
7993         va_end(args);
7994 }
7995 EXPORT_SYMBOL(netdev_printk);
7996
7997 #define define_netdev_printk_level(func, level)                 \
7998 void func(const struct net_device *dev, const char *fmt, ...)   \
7999 {                                                               \
8000         struct va_format vaf;                                   \
8001         va_list args;                                           \
8002                                                                 \
8003         va_start(args, fmt);                                    \
8004                                                                 \
8005         vaf.fmt = fmt;                                          \
8006         vaf.va = &args;                                         \
8007                                                                 \
8008         __netdev_printk(level, dev, &vaf);                      \
8009                                                                 \
8010         va_end(args);                                           \
8011 }                                                               \
8012 EXPORT_SYMBOL(func);
8013
8014 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
8015 define_netdev_printk_level(netdev_alert, KERN_ALERT);
8016 define_netdev_printk_level(netdev_crit, KERN_CRIT);
8017 define_netdev_printk_level(netdev_err, KERN_ERR);
8018 define_netdev_printk_level(netdev_warn, KERN_WARNING);
8019 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
8020 define_netdev_printk_level(netdev_info, KERN_INFO);
8021
8022 static void __net_exit netdev_exit(struct net *net)
8023 {
8024         kfree(net->dev_name_head);
8025         kfree(net->dev_index_head);
8026 }
8027
8028 static struct pernet_operations __net_initdata netdev_net_ops = {
8029         .init = netdev_init,
8030         .exit = netdev_exit,
8031 };
8032
8033 static void __net_exit default_device_exit(struct net *net)
8034 {
8035         struct net_device *dev, *aux;
8036         /*
8037          * Push all migratable network devices back to the
8038          * initial network namespace
8039          */
8040         rtnl_lock();
8041         for_each_netdev_safe(net, dev, aux) {
8042                 int err;
8043                 char fb_name[IFNAMSIZ];
8044
8045                 /* Ignore unmoveable devices (i.e. loopback) */
8046                 if (dev->features & NETIF_F_NETNS_LOCAL)
8047                         continue;
8048
8049                 /* Leave virtual devices for the generic cleanup */
8050                 if (dev->rtnl_link_ops)
8051                         continue;
8052
8053                 /* Push remaining network devices to init_net */
8054                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
8055                 err = dev_change_net_namespace(dev, &init_net, fb_name);
8056                 if (err) {
8057                         pr_emerg("%s: failed to move %s to init_net: %d\n",
8058                                  __func__, dev->name, err);
8059                         BUG();
8060                 }
8061         }
8062         rtnl_unlock();
8063 }
8064
8065 static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
8066 {
8067         /* Return with the rtnl_lock held when there are no network
8068          * devices unregistering in any network namespace in net_list.
8069          */
8070         struct net *net;
8071         bool unregistering;
8072         DEFINE_WAIT_FUNC(wait, woken_wake_function);
8073
8074         add_wait_queue(&netdev_unregistering_wq, &wait);
8075         for (;;) {
8076                 unregistering = false;
8077                 rtnl_lock();
8078                 list_for_each_entry(net, net_list, exit_list) {
8079                         if (net->dev_unreg_count > 0) {
8080                                 unregistering = true;
8081                                 break;
8082                         }
8083                 }
8084                 if (!unregistering)
8085                         break;
8086                 __rtnl_unlock();
8087
8088                 wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
8089         }
8090         remove_wait_queue(&netdev_unregistering_wq, &wait);
8091 }
8092
8093 static void __net_exit default_device_exit_batch(struct list_head *net_list)
8094 {
8095         /* At exit all network devices most be removed from a network
8096          * namespace.  Do this in the reverse order of registration.
8097          * Do this across as many network namespaces as possible to
8098          * improve batching efficiency.
8099          */
8100         struct net_device *dev;
8101         struct net *net;
8102         LIST_HEAD(dev_kill_list);
8103
8104         /* To prevent network device cleanup code from dereferencing
8105          * loopback devices or network devices that have been freed
8106          * wait here for all pending unregistrations to complete,
8107          * before unregistring the loopback device and allowing the
8108          * network namespace be freed.
8109          *
8110          * The netdev todo list containing all network devices
8111          * unregistrations that happen in default_device_exit_batch
8112          * will run in the rtnl_unlock() at the end of
8113          * default_device_exit_batch.
8114          */
8115         rtnl_lock_unregistering(net_list);
8116         list_for_each_entry(net, net_list, exit_list) {
8117                 for_each_netdev_reverse(net, dev) {
8118                         if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
8119                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
8120                         else
8121                                 unregister_netdevice_queue(dev, &dev_kill_list);
8122                 }
8123         }
8124         unregister_netdevice_many(&dev_kill_list);
8125         rtnl_unlock();
8126 }
8127
8128 static struct pernet_operations __net_initdata default_device_ops = {
8129         .exit = default_device_exit,
8130         .exit_batch = default_device_exit_batch,
8131 };
8132
8133 /*
8134  *      Initialize the DEV module. At boot time this walks the device list and
8135  *      unhooks any devices that fail to initialise (normally hardware not
8136  *      present) and leaves us with a valid list of present and active devices.
8137  *
8138  */
8139
8140 /*
8141  *       This is called single threaded during boot, so no need
8142  *       to take the rtnl semaphore.
8143  */
8144 static int __init net_dev_init(void)
8145 {
8146         int i, rc = -ENOMEM;
8147
8148         BUG_ON(!dev_boot_phase);
8149
8150         if (dev_proc_init())
8151                 goto out;
8152
8153         if (netdev_kobject_init())
8154                 goto out;
8155
8156         INIT_LIST_HEAD(&ptype_all);
8157         for (i = 0; i < PTYPE_HASH_SIZE; i++)
8158                 INIT_LIST_HEAD(&ptype_base[i]);
8159
8160         INIT_LIST_HEAD(&offload_base);
8161
8162         if (register_pernet_subsys(&netdev_net_ops))
8163                 goto out;
8164
8165         /*
8166          *      Initialise the packet receive queues.
8167          */
8168
8169         for_each_possible_cpu(i) {
8170                 struct work_struct *flush = per_cpu_ptr(&flush_works, i);
8171                 struct softnet_data *sd = &per_cpu(softnet_data, i);
8172
8173                 INIT_WORK(flush, flush_backlog);
8174
8175                 skb_queue_head_init(&sd->input_pkt_queue);
8176                 skb_queue_head_init(&sd->process_queue);
8177                 INIT_LIST_HEAD(&sd->poll_list);
8178                 sd->output_queue_tailp = &sd->output_queue;
8179 #ifdef CONFIG_RPS
8180                 sd->csd.func = rps_trigger_softirq;
8181                 sd->csd.info = sd;
8182                 sd->cpu = i;
8183 #endif
8184
8185                 sd->backlog.poll = process_backlog;
8186                 sd->backlog.weight = weight_p;
8187         }
8188
8189         dev_boot_phase = 0;
8190
8191         /* The loopback device is special if any other network devices
8192          * is present in a network namespace the loopback device must
8193          * be present. Since we now dynamically allocate and free the
8194          * loopback device ensure this invariant is maintained by
8195          * keeping the loopback device as the first device on the
8196          * list of network devices.  Ensuring the loopback devices
8197          * is the first device that appears and the last network device
8198          * that disappears.
8199          */
8200         if (register_pernet_device(&loopback_net_ops))
8201                 goto out;
8202
8203         if (register_pernet_device(&default_device_ops))
8204                 goto out;
8205
8206         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
8207         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
8208
8209         hotcpu_notifier(dev_cpu_callback, 0);
8210         dst_subsys_init();
8211         rc = 0;
8212 out:
8213         return rc;
8214 }
8215
8216 subsys_initcall(net_dev_init);