Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm
diff --git a/net/core/dev.c b/net/core/dev.c
index 1baab07..8c6c084 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -216,18 +216,38 @@ static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
        return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 }
 
-static inline void rps_lock(struct softnet_data *sd)
+static inline void rps_lock_irqsave(struct softnet_data *sd,
+                                   unsigned long *flags)
 {
-#ifdef CONFIG_RPS
-       spin_lock(&sd->input_pkt_queue.lock);
-#endif
+       if (IS_ENABLED(CONFIG_RPS))
+               spin_lock_irqsave(&sd->input_pkt_queue.lock, *flags);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_save(*flags);
 }
 
-static inline void rps_unlock(struct softnet_data *sd)
+static inline void rps_lock_irq_disable(struct softnet_data *sd)
 {
-#ifdef CONFIG_RPS
-       spin_unlock(&sd->input_pkt_queue.lock);
-#endif
+       if (IS_ENABLED(CONFIG_RPS))
+               spin_lock_irq(&sd->input_pkt_queue.lock);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_disable();
+}
+
+static inline void rps_unlock_irq_restore(struct softnet_data *sd,
+                                         unsigned long *flags)
+{
+       if (IS_ENABLED(CONFIG_RPS))
+               spin_unlock_irqrestore(&sd->input_pkt_queue.lock, *flags);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_restore(*flags);
+}
+
+static inline void rps_unlock_irq_enable(struct softnet_data *sd)
+{
+       if (IS_ENABLED(CONFIG_RPS))
+               spin_unlock_irq(&sd->input_pkt_queue.lock);
+       else if (!IS_ENABLED(CONFIG_PREEMPT_RT))
+               local_irq_enable();
 }
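
For orientation, a minimal sketch of how callers are expected to pair these
helpers, mirroring the enqueue_to_backlog() conversion later in this patch
(the function below is illustrative, not part of the change):

    /* Illustrative only. With CONFIG_RPS this takes the queue spinlock
     * with IRQs off; without RPS on !PREEMPT_RT it only disables IRQs;
     * on PREEMPT_RT without RPS neither branch runs, since the queue is
     * touched only from BH context, which RT serializes per CPU.
     */
    static void example_backlog_enqueue(struct softnet_data *sd,
                                        struct sk_buff *skb)
    {
            unsigned long flags;

            rps_lock_irqsave(sd, &flags);
            __skb_queue_tail(&sd->input_pkt_queue, skb);
            rps_unlock_irq_restore(sd, &flags);
    }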
 
 static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
@@ -320,7 +340,6 @@ int netdev_name_node_alt_create(struct net_device *dev, const char *name)
 
        return 0;
 }
-EXPORT_SYMBOL(netdev_name_node_alt_create);
 
 static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
 {
@@ -348,7 +367,6 @@ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
 
        return 0;
 }
-EXPORT_SYMBOL(netdev_name_node_alt_destroy);
 
 static void netdev_name_node_alt_flush(struct net_device *dev)
 {
@@ -1037,7 +1055,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
                                /*  avoid cases where sscanf is not exact inverse of printf */
                                snprintf(buf, IFNAMSIZ, name, i);
                                if (!strncmp(buf, name_node->name, IFNAMSIZ))
-                                       set_bit(i, inuse);
+                                       __set_bit(i, inuse);
                        }
                        if (!sscanf(d->name, name, &i))
                                continue;
@@ -1047,7 +1065,7 @@ static int __dev_alloc_name(struct net *net, const char *name, char *buf)
                        /*  avoid cases where sscanf is not exact inverse of printf */
                        snprintf(buf, IFNAMSIZ, name, i);
                        if (!strncmp(buf, d->name, IFNAMSIZ))
-                               set_bit(i, inuse);
+                               __set_bit(i, inuse);
                }
 
                i = find_first_zero_bit(inuse, max_netdevices);
@@ -1602,7 +1620,8 @@ const char *netdev_cmd_to_name(enum netdev_cmd cmd)
        N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
        N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
        N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
-       N(PRE_CHANGEADDR)
+       N(PRE_CHANGEADDR) N(OFFLOAD_XSTATS_ENABLE) N(OFFLOAD_XSTATS_DISABLE)
+       N(OFFLOAD_XSTATS_REPORT_USED) N(OFFLOAD_XSTATS_REPORT_DELTA)
        }
 #undef N
        return "UNKNOWN_NETDEV_EVENT";
@@ -1919,6 +1938,32 @@ static int call_netdevice_notifiers_info(unsigned long val,
        return raw_notifier_call_chain(&netdev_chain, val, info);
 }
 
+/**
+ *	call_netdevice_notifiers_info_robust - call per-netns notifier blocks
+ *	                                       and roll back on error
+ *     @val_up: value passed unmodified to notifier function
+ *     @val_down: value passed unmodified to the notifier function when
+ *                recovering from an error on @val_up
+ *     @info: notifier information data
+ *
+ *     Call all per-netns network notifier blocks, but not notifier blocks on
+ *     the global notifier chain. Parameters and return value are as for
+ *     raw_notifier_call_chain_robust().
+ */
+
+static int
+call_netdevice_notifiers_info_robust(unsigned long val_up,
+                                    unsigned long val_down,
+                                    struct netdev_notifier_info *info)
+{
+       struct net *net = dev_net(info->dev);
+
+       ASSERT_RTNL();
+
+       return raw_notifier_call_chain_robust(&net->netdev_chain,
+                                             val_up, val_down, info);
+}
+
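
The "robust" semantics come from raw_notifier_call_chain_robust(): if any
callback rejects @val_up, the callbacks that already accepted it are
re-invoked with @val_down, so the caller only has to undo its own local
state. That is exactly the pattern netdev_offload_xstats_enable_l3() uses
later in this patch; a condensed sketch of the caller side:

    rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
                                              NETDEV_OFFLOAD_XSTATS_DISABLE,
                                              &info.info);
    err = notifier_to_errno(rc);
    if (err)
            goto free_stats; /* chain already rolled back; free local state */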
 static int call_netdevice_notifiers_extack(unsigned long val,
                                           struct net_device *dev,
                                           struct netlink_ext_ack *extack)
@@ -2000,7 +2045,8 @@ void net_dec_egress_queue(void)
 EXPORT_SYMBOL_GPL(net_dec_egress_queue);
 #endif
 
-static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
+EXPORT_SYMBOL(netstamp_needed_key);
 #ifdef CONFIG_JUMP_LABEL
 static atomic_t netstamp_needed_deferred;
 static atomic_t netstamp_wanted;
@@ -2061,14 +2107,15 @@ EXPORT_SYMBOL(net_disable_timestamp);
 static inline void net_timestamp_set(struct sk_buff *skb)
 {
        skb->tstamp = 0;
+       skb->mono_delivery_time = 0;
        if (static_branch_unlikely(&netstamp_needed_key))
-               __net_timestamp(skb);
+               skb->tstamp = ktime_get_real();
 }
 
 #define net_timestamp_check(COND, SKB)                         \
        if (static_branch_unlikely(&netstamp_needed_key)) {     \
                if ((COND) && !(SKB)->tstamp)                   \
-                       __net_timestamp(SKB);                   \
+                       (SKB)->tstamp = ktime_get_real();       \
        }                                                       \
 
 bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
@@ -2943,13 +2990,25 @@ EXPORT_SYMBOL(netif_set_real_num_queues);
 /**
  * netif_get_num_default_rss_queues - default number of RSS queues
  *
- * This routine should set an upper limit on the number of RSS queues
- * used by default by multiqueue devices.
+ * Default is the number of physical cores if there are only 1 or 2, or
+ * half the number of physical cores if there are more.
  */
 int netif_get_num_default_rss_queues(void)
 {
-       return is_kdump_kernel() ?
-               1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
+       cpumask_var_t cpus;
+       int cpu, count = 0;
+
+       if (unlikely(is_kdump_kernel() || !zalloc_cpumask_var(&cpus, GFP_KERNEL)))
+               return 1;
+
+       cpumask_copy(cpus, cpu_online_mask);
+       for_each_cpu(cpu, cpus) {
+               ++count;
+               cpumask_andnot(cpus, cpus, topology_sibling_cpumask(cpu));
+       }
+       free_cpumask_var(cpus);
+
+       return count > 2 ? DIV_ROUND_UP(count, 2) : count;
 }
 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
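
A worked example under an assumed topology: 16 online CPUs that are 8
physical cores with two SMT siblings each. The loop counts one CPU per
sibling group, so count = 8; since 8 > 2 the function returns
DIV_ROUND_UP(8, 2) = 4 default RSS queues. A hypothetical driver-side use
at probe time (hw_max_queues is an assumed device limit, not part of this
patch):

    unsigned int nq = min_t(unsigned int, hw_max_queues,
                            netif_get_num_default_rss_queues());

    err = netif_set_real_num_rx_queues(dev, nq);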
 
@@ -3586,7 +3645,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
 out_kfree_skb:
        kfree_skb(skb);
 out_null:
-       atomic_long_inc(&dev->tx_dropped);
+       dev_core_stats_tx_dropped_inc(dev);
        return NULL;
 }
 
@@ -3710,7 +3769,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
 
 no_lock_out:
                if (unlikely(to_free))
-                       kfree_skb_list(to_free);
+                       kfree_skb_list_reason(to_free,
+                                             SKB_DROP_REASON_QDISC_DROP);
                return rc;
        }
 
@@ -3765,7 +3825,7 @@ no_lock_out:
        }
        spin_unlock(root_lock);
        if (unlikely(to_free))
-               kfree_skb_list(to_free);
+               kfree_skb_list_reason(to_free, SKB_DROP_REASON_QDISC_DROP);
        if (unlikely(contended))
                spin_unlock(&q->busylock);
        return rc;
@@ -3811,7 +3871,7 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
                skb->ip_summed = CHECKSUM_UNNECESSARY;
        WARN_ON(!skb_dst(skb));
        skb_dst_force(skb);
-       netif_rx_ni(skb);
+       netif_rx(skb);
        return 0;
 }
 EXPORT_SYMBOL(dev_loopback_xmit);
@@ -3840,7 +3900,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
        case TC_ACT_SHOT:
                mini_qdisc_qstats_cpu_drop(miniq);
                *ret = NET_XMIT_DROP;
-               kfree_skb(skb);
+               kfree_skb_reason(skb, SKB_DROP_REASON_TC_EGRESS);
                return NULL;
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
@@ -4136,7 +4196,7 @@ recursion_alert:
        rc = -ENETDOWN;
        rcu_read_unlock_bh();
 
-       atomic_long_inc(&dev->tx_dropped);
+       dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return rc;
 out:
@@ -4188,7 +4248,7 @@ int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
        local_bh_enable();
        return ret;
 drop:
-       atomic_long_inc(&dev->tx_dropped);
+       dev_core_stats_tx_dropped_inc(dev);
        kfree_skb_list(skb);
        return NET_XMIT_DROP;
 }
@@ -4217,6 +4277,8 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 {
        struct task_struct *thread;
 
+       lockdep_assert_irqs_disabled();
+
        if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
                /* Paired with smp_mb__before_atomic() in
                 * napi_enable()/dev_set_threaded().
@@ -4456,11 +4518,11 @@ static void rps_trigger_softirq(void *data)
  * If yes, queue it to our IPI list and return 1
  * If no, schedule the backlog NAPI directly and return 0
  */
-static int rps_ipi_queued(struct softnet_data *sd)
+static int napi_schedule_rps(struct softnet_data *sd)
 {
-#ifdef CONFIG_RPS
        struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
 
+#ifdef CONFIG_RPS
        if (sd != mysd) {
                sd->rps_ipi_next = mysd->rps_ipi_list;
                mysd->rps_ipi_list = sd;
@@ -4469,6 +4531,7 @@ static int rps_ipi_queued(struct softnet_data *sd)
                return 1;
        }
 #endif /* CONFIG_RPS */
+       __napi_schedule_irqoff(&mysd->backlog);
        return 0;
 }
 
@@ -4519,15 +4582,15 @@ static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
                              unsigned int *qtail)
 {
+       enum skb_drop_reason reason;
        struct softnet_data *sd;
        unsigned long flags;
        unsigned int qlen;
 
+       reason = SKB_DROP_REASON_NOT_SPECIFIED;
        sd = &per_cpu(softnet_data, cpu);
 
-       local_irq_save(flags);
-
-       rps_lock(sd);
+       rps_lock_irqsave(sd, &flags);
        if (!netif_running(skb->dev))
                goto drop;
        qlen = skb_queue_len(&sd->input_pkt_queue);
@@ -4536,29 +4599,25 @@ static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
 enqueue:
                        __skb_queue_tail(&sd->input_pkt_queue, skb);
                        input_queue_tail_incr_save(sd, qtail);
-                       rps_unlock(sd);
-                       local_irq_restore(flags);
+                       rps_unlock_irq_restore(sd, &flags);
                        return NET_RX_SUCCESS;
                }
 
                /* Schedule NAPI for backlog device
                 * We can use a non-atomic operation since we own the queue lock
                 */
-               if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
-                       if (!rps_ipi_queued(sd))
-                               ____napi_schedule(sd, &sd->backlog);
-               }
+               if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state))
+                       napi_schedule_rps(sd);
                goto enqueue;
        }
+       reason = SKB_DROP_REASON_CPU_BACKLOG;
 
 drop:
        sd->dropped++;
-       rps_unlock(sd);
-
-       local_irq_restore(flags);
+       rps_unlock_irq_restore(sd, &flags);
 
-       atomic_long_inc(&skb->dev->rx_dropped);
-       kfree_skb(skb);
+       dev_core_stats_rx_dropped_inc(skb->dev);
+       kfree_skb_reason(skb, reason);
        return NET_RX_DROP;
 }
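
Note the drop reason now also reaches observability tooling:
kfree_skb_reason() feeds the reason through the existing skb:kfree_skb
tracepoint. A hedged sketch of a consumer, with the probe signature assumed
from this series:

    /* Module-side tracepoint probe; counts backlog-overflow drops. */
    static void probe_kfree_skb(void *ignore, struct sk_buff *skb,
                                void *location,
                                enum skb_drop_reason reason)
    {
            if (reason == SKB_DROP_REASON_CPU_BACKLOG)
                    pr_debug("backlog drop on %s\n",
                             skb->dev ? skb->dev->name : "<none>");
    }

    /* attached via register_trace_kfree_skb(probe_kfree_skb, NULL) */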
 
@@ -4778,7 +4837,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
        }
        return XDP_PASS;
 out_redir:
-       kfree_skb(skb);
+       kfree_skb_reason(skb, SKB_DROP_REASON_XDP);
        return XDP_DROP;
 }
 EXPORT_SYMBOL_GPL(do_xdp_generic);
@@ -4796,7 +4855,6 @@ static int netif_rx_internal(struct sk_buff *skb)
                struct rps_dev_flow voidflow, *rflow = &voidflow;
                int cpu;
 
-               preempt_disable();
                rcu_read_lock();
 
                cpu = get_rps_cpu(skb->dev, skb, &rflow);
@@ -4806,78 +4864,72 @@ static int netif_rx_internal(struct sk_buff *skb)
                ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
 
                rcu_read_unlock();
-               preempt_enable();
        } else
 #endif
        {
                unsigned int qtail;
 
-               ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
-               put_cpu();
+               ret = enqueue_to_backlog(skb, smp_processor_id(), &qtail);
        }
        return ret;
 }
 
+/**
+ *     __netif_rx      -       Slightly optimized version of netif_rx
+ *     @skb: buffer to post
+ *
+ *	This behaves like netif_rx() except that it does not disable bottom
+ *	halves. As a result this function may only be invoked from interrupt
+ *	context (either hard or soft interrupt).
+ */
+int __netif_rx(struct sk_buff *skb)
+{
+       int ret;
+
+       lockdep_assert_once(hardirq_count() | softirq_count());
+
+       trace_netif_rx_entry(skb);
+       ret = netif_rx_internal(skb);
+       trace_netif_rx_exit(ret);
+       return ret;
+}
+EXPORT_SYMBOL(__netif_rx);
+
 /**
  *     netif_rx        -       post buffer to the network code
  *     @skb: buffer to post
  *
  *     This function receives a packet from a device driver and queues it for
- *     the upper (protocol) levels to process.  It always succeeds. The buffer
- *     may be dropped during processing for congestion control or by the
- *     protocol layers.
+ *	the upper (protocol) levels to process via the backlog NAPI device. It
+ *	always succeeds. The buffer may be dropped during processing for
+ *	congestion control or by the protocol layers.
+ *	The buffer is queued via the backlog NAPI device; modern NIC drivers
+ *	should use NAPI and GRO instead.
+ *	This function can be used from both interrupt and process context. A
+ *	caller from process context must not disable interrupts before invoking
+ *	this function.
  *
  *     return values:
  *     NET_RX_SUCCESS  (no congestion)
  *     NET_RX_DROP     (packet was dropped)
  *
  */
-
 int netif_rx(struct sk_buff *skb)
 {
+       bool need_bh_off = !(hardirq_count() | softirq_count());
        int ret;
 
+       if (need_bh_off)
+               local_bh_disable();
        trace_netif_rx_entry(skb);
-
        ret = netif_rx_internal(skb);
        trace_netif_rx_exit(ret);
-
+       if (need_bh_off)
+               local_bh_enable();
        return ret;
 }
 EXPORT_SYMBOL(netif_rx);
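
With netif_rx_ni() and netif_rx_any_context() gone (below), the driver-side
rule becomes: call __netif_rx() only where bottom halves are already
excluded, and plain netif_rx() everywhere else. A hedged sketch; the ISR and
the skb-producing helpers are hypothetical:

    /* Hard-IRQ path: BHs are implicitly excluded, so the cheaper
     * __netif_rx() is safe here.
     */
    static irqreturn_t example_isr(int irq, void *data)
    {
            struct sk_buff *skb = example_build_skb(data); /* assumed */

            __netif_rx(skb);
            return IRQ_HANDLED;
    }

    /* Process context (e.g. a workqueue handler): netif_rx() now
     * disables BHs around the enqueue itself, no special variant needed.
     */
    static void example_work_fn(struct work_struct *work)
    {
            struct sk_buff *skb = example_dequeue_skb(work); /* assumed */

            netif_rx(skb);
    }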
 
-int netif_rx_ni(struct sk_buff *skb)
-{
-       int err;
-
-       trace_netif_rx_ni_entry(skb);
-
-       preempt_disable();
-       err = netif_rx_internal(skb);
-       if (local_softirq_pending())
-               do_softirq();
-       preempt_enable();
-       trace_netif_rx_ni_exit(err);
-
-       return err;
-}
-EXPORT_SYMBOL(netif_rx_ni);
-
-int netif_rx_any_context(struct sk_buff *skb)
-{
-       /*
-        * If invoked from contexts which do not invoke bottom half
-        * processing either at return from interrupt or when softrqs are
-        * reenabled, use netif_rx_ni() which invokes bottomhalf processing
-        * directly.
-        */
-       if (in_interrupt())
-               return netif_rx(skb);
-       else
-               return netif_rx_ni(skb);
-}
-EXPORT_SYMBOL(netif_rx_any_context);
-
 static __latent_entropy void net_tx_action(struct softirq_action *h)
 {
        struct softnet_data *sd = this_cpu_ptr(&softnet_data);
@@ -5001,7 +5053,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
                break;
        case TC_ACT_SHOT:
                mini_qdisc_qstats_cpu_drop(miniq);
-               kfree_skb(skb);
+               kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS);
                return NULL;
        case TC_ACT_STOLEN:
        case TC_ACT_QUEUED:
@@ -5318,11 +5370,13 @@ check_vlan_id:
                *ppt_prev = pt_prev;
        } else {
 drop:
-               if (!deliver_exact)
-                       atomic_long_inc(&skb->dev->rx_dropped);
-               else
-                       atomic_long_inc(&skb->dev->rx_nohandler);
-               kfree_skb(skb);
+               if (!deliver_exact) {
+                       dev_core_stats_rx_dropped_inc(skb->dev);
+                       kfree_skb_reason(skb, SKB_DROP_REASON_PTYPE_ABSENT);
+               } else {
+                       dev_core_stats_rx_nohandler_inc(skb->dev);
+                       kfree_skb(skb);
+               }
                /* Jamal, now you will not be able to escape explaining
                 * to me how you were going to use this. :-)
                 */
@@ -5650,8 +5704,7 @@ static void flush_backlog(struct work_struct *work)
        local_bh_disable();
        sd = this_cpu_ptr(&softnet_data);
 
-       local_irq_disable();
-       rps_lock(sd);
+       rps_lock_irq_disable(sd);
        skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
                        __skb_unlink(skb, &sd->input_pkt_queue);
@@ -5659,8 +5712,7 @@ static void flush_backlog(struct work_struct *work)
                        input_queue_head_incr(sd);
                }
        }
-       rps_unlock(sd);
-       local_irq_enable();
+       rps_unlock_irq_enable(sd);
 
        skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
                if (skb->dev->reg_state == NETREG_UNREGISTERING) {
@@ -5678,16 +5730,14 @@ static bool flush_required(int cpu)
        struct softnet_data *sd = &per_cpu(softnet_data, cpu);
        bool do_flush;
 
-       local_irq_disable();
-       rps_lock(sd);
+       rps_lock_irq_disable(sd);
 
        /* as insertion into process_queue happens with the rps lock held,
         * process_queue access may race only with dequeue
         */
        do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
                   !skb_queue_empty_lockless(&sd->process_queue);
-       rps_unlock(sd);
-       local_irq_enable();
+       rps_unlock_irq_enable(sd);
 
        return do_flush;
 #endif
@@ -5802,8 +5852,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
 
                }
 
-               local_irq_disable();
-               rps_lock(sd);
+               rps_lock_irq_disable(sd);
                if (skb_queue_empty(&sd->input_pkt_queue)) {
                        /*
                         * Inline a custom version of __napi_complete().
@@ -5819,8 +5868,7 @@ static int process_backlog(struct napi_struct *napi, int quota)
                        skb_queue_splice_tail_init(&sd->input_pkt_queue,
                                                   &sd->process_queue);
                }
-               rps_unlock(sd);
-               local_irq_enable();
+               rps_unlock_irq_enable(sd);
        }
 
        return work;
@@ -7145,6 +7193,16 @@ static int __netdev_update_upper_level(struct net_device *dev,
        return 0;
 }
 
+#ifdef CONFIG_LOCKDEP
+static LIST_HEAD(net_unlink_list);
+
+static void net_unlink_todo(struct net_device *dev)
+{
+       if (list_empty(&dev->unlink_list))
+               list_add_tail(&dev->unlink_list, &net_unlink_list);
+}
+#endif
+
 static int __netdev_update_lower_level(struct net_device *dev,
                                       struct netdev_nested_priv *priv)
 {
@@ -7727,6 +7785,242 @@ void netdev_bonding_info_change(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_bonding_info_change);
 
+static int netdev_offload_xstats_enable_l3(struct net_device *dev,
+                                          struct netlink_ext_ack *extack)
+{
+       struct netdev_notifier_offload_xstats_info info = {
+               .info.dev = dev,
+               .info.extack = extack,
+               .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+       };
+       int err;
+       int rc;
+
+       dev->offload_xstats_l3 = kzalloc(sizeof(*dev->offload_xstats_l3),
+                                        GFP_KERNEL);
+       if (!dev->offload_xstats_l3)
+               return -ENOMEM;
+
+       rc = call_netdevice_notifiers_info_robust(NETDEV_OFFLOAD_XSTATS_ENABLE,
+                                                 NETDEV_OFFLOAD_XSTATS_DISABLE,
+                                                 &info.info);
+       err = notifier_to_errno(rc);
+       if (err)
+               goto free_stats;
+
+       return 0;
+
+free_stats:
+       kfree(dev->offload_xstats_l3);
+       dev->offload_xstats_l3 = NULL;
+       return err;
+}
+
+int netdev_offload_xstats_enable(struct net_device *dev,
+                                enum netdev_offload_xstats_type type,
+                                struct netlink_ext_ack *extack)
+{
+       ASSERT_RTNL();
+
+       if (netdev_offload_xstats_enabled(dev, type))
+               return -EALREADY;
+
+       switch (type) {
+       case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+               return netdev_offload_xstats_enable_l3(dev, extack);
+       }
+
+       WARN_ON(1);
+       return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enable);
+
+static void netdev_offload_xstats_disable_l3(struct net_device *dev)
+{
+       struct netdev_notifier_offload_xstats_info info = {
+               .info.dev = dev,
+               .type = NETDEV_OFFLOAD_XSTATS_TYPE_L3,
+       };
+
+       call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_DISABLE,
+                                     &info.info);
+       kfree(dev->offload_xstats_l3);
+       dev->offload_xstats_l3 = NULL;
+}
+
+int netdev_offload_xstats_disable(struct net_device *dev,
+                                 enum netdev_offload_xstats_type type)
+{
+       ASSERT_RTNL();
+
+       if (!netdev_offload_xstats_enabled(dev, type))
+               return -EALREADY;
+
+       switch (type) {
+       case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+               netdev_offload_xstats_disable_l3(dev);
+               return 0;
+       }
+
+       WARN_ON(1);
+       return -EINVAL;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_disable);
+
+static void netdev_offload_xstats_disable_all(struct net_device *dev)
+{
+       netdev_offload_xstats_disable(dev, NETDEV_OFFLOAD_XSTATS_TYPE_L3);
+}
+
+static struct rtnl_hw_stats64 *
+netdev_offload_xstats_get_ptr(const struct net_device *dev,
+                             enum netdev_offload_xstats_type type)
+{
+       switch (type) {
+       case NETDEV_OFFLOAD_XSTATS_TYPE_L3:
+               return dev->offload_xstats_l3;
+       }
+
+       WARN_ON(1);
+       return NULL;
+}
+
+bool netdev_offload_xstats_enabled(const struct net_device *dev,
+                                  enum netdev_offload_xstats_type type)
+{
+       ASSERT_RTNL();
+
+       return netdev_offload_xstats_get_ptr(dev, type);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_enabled);
+
+struct netdev_notifier_offload_xstats_ru {
+       bool used;
+};
+
+struct netdev_notifier_offload_xstats_rd {
+       struct rtnl_hw_stats64 stats;
+       bool used;
+};
+
+static void netdev_hw_stats64_add(struct rtnl_hw_stats64 *dest,
+                                 const struct rtnl_hw_stats64 *src)
+{
+       dest->rx_packets          += src->rx_packets;
+       dest->tx_packets          += src->tx_packets;
+       dest->rx_bytes            += src->rx_bytes;
+       dest->tx_bytes            += src->tx_bytes;
+       dest->rx_errors           += src->rx_errors;
+       dest->tx_errors           += src->tx_errors;
+       dest->rx_dropped          += src->rx_dropped;
+       dest->tx_dropped          += src->tx_dropped;
+       dest->multicast           += src->multicast;
+}
+
+static int netdev_offload_xstats_get_used(struct net_device *dev,
+                                         enum netdev_offload_xstats_type type,
+                                         bool *p_used,
+                                         struct netlink_ext_ack *extack)
+{
+       struct netdev_notifier_offload_xstats_ru report_used = {};
+       struct netdev_notifier_offload_xstats_info info = {
+               .info.dev = dev,
+               .info.extack = extack,
+               .type = type,
+               .report_used = &report_used,
+       };
+       int rc;
+
+       WARN_ON(!netdev_offload_xstats_enabled(dev, type));
+       rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_USED,
+                                          &info.info);
+       *p_used = report_used.used;
+       return notifier_to_errno(rc);
+}
+
+static int netdev_offload_xstats_get_stats(struct net_device *dev,
+                                          enum netdev_offload_xstats_type type,
+                                          struct rtnl_hw_stats64 *p_stats,
+                                          bool *p_used,
+                                          struct netlink_ext_ack *extack)
+{
+       struct netdev_notifier_offload_xstats_rd report_delta = {};
+       struct netdev_notifier_offload_xstats_info info = {
+               .info.dev = dev,
+               .info.extack = extack,
+               .type = type,
+               .report_delta = &report_delta,
+       };
+       struct rtnl_hw_stats64 *stats;
+       int rc;
+
+       stats = netdev_offload_xstats_get_ptr(dev, type);
+       if (WARN_ON(!stats))
+               return -EINVAL;
+
+       rc = call_netdevice_notifiers_info(NETDEV_OFFLOAD_XSTATS_REPORT_DELTA,
+                                          &info.info);
+
+       /* Cache whatever we got, even if there was an error, otherwise the
+        * successful stats retrievals would get lost.
+        */
+       netdev_hw_stats64_add(stats, &report_delta.stats);
+
+       if (p_stats)
+               *p_stats = *stats;
+       *p_used = report_delta.used;
+
+       return notifier_to_errno(rc);
+}
+
+int netdev_offload_xstats_get(struct net_device *dev,
+                             enum netdev_offload_xstats_type type,
+                             struct rtnl_hw_stats64 *p_stats, bool *p_used,
+                             struct netlink_ext_ack *extack)
+{
+       ASSERT_RTNL();
+
+       if (p_stats)
+               return netdev_offload_xstats_get_stats(dev, type, p_stats,
+                                                      p_used, extack);
+       else
+               return netdev_offload_xstats_get_used(dev, type, p_used,
+                                                     extack);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_get);
+
+void
+netdev_offload_xstats_report_delta(struct netdev_notifier_offload_xstats_rd *report_delta,
+                                  const struct rtnl_hw_stats64 *stats)
+{
+       report_delta->used = true;
+       netdev_hw_stats64_add(&report_delta->stats, stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_delta);
+
+void
+netdev_offload_xstats_report_used(struct netdev_notifier_offload_xstats_ru *report_used)
+{
+       report_used->used = true;
+}
+EXPORT_SYMBOL(netdev_offload_xstats_report_used);
+
+void netdev_offload_xstats_push_delta(struct net_device *dev,
+                                     enum netdev_offload_xstats_type type,
+                                     const struct rtnl_hw_stats64 *p_stats)
+{
+       struct rtnl_hw_stats64 *stats;
+
+       ASSERT_RTNL();
+
+       stats = netdev_offload_xstats_get_ptr(dev, type);
+       if (WARN_ON(!stats))
+               return;
+
+       netdev_hw_stats64_add(stats, p_stats);
+}
+EXPORT_SYMBOL(netdev_offload_xstats_push_delta);
+
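
On the consumer side, a driver implementing L3 hardware stats would react to
the new notifier events roughly as sketched below. This is a hedged outline:
example_hw_collect_l3() is a hypothetical helper that reads and clears the
device's hardware counters, and the direct cast relies on 'info' being the
first member of the wrapper struct:

    static int example_netdev_event(struct notifier_block *nb,
                                    unsigned long event, void *ptr)
    {
            struct netdev_notifier_offload_xstats_info *info = ptr;

            if (info->type != NETDEV_OFFLOAD_XSTATS_TYPE_L3)
                    return NOTIFY_DONE;

            switch (event) {
            case NETDEV_OFFLOAD_XSTATS_ENABLE:
                    /* start counting; NOTIFY_BAD here makes the robust
                     * chain roll everyone back with ..._DISABLE
                     */
                    return NOTIFY_OK;
            case NETDEV_OFFLOAD_XSTATS_REPORT_USED:
                    netdev_offload_xstats_report_used(info->report_used);
                    return NOTIFY_OK;
            case NETDEV_OFFLOAD_XSTATS_REPORT_DELTA: {
                    struct rtnl_hw_stats64 stats = {};

                    example_hw_collect_l3(info->info.dev, &stats); /* assumed */
                    netdev_offload_xstats_report_delta(info->report_delta,
                                                       &stats);
                    return NOTIFY_OK;
            }
            }
            return NOTIFY_DONE;
    }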
 /**
  * netdev_get_xmit_slave - Get the xmit slave of master device
  * @dev: device
@@ -9143,7 +9437,7 @@ DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
 static void net_set_todo(struct net_device *dev)
 {
        list_add_tail(&dev->todo_list, &net_todo_list);
-       dev_net(dev)->dev_unreg_count++;
+       atomic_inc(&dev_net(dev)->dev_unreg_count);
 }
 
 static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
@@ -9683,8 +9977,10 @@ int register_netdevice(struct net_device *dev)
        linkwatch_init_dev(dev);
 
        dev_init_scheduler(dev);
-       dev_hold(dev);
+
+       dev_hold_track(dev, &dev->dev_registered_tracker, GFP_KERNEL);
        list_netdevice(dev);
+
        add_device_randomness(dev->dev_addr, dev->addr_len);
 
        /* If the device has permanent device address, driver should
@@ -9813,8 +10109,8 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
 #define WAIT_REFS_MIN_MSECS 1
 #define WAIT_REFS_MAX_MSECS 250
 /**
- * netdev_wait_allrefs - wait until all references are gone.
- * @dev: target net_device
+ * netdev_wait_allrefs_any - wait until all refs of any listed device are gone.
+ * @list: list of net_devices to wait on
  *
  * This is called when unregistering network devices.
  *
@@ -9824,37 +10120,42 @@ int netdev_unregister_timeout_secs __read_mostly = 10;
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static struct net_device *netdev_wait_allrefs_any(struct list_head *list)
 {
        unsigned long rebroadcast_time, warning_time;
-       int wait = 0, refcnt;
-
-       linkwatch_forget_dev(dev);
+       struct net_device *dev;
+       int wait = 0;
 
        rebroadcast_time = warning_time = jiffies;
-       refcnt = netdev_refcnt_read(dev);
 
-       while (refcnt != 1) {
+       list_for_each_entry(dev, list, todo_list)
+               if (netdev_refcnt_read(dev) == 1)
+                       return dev;
+
+       while (true) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();
 
                        /* Rebroadcast unregister notification */
-                       call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
+                       list_for_each_entry(dev, list, todo_list)
+                               call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
 
                        __rtnl_unlock();
                        rcu_barrier();
                        rtnl_lock();
 
-                       if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
-                                    &dev->state)) {
-                               /* We must not have linkwatch events
-                                * pending on unregister. If this
-                                * happens, we simply run the queue
-                                * unscheduled, resulting in a noop
-                                * for this device.
-                                */
-                               linkwatch_run_queue();
-                       }
+                       list_for_each_entry(dev, list, todo_list)
+                               if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
+                                            &dev->state)) {
+                                       /* We must not have linkwatch events
+                                        * pending on unregister. If this
+                                        * happens, we simply run the queue
+                                        * unscheduled, resulting in a noop
+                                        * for this device.
+                                        */
+                                       linkwatch_run_queue();
+                                       break;
+                               }
 
                        __rtnl_unlock();
 
@@ -9869,14 +10170,18 @@ static void netdev_wait_allrefs(struct net_device *dev)
                        wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
                }
 
-               refcnt = netdev_refcnt_read(dev);
+               list_for_each_entry(dev, list, todo_list)
+                       if (netdev_refcnt_read(dev) == 1)
+                               return dev;
 
-               if (refcnt != 1 &&
-                   time_after(jiffies, warning_time +
+               if (time_after(jiffies, warning_time +
                               netdev_unregister_timeout_secs * HZ)) {
-                       pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
-                                dev->name, refcnt);
-                       ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+                       list_for_each_entry(dev, list, todo_list) {
+                               pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
+                                        dev->name, netdev_refcnt_read(dev));
+                               ref_tracker_dir_print(&dev->refcnt_tracker, 10);
+                       }
+
                        warning_time = jiffies;
                }
        }
@@ -9908,6 +10213,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
  */
 void netdev_run_todo(void)
 {
+       struct net_device *dev, *tmp;
        struct list_head list;
 #ifdef CONFIG_LOCKDEP
        struct list_head unlink_list;
@@ -9928,26 +10234,24 @@ void netdev_run_todo(void)
 
        __rtnl_unlock();
 
-
        /* Wait for rcu callbacks to finish before next phase */
        if (!list_empty(&list))
                rcu_barrier();
 
-       while (!list_empty(&list)) {
-               struct net_device *dev
-                       = list_first_entry(&list, struct net_device, todo_list);
-               list_del(&dev->todo_list);
-
+       list_for_each_entry_safe(dev, tmp, &list, todo_list) {
                if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
-                       pr_err("network todo '%s' but state %d\n",
-                              dev->name, dev->reg_state);
-                       dump_stack();
+                       netdev_WARN(dev, "run_todo but not unregistering\n");
+                       list_del(&dev->todo_list);
                        continue;
                }
 
                dev->reg_state = NETREG_UNREGISTERED;
+               linkwatch_forget_dev(dev);
+       }
 
-               netdev_wait_allrefs(dev);
+       while (!list_empty(&list)) {
+               dev = netdev_wait_allrefs_any(&list);
+               list_del(&dev->todo_list);
 
                /* paranoia */
                BUG_ON(netdev_refcnt_read(dev) != 1);
@@ -9963,11 +10267,8 @@ void netdev_run_todo(void)
                if (dev->needs_free_netdev)
                        free_netdev(dev);
 
-               /* Report a network device has been unregistered */
-               rtnl_lock();
-               dev_net(dev)->dev_unreg_count--;
-               __rtnl_unlock();
-               wake_up(&netdev_unregistering_wq);
+               if (atomic_dec_and_test(&dev_net(dev)->dev_unreg_count))
+                       wake_up(&netdev_unregistering_wq);
 
                /* Free network device */
                kobject_put(&dev->dev.kobj);
@@ -10003,6 +10304,25 @@ void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
 }
 EXPORT_SYMBOL(netdev_stats_to_stats64);
 
+struct net_device_core_stats *netdev_core_stats_alloc(struct net_device *dev)
+{
+       struct net_device_core_stats __percpu *p;
+
+       p = alloc_percpu_gfp(struct net_device_core_stats,
+                            GFP_ATOMIC | __GFP_NOWARN);
+
+       if (p && cmpxchg(&dev->core_stats, NULL, p))
+               free_percpu(p);
+
+       /* This READ_ONCE() pairs with the cmpxchg() above */
+       p = READ_ONCE(dev->core_stats);
+       if (!p)
+               return NULL;
+
+       return this_cpu_ptr(p);
+}
+EXPORT_SYMBOL(netdev_core_stats_alloc);
+
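
The per-field increment helpers that consume this allocator live in
netdevice.h; a hedged reconstruction of their expected shape (the exact
upstream form, likely macro-generated, may differ):

    /* Fast path reads the pointer once; slow path lazily allocates. */
    static inline void dev_core_stats_rx_dropped_inc(struct net_device *dev)
    {
            struct net_device_core_stats __percpu *stats;
            struct net_device_core_stats *p;

            /* Pairs with the cmpxchg() in netdev_core_stats_alloc() */
            stats = READ_ONCE(dev->core_stats);
            p = stats ? this_cpu_ptr(stats) : netdev_core_stats_alloc(dev);
            if (p)
                    local_inc(&p->rx_dropped);
    }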
 /**
  *     dev_get_stats   - get network device statistics
  *     @dev: device to get statistics from
@@ -10017,6 +10337,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
                                        struct rtnl_link_stats64 *storage)
 {
        const struct net_device_ops *ops = dev->netdev_ops;
+       const struct net_device_core_stats __percpu *p;
 
        if (ops->ndo_get_stats64) {
                memset(storage, 0, sizeof(*storage));
@@ -10026,9 +10347,20 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
        } else {
                netdev_stats_to_stats64(storage, &dev->stats);
        }
-       storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
-       storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
-       storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
+
+       /* This READ_ONCE() pairs with the write in netdev_core_stats_alloc() */
+       p = READ_ONCE(dev->core_stats);
+       if (p) {
+               const struct net_device_core_stats *core_stats;
+               int i;
+
+               for_each_possible_cpu(i) {
+                       core_stats = per_cpu_ptr(p, i);
+                       storage->rx_dropped += local_read(&core_stats->rx_dropped);
+                       storage->tx_dropped += local_read(&core_stats->tx_dropped);
+                       storage->rx_nohandler += local_read(&core_stats->rx_nohandler);
+               }
+       }
        return storage;
 }
 EXPORT_SYMBOL(dev_get_stats);
@@ -10172,7 +10504,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
-       dev_hold(dev);
+       __dev_hold(dev);
 #else
        refcount_set(&dev->dev_refcnt, 1);
 #endif
@@ -10290,6 +10622,8 @@ void free_netdev(struct net_device *dev)
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
 #endif
+       free_percpu(dev->core_stats);
+       dev->core_stats = NULL;
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;
 
@@ -10409,6 +10743,8 @@ void unregister_netdevice_many(struct list_head *head)
 
                dev_xdp_uninstall(dev);
 
+               netdev_offload_xstats_disable_all(dev);
+
                /* Notify protocols, that we are about to destroy
                 * this device. They should clean all the things.
                 */
@@ -10449,7 +10785,7 @@ void unregister_netdevice_many(struct list_head *head)
        synchronize_net();
 
        list_for_each_entry(dev, head, unreg_list) {
-               dev_put(dev);
+               dev_put_track(dev, &dev->dev_registered_tracker);
                net_set_todo(dev);
        }
 
@@ -10674,11 +11010,11 @@ static int dev_cpu_dead(unsigned int oldcpu)
 
        /* Process offline CPU's input_pkt_queue */
        while ((skb = __skb_dequeue(&oldsd->process_queue))) {
-               netif_rx_ni(skb);
+               netif_rx(skb);
                input_queue_head_incr(oldsd);
        }
        while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
-               netif_rx_ni(skb);
+               netif_rx(skb);
                input_queue_head_incr(oldsd);
        }
 
@@ -10732,8 +11068,7 @@ static int __net_init netdev_init(struct net *net)
        BUILD_BUG_ON(GRO_HASH_BUCKETS >
                     8 * sizeof_field(struct napi_struct, gro_bitmask));
 
-       if (net != &init_net)
-               INIT_LIST_HEAD(&net->dev_base_head);
+       INIT_LIST_HEAD(&net->dev_base_head);
 
        net->dev_name_head = netdev_create_hash();
        if (net->dev_name_head == NULL)
@@ -10849,14 +11184,14 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
        .exit = netdev_exit,
 };
 
-static void __net_exit default_device_exit(struct net *net)
+static void __net_exit default_device_exit_net(struct net *net)
 {
        struct net_device *dev, *aux;
        /*
         * Push all migratable network devices back to the
         * initial network namespace
         */
-       rtnl_lock();
+       ASSERT_RTNL();
        for_each_netdev_safe(net, dev, aux) {
                int err;
                char fb_name[IFNAMSIZ];
@@ -10880,35 +11215,6 @@ static void __net_exit default_device_exit(struct net *net)
                        BUG();
                }
        }
-       rtnl_unlock();
-}
-
-static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
-{
-       /* Return with the rtnl_lock held when there are no network
-        * devices unregistering in any network namespace in net_list.
-        */
-       struct net *net;
-       bool unregistering;
-       DEFINE_WAIT_FUNC(wait, woken_wake_function);
-
-       add_wait_queue(&netdev_unregistering_wq, &wait);
-       for (;;) {
-               unregistering = false;
-               rtnl_lock();
-               list_for_each_entry(net, net_list, exit_list) {
-                       if (net->dev_unreg_count > 0) {
-                               unregistering = true;
-                               break;
-                       }
-               }
-               if (!unregistering)
-                       break;
-               __rtnl_unlock();
-
-               wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
-       }
-       remove_wait_queue(&netdev_unregistering_wq, &wait);
 }
 
 static void __net_exit default_device_exit_batch(struct list_head *net_list)
@@ -10922,18 +11228,12 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
        struct net *net;
        LIST_HEAD(dev_kill_list);
 
-       /* To prevent network device cleanup code from dereferencing
-        * loopback devices or network devices that have been freed
-        * wait here for all pending unregistrations to complete,
-        * before unregistring the loopback device and allowing the
-        * network namespace be freed.
-        *
-        * The netdev todo list containing all network devices
-        * unregistrations that happen in default_device_exit_batch
-        * will run in the rtnl_unlock() at the end of
-        * default_device_exit_batch.
-        */
-       rtnl_lock_unregistering(net_list);
+       rtnl_lock();
+       list_for_each_entry(net, net_list, exit_list) {
+               default_device_exit_net(net);
+               cond_resched();
+       }
+
        list_for_each_entry(net, net_list, exit_list) {
                for_each_netdev_reverse(net, dev) {
                        if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
@@ -10947,7 +11247,6 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list)
 }
 
 static struct pernet_operations __net_initdata default_device_ops = {
-       .exit = default_device_exit,
        .exit_batch = default_device_exit_batch,
 };