hv_netvsc: Add support for XDP_REDIRECT
author: Haiyang Zhang <haiyangz@microsoft.com>
Thu, 7 Apr 2022 20:21:34 +0000 (13:21 -0700)
committer: Jakub Kicinski <kuba@kernel.org>
Tue, 12 Apr 2022 01:25:47 +0000 (18:25 -0700)
Handle XDP_REDIRECT action in netvsc driver.
Also, transparently pass ndo_xdp_xmit to VF when available.

Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://lore.kernel.org/r/1649362894-20077-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/net/hyperv/hyperv_net.h
drivers/net/hyperv/netvsc.c
drivers/net/hyperv/netvsc_bpf.c
drivers/net/hyperv/netvsc_drv.c

index cf69da0..25b38a3 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/list.h>
 #include <linux/hyperv.h>
 #include <linux/rndis.h>
+#include <linux/jhash.h>
 
 /* RSS related */
 #define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203  /* query only */
@@ -237,6 +238,7 @@ int netvsc_recv_callback(struct net_device *net,
 void netvsc_channel_cb(void *context);
 int netvsc_poll(struct napi_struct *napi, int budget);
 
+void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev);
 u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
                   struct xdp_buff *xdp);
 unsigned int netvsc_xdp_fraglen(unsigned int len);
@@ -246,6 +248,8 @@ int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog,
                   struct netvsc_device *nvdev);
 int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog);
 int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf);
+int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
+                      struct xdp_frame **frames, u32 flags);
 
 int rndis_set_subchannel(struct net_device *ndev,
                         struct netvsc_device *nvdev,
@@ -942,12 +946,21 @@ struct nvsc_rsc {
 #define NVSC_RSC_CSUM_INFO     BIT(1)  /* valid/present bit for 'csum_info' */
 #define NVSC_RSC_HASH_INFO     BIT(2)  /* valid/present bit for 'hash_info' */
 
-struct netvsc_stats {
+struct netvsc_stats_tx {       /* transmit counters kept per netvsc_channel */
+       u64 packets;
+       u64 bytes;
+       u64 xdp_xmit;   /* frames netvsc_ndoxdp_xmit() turned into skbs and queued */
+       struct u64_stats_sync syncp;    /* brackets counter updates and reads */
+};
+
+struct netvsc_stats_rx {       /* receive counters kept per netvsc_channel */
        u64 packets;
        u64 bytes;
        u64 broadcast;
        u64 multicast;
        u64 xdp_drop;
+       u64 xdp_redirect;       /* packets for which xdp_do_redirect() succeeded */
+       u64 xdp_tx;     /* received packets whose XDP verdict was XDP_TX */
        struct u64_stats_sync syncp;
 };
 
@@ -1046,6 +1059,55 @@ struct net_device_context {
        struct netvsc_device_info *saved_netvsc_dev_info;
 };
 
+/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
+ * packets. We can use ethtool to change UDP hash level when necessary.
+ *
+ * Pick the hash for @skb according to the L4 hash settings in @ndc->l4_hash:
+ *  - TCP or UDP over IPv4/IPv6 whose HV_*_L4HASH bit is set in l4_hash:
+ *    return skb_get_hash(skb);
+ *  - otherwise, for IPv4 or IPv6 packets: jhash2() over the flow's address
+ *    words only (2 for IPv4, 8 for IPv6), recorded on the skb with
+ *    __skb_set_sw_hash(skb, hash, false) and returned.
+ *
+ * Returns 0 if the flow cannot be dissected, or if the second case is
+ * reached with a packet that is neither IPv4 nor IPv6.
+ */
+static inline u32 netvsc_get_hash(struct sk_buff *skb,
+                                 const struct net_device_context *ndc)
+{
+       struct flow_keys flow;
+       u32 hash, pkt_proto = 0;        /* pkt_proto stays 0 unless TCP/UDP on v4/v6 */
+       static u32 hashrnd __read_mostly;       /* seed passed to jhash2() below */
+
+       net_get_random_once(&hashrnd, sizeof(hashrnd));
+
+       if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
+               return 0;
+
+       switch (flow.basic.ip_proto) {
+       case IPPROTO_TCP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_TCP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_TCP6_L4HASH;
+
+               break;
+
+       case IPPROTO_UDP:
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       pkt_proto = HV_UDP4_L4HASH;
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       pkt_proto = HV_UDP6_L4HASH;
+
+               break;
+       }
+
+       if (pkt_proto & ndc->l4_hash) { /* L4 hashing is enabled for this protocol */
+               return skb_get_hash(skb);
+       } else {        /* address-only hash */
+               if (flow.basic.n_proto == htons(ETH_P_IP))
+                       hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
+               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
+                       hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
+               else
+                       return 0;
+
+               __skb_set_sw_hash(skb, hash, false);
+       }
+
+       return hash;    /* only reached from the address-only path, hash is set */
+}
+
 /* Per channel data */
 struct netvsc_channel {
        struct vmbus_channel *channel;
@@ -1060,9 +1122,10 @@ struct netvsc_channel {
 
        struct bpf_prog __rcu *bpf_prog;
        struct xdp_rxq_info xdp_rxq;
+       bool xdp_flush;
 
-       struct netvsc_stats tx_stats;
-       struct netvsc_stats rx_stats;
+       struct netvsc_stats_tx tx_stats;
+       struct netvsc_stats_rx rx_stats;
 };
 
 /* Per netvsc device */
index 4061af5..6e42cb0 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/vmalloc.h>
 #include <linux/rtnetlink.h>
 #include <linux/prefetch.h>
+#include <linux/filter.h>
 
 #include <asm/sync_bitops.h>
 #include <asm/mshyperv.h>
@@ -805,7 +806,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
                struct hv_netvsc_packet *packet
                        = (struct hv_netvsc_packet *)skb->cb;
                u32 send_index = packet->send_buf_index;
-               struct netvsc_stats *tx_stats;
+               struct netvsc_stats_tx *tx_stats;
 
                if (send_index != NETVSC_INVALID_INDEX)
                        netvsc_free_send_slot(net_device, send_index);
@@ -1670,12 +1671,17 @@ int netvsc_poll(struct napi_struct *napi, int budget)
        if (!nvchan->desc)
                nvchan->desc = hv_pkt_iter_first(channel);
 
+       nvchan->xdp_flush = false;
+
        while (nvchan->desc && work_done < budget) {
                work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
                                                    ndev, nvchan->desc, budget);
                nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
        }
 
+       if (nvchan->xdp_flush)
+               xdp_do_flush();
+
        /* Send any pending receive completions */
        ret = send_recv_completions(ndev, net_device, nvchan);
 
index 232c4a0..4a95226 100644 (file)
@@ -10,6 +10,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/ethtool.h>
+#include <linux/netpoll.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
 #include <linux/kernel.h>
 u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
                   struct xdp_buff *xdp)
 {
+       struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
        void *data = nvchan->rsc.data[0];
        u32 len = nvchan->rsc.len[0];
        struct page *page = NULL;
        struct bpf_prog *prog;
        u32 act = XDP_PASS;
+       bool drop = true;
 
        xdp->data_hard_start = NULL;
 
@@ -60,9 +63,34 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
        switch (act) {
        case XDP_PASS:
        case XDP_TX:
+               drop = false;
+               break;
+
        case XDP_DROP:
                break;
 
+       case XDP_REDIRECT:
+               if (!xdp_do_redirect(ndev, xdp, prog)) {
+                       nvchan->xdp_flush = true;
+                       drop = false;
+
+                       u64_stats_update_begin(&rx_stats->syncp);
+
+                       rx_stats->xdp_redirect++;
+                       rx_stats->packets++;
+                       rx_stats->bytes += nvchan->rsc.pktlen;
+
+                       u64_stats_update_end(&rx_stats->syncp);
+
+                       break;
+               } else {
+                       u64_stats_update_begin(&rx_stats->syncp);
+                       rx_stats->xdp_drop++;
+                       u64_stats_update_end(&rx_stats->syncp);
+               }
+
+               fallthrough;
+
        case XDP_ABORTED:
                trace_xdp_exception(ndev, prog, act);
                break;
@@ -74,7 +102,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
 out:
        rcu_read_unlock();
 
-       if (page && act != XDP_PASS && act != XDP_TX) {
+       if (page && drop) {
                __free_page(page);
                xdp->data_hard_start = NULL;
        }
@@ -197,3 +225,68 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf)
                return -EINVAL;
        }
 }
+
+static int netvsc_ndoxdp_xmit_fm(struct net_device *ndev,
+                                struct xdp_frame *frame, u16 q_idx)
+{      /* Turn one XDP frame into an skb and pass it to netvsc_xdp_xmit()
+        * with q_idx recorded on the skb. Returns 0 once the skb has been
+        * handed over, or -ENOMEM if no skb could be built from the frame.
+        */
+       struct sk_buff *skb;
+
+       skb = xdp_build_skb_from_frame(frame, ndev);
+       if (unlikely(!skb))
+               return -ENOMEM;
+
+       netvsc_get_hash(skb, netdev_priv(ndev));        /* return value unused; presumably called for the hash it records on the skb */
+
+       skb_record_rx_queue(skb, q_idx);        /* netvsc_xdp_xmit() must only be called after this */
+
+       netvsc_xdp_xmit(skb, ndev);     /* returns void, so its outcome is not reported */
+
+       return 0;
+}
+
+int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
+                      struct xdp_frame **frames, u32 flags)
+{      /* .ndo_xdp_xmit handler: send up to n XDP frames, through the VF's
+        * own ndo_xdp_xmit when the VF data path is usable, otherwise through
+        * the netvsc channel. Returns the VF handler's result in the first
+        * case, else the number of frames converted to skbs and queued
+        * (0 if the netvsc device is gone).
+        */
+       struct net_device_context *ndev_ctx = netdev_priv(ndev);
+       const struct net_device_ops *vf_ops;
+       struct netvsc_stats_tx *tx_stats;
+       struct netvsc_device *nvsc_dev;
+       struct net_device *vf_netdev;
+       int i, count = 0;
+       u16 q_idx;
+
+       /* Don't transmit if netvsc_device is gone */
+       nvsc_dev = rcu_dereference_bh(ndev_ctx->nvdev);
+       if (unlikely(!nvsc_dev || nvsc_dev->destroy))
+               return 0;
+
+       /* If VF is present and up then redirect packets to it.
+        * Skip the VF if it is marked down or has no carrier.
+        * If netpoll is in use, then the VF cannot be used either.
+        */
+       vf_netdev = rcu_dereference_bh(ndev_ctx->vf_netdev);
+       if (vf_netdev && netif_running(vf_netdev) &&
+           netif_carrier_ok(vf_netdev) && !netpoll_tx_running(ndev) &&
+           vf_netdev->netdev_ops->ndo_xdp_xmit &&
+           ndev_ctx->data_path_is_vf) {
+               vf_ops = vf_netdev->netdev_ops;
+               return vf_ops->ndo_xdp_xmit(vf_netdev, n, frames, flags);
+       }
+
+       q_idx = smp_processor_id() % ndev->real_num_tx_queues;  /* TX queue chosen from the current CPU */
+
+       for (i = 0; i < n; i++) {
+               if (netvsc_ndoxdp_xmit_fm(ndev, frames[i], q_idx))
+                       break;  /* stop at the first failure; later frames are not attempted */
+
+               count++;
+       }
+
+       tx_stats = &nvsc_dev->chan_table[q_idx].tx_stats;
+
+       u64_stats_update_begin(&tx_stats->syncp);
+       tx_stats->xdp_xmit += count;
+       u64_stats_update_end(&tx_stats->syncp);
+
+       return count;
+}
index fde1c49..27f6bbc 100644 (file)
@@ -242,56 +242,6 @@ static inline void *init_ppi_data(struct rndis_message *msg,
        return ppi + 1;
 }
 
-/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
- * packets. We can use ethtool to change UDP hash level when necessary.
- */
-static inline u32 netvsc_get_hash(
-       struct sk_buff *skb,
-       const struct net_device_context *ndc)
-{
-       struct flow_keys flow;
-       u32 hash, pkt_proto = 0;
-       static u32 hashrnd __read_mostly;
-
-       net_get_random_once(&hashrnd, sizeof(hashrnd));
-
-       if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
-               return 0;
-
-       switch (flow.basic.ip_proto) {
-       case IPPROTO_TCP:
-               if (flow.basic.n_proto == htons(ETH_P_IP))
-                       pkt_proto = HV_TCP4_L4HASH;
-               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
-                       pkt_proto = HV_TCP6_L4HASH;
-
-               break;
-
-       case IPPROTO_UDP:
-               if (flow.basic.n_proto == htons(ETH_P_IP))
-                       pkt_proto = HV_UDP4_L4HASH;
-               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
-                       pkt_proto = HV_UDP6_L4HASH;
-
-               break;
-       }
-
-       if (pkt_proto & ndc->l4_hash) {
-               return skb_get_hash(skb);
-       } else {
-               if (flow.basic.n_proto == htons(ETH_P_IP))
-                       hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
-               else if (flow.basic.n_proto == htons(ETH_P_IPV6))
-                       hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
-               else
-                       return 0;
-
-               __skb_set_sw_hash(skb, hash, false);
-       }
-
-       return hash;
-}
-
 static inline int netvsc_get_tx_queue(struct net_device *ndev,
                                      struct sk_buff *skb, int old_idx)
 {
@@ -804,7 +754,7 @@ void netvsc_linkstatus_callback(struct net_device *net,
 }
 
 /* This function should only be called after skb_record_rx_queue() */
-static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
+void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
 {
        int rc;
 
@@ -925,7 +875,7 @@ int netvsc_recv_callback(struct net_device *net,
        struct vmbus_channel *channel = nvchan->channel;
        u16 q_idx = channel->offermsg.offer.sub_channel_index;
        struct sk_buff *skb;
-       struct netvsc_stats *rx_stats = &nvchan->rx_stats;
+       struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
        struct xdp_buff xdp;
        u32 act;
 
@@ -934,6 +884,9 @@ int netvsc_recv_callback(struct net_device *net,
 
        act = netvsc_run_xdp(net, nvchan, &xdp);
 
+       if (act == XDP_REDIRECT)
+               return NVSP_STAT_SUCCESS;
+
        if (act != XDP_PASS && act != XDP_TX) {
                u64_stats_update_begin(&rx_stats->syncp);
                rx_stats->xdp_drop++;
@@ -958,6 +911,9 @@ int netvsc_recv_callback(struct net_device *net,
         * statistics will not work correctly.
         */
        u64_stats_update_begin(&rx_stats->syncp);
+       if (act == XDP_TX)
+               rx_stats->xdp_tx++;
+
        rx_stats->packets++;
        rx_stats->bytes += nvchan->rsc.pktlen;
 
@@ -1353,28 +1309,29 @@ static void netvsc_get_pcpu_stats(struct net_device *net,
        /* fetch percpu stats of netvsc */
        for (i = 0; i < nvdev->num_chn; i++) {
                const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
-               const struct netvsc_stats *stats;
+               const struct netvsc_stats_tx *tx_stats;
+               const struct netvsc_stats_rx *rx_stats;
                struct netvsc_ethtool_pcpu_stats *this_tot =
                        &pcpu_tot[nvchan->channel->target_cpu];
                u64 packets, bytes;
                unsigned int start;
 
-               stats = &nvchan->tx_stats;
+               tx_stats = &nvchan->tx_stats;
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+                       packets = tx_stats->packets;
+                       bytes = tx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
 
                this_tot->tx_bytes      += bytes;
                this_tot->tx_packets    += packets;
 
-               stats = &nvchan->rx_stats;
+               rx_stats = &nvchan->rx_stats;
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+                       packets = rx_stats->packets;
+                       bytes = rx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
 
                this_tot->rx_bytes      += bytes;
                this_tot->rx_packets    += packets;
@@ -1406,27 +1363,28 @@ static void netvsc_get_stats64(struct net_device *net,
 
        for (i = 0; i < nvdev->num_chn; i++) {
                const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
-               const struct netvsc_stats *stats;
+               const struct netvsc_stats_tx *tx_stats;
+               const struct netvsc_stats_rx *rx_stats;
                u64 packets, bytes, multicast;
                unsigned int start;
 
-               stats = &nvchan->tx_stats;
+               tx_stats = &nvchan->tx_stats;
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+                       packets = tx_stats->packets;
+                       bytes = tx_stats->bytes;
+               } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
 
                t->tx_bytes     += bytes;
                t->tx_packets   += packets;
 
-               stats = &nvchan->rx_stats;
+               rx_stats = &nvchan->rx_stats;
                do {
-                       start = u64_stats_fetch_begin_irq(&stats->syncp);
-                       packets = stats->packets;
-                       bytes = stats->bytes;
-                       multicast = stats->multicast + stats->broadcast;
-               } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+                       packets = rx_stats->packets;
+                       bytes = rx_stats->bytes;
+                       multicast = rx_stats->multicast + rx_stats->broadcast;
+               } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
 
                t->rx_bytes     += bytes;
                t->rx_packets   += packets;
@@ -1515,8 +1473,8 @@ static const struct {
 /* statistics per queue (rx/tx packets/bytes) */
 #define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
 
-/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
-#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
+/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */
+#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8)
 
 static int netvsc_get_sset_count(struct net_device *dev, int string_set)
 {
@@ -1543,12 +1501,16 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
        struct net_device_context *ndc = netdev_priv(dev);
        struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
        const void *nds = &ndc->eth_stats;
-       const struct netvsc_stats *qstats;
+       const struct netvsc_stats_tx *tx_stats;
+       const struct netvsc_stats_rx *rx_stats;
        struct netvsc_vf_pcpu_stats sum;
        struct netvsc_ethtool_pcpu_stats *pcpu_sum;
        unsigned int start;
        u64 packets, bytes;
        u64 xdp_drop;
+       u64 xdp_redirect;
+       u64 xdp_tx;
+       u64 xdp_xmit;
        int i, j, cpu;
 
        if (!nvdev)
@@ -1562,26 +1524,32 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
                data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
 
        for (j = 0; j < nvdev->num_chn; j++) {
-               qstats = &nvdev->chan_table[j].tx_stats;
+               tx_stats = &nvdev->chan_table[j].tx_stats;
 
                do {
-                       start = u64_stats_fetch_begin_irq(&qstats->syncp);
-                       packets = qstats->packets;
-                       bytes = qstats->bytes;
-               } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
+                       packets = tx_stats->packets;
+                       bytes = tx_stats->bytes;
+                       xdp_xmit = tx_stats->xdp_xmit;
+               } while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
                data[i++] = packets;
                data[i++] = bytes;
+               data[i++] = xdp_xmit;
 
-               qstats = &nvdev->chan_table[j].rx_stats;
+               rx_stats = &nvdev->chan_table[j].rx_stats;
                do {
-                       start = u64_stats_fetch_begin_irq(&qstats->syncp);
-                       packets = qstats->packets;
-                       bytes = qstats->bytes;
-                       xdp_drop = qstats->xdp_drop;
-               } while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
+                       start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
+                       packets = rx_stats->packets;
+                       bytes = rx_stats->bytes;
+                       xdp_drop = rx_stats->xdp_drop;
+                       xdp_redirect = rx_stats->xdp_redirect;
+                       xdp_tx = rx_stats->xdp_tx;
+               } while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
                data[i++] = packets;
                data[i++] = bytes;
                data[i++] = xdp_drop;
+               data[i++] = xdp_redirect;
+               data[i++] = xdp_tx;
        }
 
        pcpu_sum = kvmalloc_array(num_possible_cpus(),
@@ -1622,9 +1590,12 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
                for (i = 0; i < nvdev->num_chn; i++) {
                        ethtool_sprintf(&p, "tx_queue_%u_packets", i);
                        ethtool_sprintf(&p, "tx_queue_%u_bytes", i);
+                       ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i);
                        ethtool_sprintf(&p, "rx_queue_%u_packets", i);
                        ethtool_sprintf(&p, "rx_queue_%u_bytes", i);
                        ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i);
+                       ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i);
+                       ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i);
                }
 
                for_each_present_cpu(cpu) {
@@ -2057,6 +2028,7 @@ static const struct net_device_ops device_ops = {
        .ndo_select_queue =             netvsc_select_queue,
        .ndo_get_stats64 =              netvsc_get_stats64,
        .ndo_bpf =                      netvsc_bpf,
+       .ndo_xdp_xmit =                 netvsc_ndoxdp_xmit,
 };
 
 /*