Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
author David S. Miller <davem@davemloft.net>
Thu, 25 Mar 2021 22:31:22 +0000 (15:31 -0700)
committer David S. Miller <davem@davemloft.net>
Thu, 25 Mar 2021 22:31:22 +0000 (15:31 -0700)
Signed-off-by: David S. Miller <davem@davemloft.net>
63 files changed:
MAINTAINERS
drivers/atm/fore200e.c
drivers/net/dsa/b53/b53_common.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/mt7530.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/ice/ice_txrx.c
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/igb/igb_ptp.c
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
drivers/net/ethernet/pensando/ionic/ionic_txrx.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ipa/ipa_cmd.c
drivers/net/ipa/ipa_qmi.c
drivers/net/phy/phylink.c
include/linux/bpf.h
include/linux/netdevice.h
include/linux/skbuff.h
include/net/netfilter/nf_tables.h
include/net/nexthop.h
include/uapi/linux/bpf.h
include/uapi/linux/psample.h
init/Kconfig
kernel/bpf/bpf_inode_storage.c
kernel/bpf/verifier.c
kernel/fork.c
net/core/dev.c
net/core/drop_monitor.c
net/core/filter.c
net/core/flow_dissector.c
net/ipv4/route.c
net/ipv6/route.c
net/mptcp/options.c
net/netfilter/nf_flow_table_core.c
net/netfilter/nf_tables_api.c
net/sched/cls_api.c
net/sched/cls_flower.c
net/tipc/node.c
tools/lib/bpf/Makefile
tools/lib/bpf/btf_dump.c
tools/lib/bpf/libbpf.c
tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
tools/testing/selftests/net/mptcp/mptcp_join.sh

diff --combined MAINTAINERS
@@@ -261,8 -261,8 +261,8 @@@ ABI/AP
  L:    linux-api@vger.kernel.org
  F:    include/linux/syscalls.h
  F:    kernel/sys_ni.c
F:    include/uapi/
F:    arch/*/include/uapi/
X:    include/uapi/
X:    arch/*/include/uapi/
  
  ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
  M:    Hans de Goede <hdegoede@redhat.com>
@@@ -1181,7 -1181,7 +1181,7 @@@ M:      Joel Fernandes <joel@joelfernandes.o
  M:    Christian Brauner <christian@brauner.io>
  M:    Hridya Valsaraju <hridya@google.com>
  M:    Suren Baghdasaryan <surenb@google.com>
- L:    devel@driverdev.osuosl.org
+ L:    linux-kernel@vger.kernel.org
  S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
  F:    drivers/android/
@@@ -1530,7 -1530,6 +1530,7 @@@ F:      Documentation/devicetree/bindings/dm
  F:    Documentation/devicetree/bindings/i2c/i2c-owl.yaml
  F:    Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.yaml
  F:    Documentation/devicetree/bindings/mmc/owl-mmc.yaml
 +F:    Documentation/devicetree/bindings/net/actions,owl-emac.yaml
  F:    Documentation/devicetree/bindings/pinctrl/actions,*
  F:    Documentation/devicetree/bindings/power/actions,owl-sps.txt
  F:    Documentation/devicetree/bindings/timer/actions,owl-timer.txt
@@@ -1543,7 -1542,6 +1543,7 @@@ F:      drivers/dma/owl-dma.
  F:    drivers/i2c/busses/i2c-owl.c
  F:    drivers/irqchip/irq-owl-sirq.c
  F:    drivers/mmc/host/owl-mmc.c
 +F:    drivers/net/ethernet/actions/
  F:    drivers/pinctrl/actions/*
  F:    drivers/soc/actions/
  F:    include/dt-bindings/power/owl-*
@@@ -3235,7 -3233,6 +3235,7 @@@ T:      git git://git.kernel.org/pub/scm/lin
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
  F:    Documentation/bpf/
  F:    Documentation/networking/filter.rst
 +F:    Documentation/userspace-api/ebpf/
  F:    arch/*/net/*
  F:    include/linux/bpf*
  F:    include/linux/filter.h
@@@ -3250,7 -3247,6 +3250,7 @@@ F:      net/core/filter.
  F:    net/sched/act_bpf.c
  F:    net/sched/cls_bpf.c
  F:    samples/bpf/
 +F:    scripts/bpf_doc.py
  F:    tools/bpf/
  F:    tools/lib/bpf/
  F:    tools/testing/selftests/bpf/
@@@ -5475,11 -5471,11 +5475,11 @@@ F:   drivers/net/ethernet/freescale/dpaa2
  F:    drivers/net/ethernet/freescale/dpaa2/dpni*
  
  DPAA2 ETHERNET SWITCH DRIVER
 -M:    Ioana Radulescu <ruxandra.radulescu@nxp.com>
  M:    Ioana Ciornei <ioana.ciornei@nxp.com>
 -L:    linux-kernel@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    drivers/staging/fsl-dpaa2/ethsw
 +F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-switch*
 +F:    drivers/net/ethernet/freescale/dpaa2/dpsw*
  
  DPT_I2O SCSI RAID DRIVER
  M:    Adaptec OEM Raid Solutions <aacraid@microsemi.com>
@@@ -5839,7 -5835,7 +5839,7 @@@ M:      David Airlie <airlied@linux.ie
  M:    Daniel Vetter <daniel@ffwll.ch>
  L:    dri-devel@lists.freedesktop.org
  S:    Maintained
- B:    https://bugs.freedesktop.org/
+ B:    https://gitlab.freedesktop.org/drm
  C:    irc://chat.freenode.net/dri-devel
  T:    git git://anongit.freedesktop.org/drm/drm
  F:    Documentation/devicetree/bindings/display/
@@@ -8120,7 -8116,6 +8120,6 @@@ F:      drivers/crypto/hisilicon/sec2/sec_ma
  
  HISILICON STAGING DRIVERS FOR HIKEY 960/970
  M:    Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
- L:    devel@driverdev.osuosl.org
  S:    Maintained
  F:    drivers/staging/hikey9xx/
  
@@@ -8525,6 -8520,7 +8524,7 @@@ IBM Power SRIOV Virtual NIC Device Driv
  M:    Dany Madden <drt@linux.ibm.com>
  M:    Lijun Pan <ljp@linux.ibm.com>
  M:    Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+ R:    Thomas Falcon <tlfalcon@linux.ibm.com>
  L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/ethernet/ibm/ibmvnic.*
@@@ -12542,7 -12538,7 +12542,7 @@@ NETWORKING [MPTCP
  M:    Mat Martineau <mathew.j.martineau@linux.intel.com>
  M:    Matthieu Baerts <matthieu.baerts@tessares.net>
  L:    netdev@vger.kernel.org
- L:    mptcp@lists.01.org
+ L:    mptcp@lists.linux.dev
  S:    Maintained
  W:    https://github.com/multipath-tcp/mptcp_net-next/wiki
  B:    https://github.com/multipath-tcp/mptcp_net-next/issues
@@@ -14713,15 -14709,11 +14713,11 @@@ F:        drivers/net/ethernet/qlogic/qlcnic
  QLOGIC QLGE 10Gb ETHERNET DRIVER
  M:    Manish Chopra <manishc@marvell.com>
  M:    GR-Linux-NIC-Dev@marvell.com
- L:    netdev@vger.kernel.org
- S:    Supported
- F:    drivers/staging/qlge/
- QLOGIC QLGE 10Gb ETHERNET DRIVER
  M:    Coiby Xu <coiby.xu@gmail.com>
  L:    netdev@vger.kernel.org
- S:    Maintained
+ S:    Supported
  F:    Documentation/networking/device_drivers/qlogic/qlge.rst
+ F:    drivers/staging/qlge/
  
  QM1D1B0004 MEDIA DRIVER
  M:    Akihiro Tsukada <tskd08@gmail.com>
@@@ -16891,8 -16883,10 +16887,10 @@@ F: tools/spi
  
  SPIDERNET NETWORK DRIVER for CELL
  M:    Ishizaki Kou <kou.ishizaki@toshiba.co.jp>
+ M:    Geoff Levand <geoff@infradead.org>
  L:    netdev@vger.kernel.org
- S:    Supported
+ L:    linuxppc-dev@lists.ozlabs.org
+ S:    Maintained
  F:    Documentation/networking/device_drivers/ethernet/toshiba/spider_net.rst
  F:    drivers/net/ethernet/toshiba/spider_net*
  
@@@ -17044,7 -17038,7 +17042,7 @@@ F:   drivers/staging/vt665?
  
  STAGING SUBSYSTEM
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L:    devel@driverdev.osuosl.org
+ L:    linux-staging@lists.linux.dev
  S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
  F:    drivers/staging/
@@@ -19139,7 -19133,7 +19137,7 @@@ VME SUBSYSTE
  M:    Martyn Welch <martyn@welchs.me.uk>
  M:    Manohar Vanga <manohar.vanga@gmail.com>
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L:    devel@driverdev.osuosl.org
+ L:    linux-kernel@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
  F:    Documentation/driver-api/vme.rst
@@@ -19170,7 -19164,7 +19168,7 @@@ S:   Maintaine
  F:    drivers/infiniband/hw/vmw_pvrdma/
  
  VMware PVSCSI driver
- M:    Jim Gill <jgill@vmware.com>
+ M:    Vishal Bhakta <vbhakta@vmware.com>
  M:    VMware PV-Drivers <pv-drivers@vmware.com>
  L:    linux-scsi@vger.kernel.org
  S:    Maintained
diff --combined drivers/atm/fore200e.c
@@@ -21,6 -21,7 +21,6 @@@
  #include <linux/module.h>
  #include <linux/atmdev.h>
  #include <linux/sonet.h>
 -#include <linux/atm_suni.h>
  #include <linux/dma-mapping.h>
  #include <linux/delay.h>
  #include <linux/firmware.h>
@@@ -99,8 -100,6 +99,6 @@@ static LIST_HEAD(fore200e_boards)
  
  MODULE_AUTHOR("Christophe Lizzi - credits to Uwe Dannowski and Heikki Vatiainen");
  MODULE_DESCRIPTION("FORE Systems 200E-series ATM driver - version " FORE200E_VERSION);
- MODULE_SUPPORTED_DEVICE("PCA-200E, SBA-200E");
  
  static const int fore200e_rx_buf_nbr[ BUFFER_SCHEME_NBR ][ BUFFER_MAGN_NBR ] = {
      { BUFFER_S1_NBR, BUFFER_L1_NBR },
diff --combined drivers/net/dsa/b53/b53_common.c
@@@ -349,7 -349,7 +349,7 @@@ static void b53_set_forwarding(struct b
        b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt);
  }
  
 -static void b53_enable_vlan(struct b53_device *dev, bool enable,
 +static void b53_enable_vlan(struct b53_device *dev, int port, bool enable,
                            bool enable_filtering)
  {
        u8 mgmt, vc0, vc1, vc4 = 0, vc5;
        b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
  
        dev->vlan_enabled = enable;
 +
 +      dev_dbg(dev->dev, "Port %d VLAN enabled: %d, filtering: %d\n",
 +              port, enable, enable_filtering);
  }
  
  static int b53_set_jumbo(struct b53_device *dev, bool enable, bool allow_10_100)
@@@ -746,7 -743,7 +746,7 @@@ int b53_configure_vlan(struct dsa_switc
                b53_do_vlan_op(dev, VTA_CMD_CLEAR);
        }
  
 -      b53_enable_vlan(dev, dev->vlan_enabled, ds->vlan_filtering);
 +      b53_enable_vlan(dev, -1, dev->vlan_enabled, ds->vlan_filtering);
  
        b53_for_each_port(dev, i)
                b53_write16(dev, B53_VLAN_PAGE,
@@@ -1108,13 -1105,6 +1108,6 @@@ static int b53_setup(struct dsa_switch 
                        b53_disable_port(ds, port);
        }
  
-       /* Let DSA handle the case were multiple bridges span the same switch
-        * device and different VLAN awareness settings are requested, which
-        * would be breaking filtering semantics for any of the other bridge
-        * devices. (not hardware supported)
-        */
-       ds->vlan_filtering_is_global = true;
        return b53_setup_devlink_resources(ds);
  }
  
@@@ -1432,7 -1422,7 +1425,7 @@@ int b53_vlan_filtering(struct dsa_switc
  {
        struct b53_device *dev = ds->priv;
  
 -      b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering);
 +      b53_enable_vlan(dev, port, dev->vlan_enabled, vlan_filtering);
  
        return 0;
  }
@@@ -1457,7 -1447,7 +1450,7 @@@ static int b53_vlan_prepare(struct dsa_
        if (vlan->vid >= dev->num_vlans)
                return -ERANGE;
  
 -      b53_enable_vlan(dev, true, ds->vlan_filtering);
 +      b53_enable_vlan(dev, port, true, ds->vlan_filtering);
  
        return 0;
  }
@@@ -2055,17 -2045,15 +2048,17 @@@ enum dsa_tag_protocol b53_get_tag_proto
  {
        struct b53_device *dev = ds->priv;
  
 -      /* Older models (5325, 5365) support a different tag format that we do
 -       * not support in net/dsa/tag_brcm.c yet.
 -       */
 -      if (is5325(dev) || is5365(dev) ||
 -          !b53_can_enable_brcm_tags(ds, port, mprot)) {
 +      if (!b53_can_enable_brcm_tags(ds, port, mprot)) {
                dev->tag_protocol = DSA_TAG_PROTO_NONE;
                goto out;
        }
  
 +      /* Older models require a different 6 byte tag */
 +      if (is5325(dev) || is5365(dev) || is63xx(dev)) {
 +              dev->tag_protocol = DSA_TAG_PROTO_BRCM_LEGACY;
 +              goto out;
 +      }
 +
        /* Broadcom BCM58xx chips have a flow accelerator on Port 8
         * which requires us to use the prepended Broadcom tag type
         */
@@@ -2669,6 -2657,13 +2662,13 @@@ struct b53_device *b53_switch_alloc(str
        ds->ops = &b53_switch_ops;
        ds->untag_bridge_pvid = true;
        dev->vlan_enabled = true;
+       /* Let DSA handle the case were multiple bridges span the same switch
+        * device and different VLAN awareness settings are requested, which
+        * would be breaking filtering semantics for any of the other bridge
+        * devices. (not hardware supported)
+        */
+       ds->vlan_filtering_is_global = true;
        mutex_init(&dev->reg_mutex);
        mutex_init(&dev->stats_mutex);
  
diff --combined drivers/net/dsa/bcm_sf2.c
  #include "b53/b53_priv.h"
  #include "b53/b53_regs.h"
  
 +static u16 bcm_sf2_reg_rgmii_cntrl(struct bcm_sf2_priv *priv, int port)
 +{
 +      switch (priv->type) {
 +      case BCM4908_DEVICE_ID:
 +              switch (port) {
 +              case 7:
 +                      return REG_RGMII_11_CNTRL;
 +              default:
 +                      break;
 +              }
 +              break;
 +      default:
 +              switch (port) {
 +              case 0:
 +                      return REG_RGMII_0_CNTRL;
 +              case 1:
 +                      return REG_RGMII_1_CNTRL;
 +              case 2:
 +                      return REG_RGMII_2_CNTRL;
 +              default:
 +                      break;
 +              }
 +      }
 +
 +      WARN_ONCE(1, "Unsupported port %d\n", port);
 +
 +      /* RO fallback reg */
 +      return REG_SWITCH_STATUS;
 +}
 +
  /* Return the number of active ports, not counting the IMP (CPU) port */
  static unsigned int bcm_sf2_num_active_ports(struct dsa_switch *ds)
  {
@@@ -144,7 -114,10 +144,10 @@@ static void bcm_sf2_imp_setup(struct ds
                /* Force link status for IMP port */
                reg = core_readl(priv, offset);
                reg |= (MII_SW_OR | LINK_STS);
-               reg &= ~GMII_SPEED_UP_2G;
+               if (priv->type == BCM4908_DEVICE_ID)
+                       reg |= GMII_SPEED_UP_2G;
+               else
+                       reg &= ~GMII_SPEED_UP_2G;
                core_writel(priv, reg, offset);
  
                /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
@@@ -462,44 -435,6 +465,44 @@@ static int bcm_sf2_sw_rst(struct bcm_sf
        return 0;
  }
  
 +static void bcm_sf2_crossbar_setup(struct bcm_sf2_priv *priv)
 +{
 +      struct device *dev = priv->dev->ds->dev;
 +      int shift;
 +      u32 mask;
 +      u32 reg;
 +      int i;
 +
 +      mask = BIT(priv->num_crossbar_int_ports) - 1;
 +
 +      reg = reg_readl(priv, REG_CROSSBAR);
 +      switch (priv->type) {
 +      case BCM4908_DEVICE_ID:
 +              shift = CROSSBAR_BCM4908_INT_P7 * priv->num_crossbar_int_ports;
 +              reg &= ~(mask << shift);
 +              if (0) /* FIXME */
 +                      reg |= CROSSBAR_BCM4908_EXT_SERDES << shift;
 +              else if (priv->int_phy_mask & BIT(7))
 +                      reg |= CROSSBAR_BCM4908_EXT_GPHY4 << shift;
 +              else if (phy_interface_mode_is_rgmii(priv->port_sts[7].mode))
 +                      reg |= CROSSBAR_BCM4908_EXT_RGMII << shift;
 +              else if (WARN(1, "Invalid port mode\n"))
 +                      return;
 +              break;
 +      default:
 +              return;
 +      }
 +      reg_writel(priv, reg, REG_CROSSBAR);
 +
 +      reg = reg_readl(priv, REG_CROSSBAR);
 +      for (i = 0; i < priv->num_crossbar_int_ports; i++) {
 +              shift = i * priv->num_crossbar_int_ports;
 +
 +              dev_dbg(dev, "crossbar int port #%d - ext port #%d\n", i,
 +                      (reg >> shift) & mask);
 +      }
 +}
 +
  static void bcm_sf2_intr_disable(struct bcm_sf2_priv *priv)
  {
        intrl2_0_mask_set(priv, 0xffffffff);
  static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv,
                                   struct device_node *dn)
  {
 +      struct device *dev = priv->dev->ds->dev;
 +      struct bcm_sf2_port_status *port_st;
        struct device_node *port;
        unsigned int port_num;
        struct property *prop;
 -      phy_interface_t mode;
        int err;
  
        priv->moca_port = -1;
                if (of_property_read_u32(port, "reg", &port_num))
                        continue;
  
 +              if (port_num >= DSA_MAX_PORTS) {
 +                      dev_err(dev, "Invalid port number %d\n", port_num);
 +                      continue;
 +              }
 +
 +              port_st = &priv->port_sts[port_num];
 +
                /* Internal PHYs get assigned a specific 'phy-mode' property
                 * value: "internal" to help flag them before MDIO probing
                 * has completed, since they might be turned off at that
                 * time
                 */
 -              err = of_get_phy_mode(port, &mode);
 +              err = of_get_phy_mode(port, &port_st->mode);
                if (err)
                        continue;
  
 -              if (mode == PHY_INTERFACE_MODE_INTERNAL)
 +              if (port_st->mode == PHY_INTERFACE_MODE_INTERNAL)
                        priv->int_phy_mask |= 1 << port_num;
  
 -              if (mode == PHY_INTERFACE_MODE_MOCA)
 +              if (port_st->mode == PHY_INTERFACE_MODE_MOCA)
                        priv->moca_port = port_num;
  
                if (of_property_read_bool(port, "brcm,use-bcm-hdr"))
@@@ -661,8 -588,10 +664,10 @@@ static u32 bcm_sf2_sw_get_phy_flags(str
         * in bits 15:8 and the patch level in bits 7:0 which is exactly what
         * the REG_PHY_REVISION register layout is.
         */
-       return priv->hw_params.gphy_rev;
+       if (priv->int_phy_mask & BIT(port))
+               return priv->hw_params.gphy_rev;
+       else
+               return 0;
  }
  
  static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
@@@ -718,7 -647,6 +723,7 @@@ static void bcm_sf2_sw_mac_config(struc
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        u32 id_mode_dis = 0, port_mode;
 +      u32 reg_rgmii_ctrl;
        u32 reg;
  
        if (port == core_readl(priv, CORE_IMP0_PRT_ID))
                return;
        }
  
 +      reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
        /* Clear id_mode_dis bit, and the existing port mode, let
         * RGMII_MODE_EN bet set by mac_link_{up,down}
         */
 -      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +      reg = reg_readl(priv, reg_rgmii_ctrl);
        reg &= ~ID_MODE_DIS;
        reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT);
  
        if (id_mode_dis)
                reg |= ID_MODE_DIS;
  
 -      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +      reg_writel(priv, reg, reg_rgmii_ctrl);
  }
  
  static void bcm_sf2_sw_mac_link_set(struct dsa_switch *ds, int port,
                                    phy_interface_t interface, bool link)
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 +      u32 reg_rgmii_ctrl;
        u32 reg;
  
        if (!phy_interface_mode_is_rgmii(interface) &&
            interface != PHY_INTERFACE_MODE_REVMII)
                return;
  
 +      reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
        /* If the link is down, just disable the interface to conserve power */
 -      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +      reg = reg_readl(priv, reg_rgmii_ctrl);
        if (link)
                reg |= RGMII_MODE_EN;
        else
                reg &= ~RGMII_MODE_EN;
 -      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +      reg_writel(priv, reg, reg_rgmii_ctrl);
  }
  
  static void bcm_sf2_sw_mac_link_down(struct dsa_switch *ds, int port,
@@@ -812,15 -735,11 +817,15 @@@ static void bcm_sf2_sw_mac_link_up(stru
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        struct ethtool_eee *p = &priv->dev->ports[port].eee;
 -      u32 reg, offset;
  
        bcm_sf2_sw_mac_link_set(ds, port, interface, true);
  
        if (port != core_readl(priv, CORE_IMP0_PRT_ID)) {
 +              u32 reg_rgmii_ctrl;
 +              u32 reg, offset;
 +
 +              reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
                if (priv->type == BCM4908_DEVICE_ID ||
                    priv->type == BCM7445_DEVICE_ID)
                        offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
                    interface == PHY_INTERFACE_MODE_RGMII_TXID ||
                    interface == PHY_INTERFACE_MODE_MII ||
                    interface == PHY_INTERFACE_MODE_REVMII) {
 -                      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +                      reg = reg_readl(priv, reg_rgmii_ctrl);
                        reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN);
  
                        if (tx_pause)
                        if (rx_pause)
                                reg |= RX_PAUSE_EN;
  
 -                      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +                      reg_writel(priv, reg, reg_rgmii_ctrl);
                }
  
                reg = SW_OVERRIDE | LINK_STS;
@@@ -942,8 -861,6 +947,8 @@@ static int bcm_sf2_sw_resume(struct dsa
                return ret;
        }
  
 +      bcm_sf2_crossbar_setup(priv);
 +
        ret = bcm_sf2_cfp_resume(ds);
        if (ret)
                return ret;
@@@ -1216,7 -1133,6 +1221,7 @@@ struct bcm_sf2_of_data 
        const u16 *reg_offsets;
        unsigned int core_reg_align;
        unsigned int num_cfp_rules;
 +      unsigned int num_crossbar_int_ports;
  };
  
  static const u16 bcm_sf2_4908_reg_offsets[] = {
        [REG_PHY_REVISION]      = 0x14,
        [REG_SPHY_CNTRL]        = 0x24,
        [REG_CROSSBAR]          = 0xc8,
 -      [REG_RGMII_0_CNTRL]     = 0xe0,
 -      [REG_RGMII_1_CNTRL]     = 0xec,
 -      [REG_RGMII_2_CNTRL]     = 0xf8,
 +      [REG_RGMII_11_CNTRL]    = 0x014c,
        [REG_LED_0_CNTRL]       = 0x40,
        [REG_LED_1_CNTRL]       = 0x4c,
        [REG_LED_2_CNTRL]       = 0x58,
@@@ -1238,8 -1156,7 +1243,8 @@@ static const struct bcm_sf2_of_data bcm
        .type           = BCM4908_DEVICE_ID,
        .core_reg_align = 0,
        .reg_offsets    = bcm_sf2_4908_reg_offsets,
 -      .num_cfp_rules  = 0, /* FIXME */
 +      .num_cfp_rules  = 256,
 +      .num_crossbar_int_ports = 2,
  };
  
  /* Register offsets for the SWITCH_REG_* block */
@@@ -1350,7 -1267,6 +1355,7 @@@ static int bcm_sf2_sw_probe(struct plat
        priv->reg_offsets = data->reg_offsets;
        priv->core_reg_align = data->core_reg_align;
        priv->num_cfp_rules = data->num_cfp_rules;
 +      priv->num_crossbar_int_ports = data->num_crossbar_int_ports;
  
        priv->rcdev = devm_reset_control_get_optional_exclusive(&pdev->dev,
                                                                "switch");
                goto out_clk_mdiv;
        }
  
 +      bcm_sf2_crossbar_setup(priv);
 +
        bcm_sf2_gphy_enable_set(priv->dev->ds, true);
  
        ret = bcm_sf2_mdio_register(ds);
diff --combined drivers/net/dsa/mt7530.c
@@@ -436,34 -436,32 +436,32 @@@ mt7530_pad_clk_setup(struct dsa_switch 
                             TD_DM_DRVP(8) | TD_DM_DRVN(8));
  
        /* Setup core clock for MT7530 */
-       if (!trgint) {
-               /* Disable MT7530 core clock */
-               core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-               /* Disable PLL, since phy_device has not yet been created
-                * provided for phy_[read,write]_mmd_indirect is called, we
-                * provide our own core_write_mmd_indirect to complete this
-                * function.
-                */
-               core_write_mmd_indirect(priv,
-                                       CORE_GSWPLL_GRP1,
-                                       MDIO_MMD_VEND2,
-                                       0);
-               /* Set core clock into 500Mhz */
-               core_write(priv, CORE_GSWPLL_GRP2,
-                          RG_GSWPLL_POSDIV_500M(1) |
-                          RG_GSWPLL_FBKDIV_500M(25));
-               /* Enable PLL */
-               core_write(priv, CORE_GSWPLL_GRP1,
-                          RG_GSWPLL_EN_PRE |
-                          RG_GSWPLL_POSDIV_200M(2) |
-                          RG_GSWPLL_FBKDIV_200M(32));
-               /* Enable MT7530 core clock */
-               core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-       }
+       /* Disable MT7530 core clock */
+       core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
+       /* Disable PLL, since phy_device has not yet been created
+        * provided for phy_[read,write]_mmd_indirect is called, we
+        * provide our own core_write_mmd_indirect to complete this
+        * function.
+        */
+       core_write_mmd_indirect(priv,
+                               CORE_GSWPLL_GRP1,
+                               MDIO_MMD_VEND2,
+                               0);
+       /* Set core clock into 500Mhz */
+       core_write(priv, CORE_GSWPLL_GRP2,
+                  RG_GSWPLL_POSDIV_500M(1) |
+                  RG_GSWPLL_FBKDIV_500M(25));
+       /* Enable PLL */
+       core_write(priv, CORE_GSWPLL_GRP1,
+                  RG_GSWPLL_EN_PRE |
+                  RG_GSWPLL_POSDIV_200M(2) |
+                  RG_GSWPLL_FBKDIV_200M(32));
+       /* Enable MT7530 core clock */
+       core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
  
        /* Setup the MT7530 TRGMII Tx Clock */
        core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
@@@ -999,9 -997,8 +997,9 @@@ mt753x_cpu_port_enable(struct dsa_switc
        mt7530_write(priv, MT7530_PVC_P(port),
                     PORT_SPEC_TAG);
  
 -      /* Unknown multicast frame forwarding to the cpu port */
 -      mt7530_rmw(priv, MT7530_MFC, UNM_FFP_MASK, UNM_FFP(BIT(port)));
 +      /* Disable flooding by default */
 +      mt7530_rmw(priv, MT7530_MFC, BC_FFP_MASK | UNM_FFP_MASK | UNU_FFP_MASK,
 +                 BC_FFP(BIT(port)) | UNM_FFP(BIT(port)) | UNU_FFP(BIT(port)));
  
        /* Set CPU port number */
        if (priv->id == ID_MT7621)
@@@ -1138,56 -1135,6 +1136,56 @@@ mt7530_stp_state_set(struct dsa_switch 
        mt7530_rmw(priv, MT7530_SSP_P(port), FID_PST_MASK, stp_state);
  }
  
 +static int
 +mt7530_port_pre_bridge_flags(struct dsa_switch *ds, int port,
 +                           struct switchdev_brport_flags flags,
 +                           struct netlink_ext_ack *extack)
 +{
 +      if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
 +                         BR_BCAST_FLOOD))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static int
 +mt7530_port_bridge_flags(struct dsa_switch *ds, int port,
 +                       struct switchdev_brport_flags flags,
 +                       struct netlink_ext_ack *extack)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +
 +      if (flags.mask & BR_LEARNING)
 +              mt7530_rmw(priv, MT7530_PSC_P(port), SA_DIS,
 +                         flags.val & BR_LEARNING ? 0 : SA_DIS);
 +
 +      if (flags.mask & BR_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, UNU_FFP(BIT(port)),
 +                         flags.val & BR_FLOOD ? UNU_FFP(BIT(port)) : 0);
 +
 +      if (flags.mask & BR_MCAST_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
 +                         flags.val & BR_MCAST_FLOOD ? UNM_FFP(BIT(port)) : 0);
 +
 +      if (flags.mask & BR_BCAST_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, BC_FFP(BIT(port)),
 +                         flags.val & BR_BCAST_FLOOD ? BC_FFP(BIT(port)) : 0);
 +
 +      return 0;
 +}
 +
 +static int
 +mt7530_port_set_mrouter(struct dsa_switch *ds, int port, bool mrouter,
 +                      struct netlink_ext_ack *extack)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +
 +      mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
 +                 mrouter ? UNM_FFP(BIT(port)) : 0);
 +
 +      return 0;
 +}
 +
  static int
  mt7530_port_bridge_join(struct dsa_switch *ds, int port,
                        struct net_device *bridge)
        return 0;
  }
  
 +static int
 +mt7530_port_mdb_add(struct dsa_switch *ds, int port,
 +                  const struct switchdev_obj_port_mdb *mdb)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +      const u8 *addr = mdb->addr;
 +      u16 vid = mdb->vid;
 +      u8 port_mask = 0;
 +      int ret;
 +
 +      mutex_lock(&priv->reg_mutex);
 +
 +      mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
 +      if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
 +              port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
 +                          & PORT_MAP_MASK;
 +
 +      port_mask |= BIT(port);
 +      mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT);
 +      ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
 +
 +      mutex_unlock(&priv->reg_mutex);
 +
 +      return ret;
 +}
 +
 +static int
 +mt7530_port_mdb_del(struct dsa_switch *ds, int port,
 +                  const struct switchdev_obj_port_mdb *mdb)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +      const u8 *addr = mdb->addr;
 +      u16 vid = mdb->vid;
 +      u8 port_mask = 0;
 +      int ret;
 +
 +      mutex_lock(&priv->reg_mutex);
 +
 +      mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
 +      if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
 +              port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
 +                          & PORT_MAP_MASK;
 +
 +      port_mask &= ~BIT(port);
 +      mt7530_fdb_write(priv, vid, port_mask, addr, -1,
 +                       port_mask ? STATIC_ENT : STATIC_EMP);
 +      ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
 +
 +      mutex_unlock(&priv->reg_mutex);
 +
 +      return ret;
 +}
 +
  static int
  mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid)
  {
@@@ -1924,12 -1818,9 +1922,12 @@@ mt7530_setup(struct dsa_switch *ds
                        ret = mt753x_cpu_port_enable(ds, i);
                        if (ret)
                                return ret;
 -              } else
 +              } else {
                        mt7530_port_disable(ds, i);
  
 +                      /* Disable learning by default on all user ports */
 +                      mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
 +              }
                /* Enable consistent egress tag */
                mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
                           PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
@@@ -2091,13 -1982,9 +2089,13 @@@ mt7531_setup(struct dsa_switch *ds
                        ret = mt753x_cpu_port_enable(ds, i);
                        if (ret)
                                return ret;
 -              } else
 +              } else {
                        mt7530_port_disable(ds, i);
  
 +                      /* Disable learning by default on all user ports */
 +                      mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
 +              }
 +
                /* Enable consistent egress tag */
                mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
                           PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
@@@ -2819,16 -2706,11 +2817,16 @@@ static const struct dsa_switch_ops mt75
        .port_change_mtu        = mt7530_port_change_mtu,
        .port_max_mtu           = mt7530_port_max_mtu,
        .port_stp_state_set     = mt7530_stp_state_set,
 +      .port_pre_bridge_flags  = mt7530_port_pre_bridge_flags,
 +      .port_bridge_flags      = mt7530_port_bridge_flags,
 +      .port_set_mrouter       = mt7530_port_set_mrouter,
        .port_bridge_join       = mt7530_port_bridge_join,
        .port_bridge_leave      = mt7530_port_bridge_leave,
        .port_fdb_add           = mt7530_port_fdb_add,
        .port_fdb_del           = mt7530_port_fdb_del,
        .port_fdb_dump          = mt7530_port_fdb_dump,
 +      .port_mdb_add           = mt7530_port_mdb_add,
 +      .port_mdb_del           = mt7530_port_mdb_del,
        .port_vlan_filtering    = mt7530_port_vlan_filtering,
        .port_vlan_add          = mt7530_port_vlan_add,
        .port_vlan_del          = mt7530_port_vlan_del,
diff --combined drivers/net/ethernet/intel/e1000e/netdev.c
@@@ -25,7 -25,6 +25,7 @@@
  #include <linux/pm_runtime.h>
  #include <linux/aer.h>
  #include <linux/prefetch.h>
 +#include <linux/suspend.h>
  
  #include "e1000.h"
  
@@@ -5975,19 -5974,23 +5975,23 @@@ static void e1000_reset_task(struct wor
        struct e1000_adapter *adapter;
        adapter = container_of(work, struct e1000_adapter, reset_task);
  
+       rtnl_lock();
        /* don't run the task if already down */
-       if (test_bit(__E1000_DOWN, &adapter->state))
+       if (test_bit(__E1000_DOWN, &adapter->state)) {
+               rtnl_unlock();
                return;
+       }
  
        if (!(adapter->flags & FLAG_RESTART_NOW)) {
                e1000e_dump(adapter);
                e_err("Reset adapter unexpectedly\n");
        }
        e1000e_reinit_locked(adapter);
+       rtnl_unlock();
  }
  
  /**
 - * e1000_get_stats64 - Get System Network Statistics
 + * e1000e_get_stats64 - Get System Network Statistics
   * @netdev: network interface device structure
   * @stats: rtnl_link_stats64 pointer
   *
@@@ -6160,7 -6163,7 +6164,7 @@@ static int e1000_mii_ioctl(struct net_d
  }
  
  /**
 - * e1000e_hwtstamp_ioctl - control hardware time stamping
 + * e1000e_hwtstamp_set - control hardware time stamping
   * @netdev: network interface device structure
   * @ifr: interface request
   *
@@@ -6818,7 -6821,7 +6822,7 @@@ static void e1000e_disable_aspm(struct 
  }
  
  /**
 - * e1000e_disable_aspm_locked   Disable ASPM states.
 + * e1000e_disable_aspm_locked - Disable ASPM states.
   * @pdev: pointer to PCI device struct
   * @state: bit-mask of ASPM states to disable
   *
@@@ -6919,12 -6922,6 +6923,12 @@@ static int __e1000_resume(struct pci_de
        return 0;
  }
  
 +static __maybe_unused int e1000e_pm_prepare(struct device *dev)
 +{
 +      return pm_runtime_suspended(dev) &&
 +              pm_suspend_via_firmware();
 +}
 +
  static __maybe_unused int e1000e_pm_suspend(struct device *dev)
  {
        struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
@@@ -7633,9 -7630,9 +7637,9 @@@ static int e1000_probe(struct pci_dev *
  
        e1000_print_device_info(adapter);
  
 -      dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE);
 +      dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE);
  
 -      if (pci_dev_run_wake(pdev) && hw->mac.type < e1000_pch_cnp)
 +      if (pci_dev_run_wake(pdev) && hw->mac.type != e1000_pch_cnp)
                pm_runtime_put_noidle(&pdev->dev);
  
        return 0;
@@@ -7858,7 -7855,6 +7862,7 @@@ MODULE_DEVICE_TABLE(pci, e1000_pci_tbl)
  
  static const struct dev_pm_ops e1000_pm_ops = {
  #ifdef CONFIG_PM_SLEEP
 +      .prepare        = e1000e_pm_prepare,
        .suspend        = e1000e_pm_suspend,
        .resume         = e1000e_pm_resume,
        .freeze         = e1000e_pm_freeze,
diff --combined drivers/net/ethernet/intel/i40e/i40e_main.c
@@@ -2023,7 -2023,7 +2023,7 @@@ static void i40e_undo_add_filter_entrie
  }
  
  /**
 - * i40e_next_entry - Get the next non-broadcast filter from a list
 + * i40e_next_filter - Get the next non-broadcast filter from a list
   * @next: pointer to filter in list
   *
   * Returns the next non-broadcast filter in the list. Required so that we
@@@ -3258,6 -3258,17 +3258,17 @@@ static int i40e_configure_tx_ring(struc
        return 0;
  }
  
+ /**
+  * i40e_rx_offset - Return expected offset into page to access data
+  * @rx_ring: Ring we are requesting offset of
+  *
+  * Returns the offset value for ring into the data buffer.
+  */
+ static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
+ {
+       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
+ }
  /**
   * i40e_configure_rx_ring - Configure a receive ring context
   * @ring: The Rx ring to configure
@@@ -3369,6 -3380,8 +3380,8 @@@ static int i40e_configure_rx_ring(struc
        else
                set_ring_build_skb_enabled(ring);
  
+       ring->rx_offset = i40e_rx_offset(ring);
        /* cache tail for quicker writes, and clear the reg before use */
        ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
        writel(0, ring->tail);
@@@ -5191,7 -5204,7 +5204,7 @@@ static u8 i40e_pf_get_num_tc(struct i40
  }
  
  /**
 - * i40e_pf_get_pf_tc_map - Get bitmap for enabled traffic classes
 + * i40e_pf_get_tc_map - Get bitmap for enabled traffic classes
   * @pf: PF being queried
   *
   * Return a bitmap for enabled traffic classes for this PF.
@@@ -9454,7 -9467,7 +9467,7 @@@ static void i40e_fdir_flush_and_replay(
  }
  
  /**
 - * i40e_get_current_atr_count - Get the count of total FD ATR filters programmed
 + * i40e_get_current_atr_cnt - Get the count of total FD ATR filters programmed
   * @pf: board private structure
   **/
  u32 i40e_get_current_atr_cnt(struct i40e_pf *pf)
diff --combined drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@@ -1569,17 -1569,6 +1569,6 @@@ void i40e_free_rx_resources(struct i40e
        }
  }
  
- /**
-  * i40e_rx_offset - Return expected offset into page to access data
-  * @rx_ring: Ring we are requesting offset of
-  *
-  * Returns the offset value for ring into the data buffer.
-  */
- static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
- {
-       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
- }
  /**
   * i40e_setup_rx_descriptors - Allocate Rx descriptors
   * @rx_ring: Rx descriptor ring (for a specific queue) to setup
@@@ -1608,7 -1597,6 +1597,6 @@@ int i40e_setup_rx_descriptors(struct i4
        rx_ring->next_to_alloc = 0;
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = i40e_rx_offset(rx_ring);
  
        /* XDP RX-queue info only needed for RX rings exposed to XDP */
        if (rx_ring->vsi->type == I40E_VSI_MAIN) {
@@@ -3345,7 -3333,7 +3333,7 @@@ static int i40e_tx_enable_csum(struct s
  }
  
  /**
  - * i40e_create_tx_ctx Build the Tx context descriptor
  + * i40e_create_tx_ctx - Build the Tx context descriptor
   * @tx_ring:  ring to create the descriptor on
   * @cd_type_cmd_tso_mss: Quad Word 1
   * @cd_tunneling: Quad Word 0 - bits 0-31
diff --combined drivers/net/ethernet/intel/ice/ice_txrx.c
@@@ -443,22 -443,6 +443,6 @@@ void ice_free_rx_ring(struct ice_ring *
        }
  }
  
- /**
-  * ice_rx_offset - Return expected offset into page to access data
-  * @rx_ring: Ring we are requesting offset of
-  *
-  * Returns the offset value for ring into the data buffer.
-  */
- static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
- {
-       if (ice_ring_uses_build_skb(rx_ring))
-               return ICE_SKB_PAD;
-       else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
-               return XDP_PACKET_HEADROOM;
-       return 0;
- }
  /**
   * ice_setup_rx_ring - Allocate the Rx descriptors
   * @rx_ring: the Rx ring to set up
@@@ -493,7 -477,6 +477,6 @@@ int ice_setup_rx_ring(struct ice_ring *
  
        rx_ring->next_to_use = 0;
        rx_ring->next_to_clean = 0;
-       rx_ring->rx_offset = ice_rx_offset(rx_ring);
  
        if (ice_is_xdp_ena_vsi(rx_ring->vsi))
                WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
@@@ -1115,11 -1098,6 +1098,11 @@@ int ice_clean_rx_irq(struct ice_ring *r
                dma_rmb();
  
                if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
 +                      struct ice_vsi *ctrl_vsi = rx_ring->vsi;
 +
 +                      if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
 +                          ctrl_vsi->vf_id != ICE_INVAL_VFID)
 +                              ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
                        ice_put_rx_buf(rx_ring, NULL, 0);
                        cleaned_count++;
                        continue;
diff --combined drivers/net/ethernet/intel/ice/ice_xsk.c
@@@ -358,18 -358,18 +358,18 @@@ xsk_pool_if_up
   * This function allocates a number of Rx buffers from the fill ring
   * or the internal recycle mechanism and places them on the Rx ring.
   *
-  * Returns false if all allocations were successful, true if any fail.
+  * Returns true if all allocations were successful, false if any fail.
   */
  bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
  {
        union ice_32b_rx_flex_desc *rx_desc;
        u16 ntu = rx_ring->next_to_use;
        struct ice_rx_buf *rx_buf;
-       bool ret = false;
+       bool ok = true;
        dma_addr_t dma;
  
        if (!count)
-               return false;
+               return true;
  
        rx_desc = ICE_RX_DESC(rx_ring, ntu);
        rx_buf = &rx_ring->rx_buf[ntu];
        do {
                rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool);
                if (!rx_buf->xdp) {
-                       ret = true;
+                       ok = false;
                        break;
                }
  
                ice_release_rx_desc(rx_ring, ntu);
        }
  
-       return ret;
+       return ok;
  }
  
  /**
@@@ -473,14 -473,6 +473,14 @@@ ice_run_xdp_zc(struct ice_ring *rx_ring
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
  
        act = bpf_prog_run_xdp(xdp_prog, xdp);
 +
 +      if (likely(act == XDP_REDIRECT)) {
 +              err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 +              result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
 +              rcu_read_unlock();
 +              return result;
 +      }
 +
        switch (act) {
        case XDP_PASS:
                break;
                xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index];
                result = ice_xmit_xdp_buff(xdp, xdp_ring);
                break;
 -      case XDP_REDIRECT:
 -              err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 -              result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
 -              break;
        default:
                bpf_warn_invalid_xdp_action(act);
                fallthrough;
diff --combined drivers/net/ethernet/intel/igb/igb_main.c
@@@ -2037,7 -2037,7 +2037,7 @@@ static void igb_power_down_link(struct 
  }
  
  /**
 - * Detect and switch function for Media Auto Sense
 + * igb_check_swap_media -  Detect and switch function for Media Auto Sense
   * @adapter: address of the board private structure
   **/
  static void igb_check_swap_media(struct igb_adapter *adapter)
@@@ -3115,7 -3115,7 +3115,7 @@@ static s32 igb_init_i2c(struct igb_adap
                return 0;
  
        /* Initialize the i2c bus which is controlled by the registers.
 -       * This bus will use the i2c_algo_bit structue that implements
 +       * This bus will use the i2c_algo_bit structure that implements
         * the protocol through toggling of the 4 bits in the register.
         */
        adapter->i2c_adap.owner = THIS_MODULE;
@@@ -4020,7 -4020,7 +4020,7 @@@ static int igb_sw_init(struct igb_adapt
  }
  
  /**
 - *  igb_open - Called when a network interface is made active
 + *  __igb_open - Called when a network interface is made active
   *  @netdev: network interface device structure
   *  @resuming: indicates whether we are in a resume call
   *
@@@ -4138,7 -4138,7 +4138,7 @@@ int igb_open(struct net_device *netdev
  }
  
  /**
 - *  igb_close - Disables a network interface
 + *  __igb_close - Disables a network interface
   *  @netdev: network interface device structure
   *  @suspending: indicates we are in a suspend call
   *
@@@ -5856,7 -5856,7 +5856,7 @@@ static void igb_tx_ctxtdesc(struct igb_
         */
        if (tx_ring->launchtime_enable) {
                ts = ktime_to_timespec64(first->skb->tstamp);
 -              first->skb->tstamp = ktime_set(0, 0);
 +              skb_txtime_consumed(first->skb);
                context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32);
        } else {
                context_desc->seqnum_seed = 0;
@@@ -8214,7 -8214,8 +8214,8 @@@ static void igb_reuse_rx_page(struct ig
        new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
  }
  
- static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+                                 int rx_buf_pgcnt)
  {
        unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
        struct page *page = rx_buffer->page;
  
  #if (PAGE_SIZE < 8192)
        /* if we are only owner of page we can reuse it */
-       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
                return false;
  #else
  #define IGB_LAST_OFFSET \
@@@ -8301,9 -8302,10 +8302,10 @@@ static struct sk_buff *igb_construct_sk
                return NULL;
  
        if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
-               xdp->data += IGB_TS_HDR_LEN;
-               size -= IGB_TS_HDR_LEN;
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb)) {
+                       xdp->data += IGB_TS_HDR_LEN;
+                       size -= IGB_TS_HDR_LEN;
+               }
        }
  
        /* Determine available headroom for copy */
@@@ -8364,8 -8366,8 +8366,8 @@@ static struct sk_buff *igb_build_skb(st
  
        /* pull timestamp out of packet data */
        if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb);
-               __skb_pull(skb, IGB_TS_HDR_LEN);
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb))
+                       __skb_pull(skb, IGB_TS_HDR_LEN);
        }
  
        /* update buffer offset */
@@@ -8614,11 -8616,17 +8616,17 @@@ static unsigned int igb_rx_offset(struc
  }
  
  static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-                                              const unsigned int size)
+                                              const unsigned int size, int *rx_buf_pgcnt)
  {
        struct igb_rx_buffer *rx_buffer;
  
        rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+       *rx_buf_pgcnt =
+ #if (PAGE_SIZE < 8192)
+               page_count(rx_buffer->page);
+ #else
+               0;
+ #endif
        prefetchw(rx_buffer->page);
  
        /* we are reusing so sync this buffer for CPU use */
  }
  
  static void igb_put_rx_buffer(struct igb_ring *rx_ring,
-                             struct igb_rx_buffer *rx_buffer)
+                             struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
  {
-       if (igb_can_reuse_rx_page(rx_buffer)) {
+       if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
                /* hand second half of page back to the ring */
                igb_reuse_rx_page(rx_ring, rx_buffer);
        } else {
@@@ -8664,6 -8672,7 +8672,7 @@@ static int igb_clean_rx_irq(struct igb_
        unsigned int xdp_xmit = 0;
        struct xdp_buff xdp;
        u32 frame_sz = 0;
+       int rx_buf_pgcnt;
  
        /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
  #if (PAGE_SIZE < 8192)
                 */
                dma_rmb();
  
-               rx_buffer = igb_get_rx_buffer(rx_ring, size);
+               rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
  
                /* retrieve a buffer from the ring */
                if (!skb) {
                        break;
                }
  
-               igb_put_rx_buffer(rx_ring, rx_buffer);
+               igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
                cleaned_count++;
  
                /* fetch next buffer in frame if non-eop */
diff --combined drivers/net/ethernet/intel/igb/igb_ptp.c
@@@ -856,6 -856,9 +856,9 @@@ static void igb_ptp_tx_hwtstamp(struct 
        dev_kfree_skb_any(skb);
  }
  
+ #define IGB_RET_PTP_DISABLED 1
+ #define IGB_RET_PTP_INVALID 2
  /**
   * igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp
   * @q_vector: Pointer to interrupt specific structure
   *
   * This function is meant to retrieve a timestamp from the first buffer of an
   * incoming frame.  The value is stored in little endian format starting on
-  * byte 8.
+  * byte 8
+  *
+  * Returns: 0 if success, nonzero if failure
   **/
- void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
-                        struct sk_buff *skb)
+ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+                       struct sk_buff *skb)
  {
-       __le64 *regval = (__le64 *)va;
        struct igb_adapter *adapter = q_vector->adapter;
+       __le64 *regval = (__le64 *)va;
        int adjust = 0;
  
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return IGB_RET_PTP_DISABLED;
        /* The timestamp is recorded in little endian format.
         * DWORD: 0        1        2        3
         * Field: Reserved Reserved SYSTIML  SYSTIMH
         */
+       /* check reserved dwords are zero, be/le doesn't matter for zero */
+       if (regval[0])
+               return IGB_RET_PTP_INVALID;
        igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb),
                                   le64_to_cpu(regval[1]));
  
        }
        skb_hwtstamps(skb)->hwtstamp =
                ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
+       return 0;
  }
  
  /**
   * This function is meant to retrieve a timestamp from the internal registers
   * of the adapter and store it in the skb.
   **/
- void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
-                        struct sk_buff *skb)
+ void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
  {
        struct igb_adapter *adapter = q_vector->adapter;
        struct e1000_hw *hw = &adapter->hw;
-       u64 regval;
        int adjust = 0;
+       u64 regval;
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return;
  
        /* If this bit is set, then the RX registers contain the time stamp. No
         * other packet will be time stamped until we read these registers, so
@@@ -1008,7 -1025,6 +1025,7 @@@ static int igb_ptp_set_timestamp_mode(s
        switch (config->tx_type) {
        case HWTSTAMP_TX_OFF:
                tsync_tx_ctl = 0;
 +              break;
        case HWTSTAMP_TX_ON:
                break;
        default:
diff --combined drivers/net/ethernet/intel/igc/igc_main.c
@@@ -941,7 -941,7 +941,7 @@@ static void igc_tx_ctxtdesc(struct igc_
                struct igc_adapter *adapter = netdev_priv(tx_ring->netdev);
                ktime_t txtime = first->skb->tstamp;
  
 -              first->skb->tstamp = ktime_set(0, 0);
 +              skb_txtime_consumed(first->skb);
                context_desc->launch_time = igc_tx_launchtime(adapter,
                                                              txtime);
        } else {
@@@ -3580,7 -3580,7 +3580,7 @@@ void igc_up(struct igc_adapter *adapter
        netif_tx_start_all_queues(adapter->netdev);
  
        /* start the watchdog. */
 -      hw->mac.get_link_status = 1;
 +      hw->mac.get_link_status = true;
        schedule_work(&adapter->watchdog_task);
  }
  
@@@ -3831,10 -3831,19 +3831,19 @@@ static void igc_reset_task(struct work_
  
        adapter = container_of(work, struct igc_adapter, reset_task);
  
+       rtnl_lock();
+       /* If we're already down or resetting, just bail */
+       if (test_bit(__IGC_DOWN, &adapter->state) ||
+           test_bit(__IGC_RESETTING, &adapter->state)) {
+               rtnl_unlock();
+               return;
+       }
        igc_rings_dump(adapter);
        igc_regs_dump(adapter);
        netdev_err(adapter->netdev, "Reset adapter\n");
        igc_reinit_locked(adapter);
+       rtnl_unlock();
  }
  
  /**
@@@ -4000,7 -4009,7 +4009,7 @@@ static irqreturn_t igc_msix_other(int i
        }
  
        if (icr & IGC_ICR_LSC) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                /* guard against interrupt when we're going down */
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
@@@ -4378,7 -4387,7 +4387,7 @@@ static irqreturn_t igc_intr_msi(int irq
        }
  
        if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
        }
@@@ -4420,7 -4429,7 +4429,7 @@@ static irqreturn_t igc_intr(int irq, vo
        }
  
        if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                /* guard against interrupt when we're going down */
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
@@@ -4574,7 -4583,7 +4583,7 @@@ static int __igc_open(struct net_devic
        netif_tx_start_all_queues(netdev);
  
        /* start the watchdog. */
 -      hw->mac.get_link_status = 1;
 +      hw->mac.get_link_status = true;
        schedule_work(&adapter->watchdog_task);
  
        return IGC_SUCCESS;
@@@ -4915,7 -4924,7 +4924,7 @@@ int igc_set_spd_dplx(struct igc_adapte
  {
        struct igc_mac_info *mac = &adapter->hw.mac;
  
 -      mac->autoneg = 0;
 +      mac->autoneg = false;
  
        /* Make sure dplx is at most 1 bit and lsb of speed is not set
         * for the switch() below to work
                mac->forced_speed_duplex = ADVERTISE_100_FULL;
                break;
        case SPEED_1000 + DUPLEX_FULL:
 -              mac->autoneg = 1;
 +              mac->autoneg = true;
                adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
                break;
        case SPEED_1000 + DUPLEX_HALF: /* not supported */
                goto err_inval;
        case SPEED_2500 + DUPLEX_FULL:
 -              mac->autoneg = 1;
 +              mac->autoneg = true;
                adapter->hw.phy.autoneg_advertised = ADVERTISE_2500_FULL;
                break;
        case SPEED_2500 + DUPLEX_HALF: /* not supported */
diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@@ -225,7 -225,7 +225,7 @@@ static s32 ixgbe_get_parent_bus_info(st
  }
  
  /**
 - * ixgbe_check_from_parent - Determine whether PCIe info should come from parent
 + * ixgbe_pcie_from_parent - Determine whether PCIe info should come from parent
   * @hw: hw specific details
   *
   * This function is used by probe to determine whether a device's PCI-Express
@@@ -4118,6 -4118,8 +4118,8 @@@ void ixgbe_configure_rx_ring(struct ixg
  #endif
        }
  
+       ring->rx_offset = ixgbe_rx_offset(ring);
        if (ring->xsk_pool && hw->mac.type != ixgbe_mac_82599EB) {
                u32 xsk_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
  
@@@ -6156,7 -6158,7 +6158,7 @@@ void ixgbe_down(struct ixgbe_adapter *a
  }
  
  /**
 - * ixgbe_eee_capable - helper function to determine EEE support on X550
 + * ixgbe_set_eee_capable - helper function to determine EEE support on X550
   * @adapter: board private structure
   */
  static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter)
@@@ -6578,7 -6580,6 +6580,6 @@@ int ixgbe_setup_rx_resources(struct ixg
  
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = ixgbe_rx_offset(rx_ring);
  
        /* XDP RX-queue info */
        if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
diff --combined drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@@ -548,12 -548,6 +548,12 @@@ static inline int is_afvf(u16 pcifunc
        return !(pcifunc & ~RVU_PFVF_FUNC_MASK);
  }
  
 +/* check if PF_FUNC is AF */
 +static inline bool is_pffunc_af(u16 pcifunc)
 +{
 +      return !pcifunc;
 +}
 +
  static inline bool is_rvu_fwdata_valid(struct rvu *rvu)
  {
        return (rvu->fwdata->header_magic == RVU_FWDATA_HEADER_MAGIC) &&
@@@ -646,8 -640,7 +646,8 @@@ int npc_config_ts_kpuaction(struct rvu 
  void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
                                 int nixlf, u64 chan, u8 *mac_addr);
  void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 -                                 int nixlf, u64 chan, bool allmulti);
 +                                 int nixlf, u64 chan, u8 chan_cnt,
 +                                 bool allmulti);
  void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
  void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
  void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
@@@ -672,6 -665,9 +672,6 @@@ int rvu_npc_get_tx_nibble_cfg(struct rv
  int npc_mcam_verify_channel(struct rvu *rvu, u16 pcifunc, u8 intf, u16 channel);
  int npc_flow_steering_init(struct rvu *rvu, int blkaddr);
  const char *npc_get_field_name(u8 hdr);
 -bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
 -                              u16 pcifunc, u8 intf, struct mcam_entry *entry,
 -                              int *entry_index);
  int npc_get_bank(struct npc_mcam *mcam, int index);
  void npc_mcam_enable_flows(struct rvu *rvu, u16 target);
  void npc_mcam_disable_flows(struct rvu *rvu, u16 target);
@@@ -682,12 -678,8 +682,13 @@@ void npc_read_mcam_entry(struct rvu *rv
                         u8 *intf, u8 *ena);
  bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
  u32  rvu_cgx_get_fifolen(struct rvu *rvu);
+ void *rvu_first_cgx_pdata(struct rvu *rvu);
  
 +int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
 +                           int type);
 +bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr,
 +                         int index);
 +
  /* CPT APIs */
  int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
  
@@@ -234,12 -234,14 +234,14 @@@ static ssize_t rvu_dbg_rsrc_attach_stat
                                          char __user *buffer,
                                          size_t count, loff_t *ppos)
  {
-       int index, off = 0, flag = 0, go_back = 0, off_prev;
+       int index, off = 0, flag = 0, go_back = 0, len = 0;
        struct rvu *rvu = filp->private_data;
        int lf, pf, vf, pcifunc;
        struct rvu_block block;
        int bytes_not_copied;
+       int lf_str_size = 12;
        int buf_size = 2048;
+       char *lfs;
        char *buf;
  
        /* don't allow partial reads */
        buf = kzalloc(buf_size, GFP_KERNEL);
        if (!buf)
                return -ENOSPC;
-       off +=  scnprintf(&buf[off], buf_size - 1 - off, "\npcifunc\t\t");
+       lfs = kzalloc(lf_str_size, GFP_KERNEL);
+       if (!lfs) {
+               kfree(buf);
+               return -ENOMEM;
+       }
+       off +=  scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size,
+                         "pcifunc");
        for (index = 0; index < BLK_COUNT; index++)
-               if (strlen(rvu->hw->block[index].name))
-                       off +=  scnprintf(&buf[off], buf_size - 1 - off,
-                                         "%*s\t", (index - 1) * 2,
-                                         rvu->hw->block[index].name);
+               if (strlen(rvu->hw->block[index].name)) {
+                       off += scnprintf(&buf[off], buf_size - 1 - off,
+                                        "%-*s", lf_str_size,
+                                        rvu->hw->block[index].name);
+               }
        off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
                for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
                                continue;
  
                        if (vf) {
+                               sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d:VF%d\t\t", pf,
-                                                   vf - 1);
+                                                   "%-*s", lf_str_size, lfs);
                        } else {
+                               sprintf(lfs, "PF%d", pf);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d\t\t", pf);
+                                                   "%-*s", lf_str_size, lfs);
                        }
  
                        off += go_back;
                                block = rvu->hw->block[index];
                                if (!strlen(block.name))
                                        continue;
-                               off_prev = off;
+                               len = 0;
+                               lfs[len] = '\0';
                                for (lf = 0; lf < block.lf.max; lf++) {
                                        if (block.fn_map[lf] != pcifunc)
                                                continue;
                                        flag = 1;
-                                       off += scnprintf(&buf[off], buf_size - 1
-                                                       - off, "%3d,", lf);
+                                       len += sprintf(&lfs[len], "%d,", lf);
                                }
-                               if (flag && off_prev != off)
-                                       off--;
-                               else
-                                       go_back++;
+                               if (flag)
+                                       len--;
+                               lfs[len] = '\0';
                                off += scnprintf(&buf[off], buf_size - 1 - off,
-                                               "\t");
+                                                "%-*s", lf_str_size, lfs);
+                               if (!strlen(lfs))
+                                       go_back += lf_str_size;
                        }
                        if (!flag)
                                off -= go_back;
        }
  
        bytes_not_copied = copy_to_user(buffer, buf, off);
+       kfree(lfs);
        kfree(buf);
  
        if (bytes_not_copied)
@@@ -319,7 -333,6 +333,6 @@@ static int rvu_dbg_rvu_pf_cgx_map_displ
        struct rvu *rvu = filp->private;
        struct pci_dev *pdev = NULL;
        struct mac_ops *mac_ops;
-       int rvu_def_cgx_id = 0;
        char cgx[10], lmac[10];
        struct rvu_pfvf *pfvf;
        int pf, domain, blkid;
        u16 pcifunc;
  
        domain = 2;
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
+       /* There can be no CGX devices at all */
+       if (!mac_ops)
+               return 0;
        seq_printf(filp, "PCI dev\t\tRVU PF Func\tNIX block\t%s\tLMAC\n",
                   mac_ops->name);
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
@@@ -1818,7 -1834,6 +1834,6 @@@ static void rvu_dbg_cgx_init(struct rv
  {
        struct mac_ops *mac_ops;
        unsigned long lmac_bmap;
-       int rvu_def_cgx_id = 0;
        int i, lmac_id;
        char dname[20];
        void *cgx;
        if (!cgx_get_cgxcnt_max())
                return;
  
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
        if (!mac_ops)
                return;
  
@@@ -2002,7 -2017,7 +2017,7 @@@ static void rvu_dbg_npc_mcam_show_flows
                        seq_printf(s, "mask 0x%x\n", ntohs(rule->mask.etype));
                        break;
                case NPC_OUTER_VID:
 -                      seq_printf(s, "%d ", ntohs(rule->packet.vlan_tci));
 +                      seq_printf(s, "0x%x ", ntohs(rule->packet.vlan_tci));
                        seq_printf(s, "mask 0x%x\n",
                                   ntohs(rule->mask.vlan_tci));
                        break;
@@@ -2145,7 -2160,7 +2160,7 @@@ static int rvu_dbg_npc_mcam_show_rules(
                seq_printf(s, "\tmcam entry: %d\n", iter->entry);
  
                rvu_dbg_npc_mcam_show_flows(s, iter);
 -              if (iter->intf == NIX_INTF_RX) {
 +              if (is_npc_intf_rx(iter->intf)) {
                        target = iter->rx_action.pf_func;
                        pf = (target >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
                        seq_printf(s, "\tForward to: PF%d ", pf);
@@@ -273,8 -273,7 +273,8 @@@ static int nix_interface_init(struct rv
                pfvf->rx_chan_cnt = 1;
                pfvf->tx_chan_cnt = 1;
                rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
 -                                            pfvf->rx_chan_base, false);
 +                                            pfvf->rx_chan_base,
 +                                            pfvf->rx_chan_cnt, false);
                break;
        }
  
@@@ -2630,7 -2629,7 +2630,7 @@@ static int set_flowkey_fields(struct ni
        struct nix_rx_flowkey_alg *field;
        struct nix_rx_flowkey_alg tmp;
        u32 key_type, valid_key;
-       int l4_key_offset;
+       int l4_key_offset = 0;
  
        if (!alg)
                return -EINVAL;
@@@ -3089,8 -3088,7 +3089,8 @@@ int rvu_mbox_handler_nix_set_rx_mode(st
                rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf);
        else
                rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
 -                                            pfvf->rx_chan_base, allmulti);
 +                                            pfvf->rx_chan_base,
 +                                            pfvf->rx_chan_cnt, allmulti);
        return 0;
  }
  
@@@ -3637,7 -3635,9 +3637,7 @@@ int rvu_mbox_handler_nix_lf_stop_rx(str
        if (err)
                return err;
  
 -      rvu_npc_disable_default_entries(rvu, pcifunc, nixlf);
 -
 -      npc_mcam_disable_flows(rvu, pcifunc);
 +      rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
  
        return rvu_cgx_start_stop_io(rvu, pcifunc, false);
  }
  #define RSVD_MCAM_ENTRIES_PER_PF      2 /* Bcast & Promisc */
  #define RSVD_MCAM_ENTRIES_PER_NIXLF   1 /* Ucast for LFs */
  
 -#define NIXLF_UCAST_ENTRY     0
 -#define NIXLF_BCAST_ENTRY     1
 -#define NIXLF_PROMISC_ENTRY   2
 -
  #define NPC_PARSE_RESULT_DMAC_OFFSET  8
  #define NPC_HW_TSTAMP_OFFSET          8
  #define NPC_KEX_CHAN_MASK             0xFFFULL
@@@ -92,10 -96,6 +92,10 @@@ int npc_mcam_verify_channel(struct rvu 
        if (is_npc_intf_tx(intf))
                return 0;
  
 +      /* return early for AF-installed rules */
 +      if (is_pffunc_af(pcifunc))
 +              return 0;
 +
        if (is_afvf(pcifunc)) {
                end = rvu_get_num_lbk_chans();
                if (end < 0)
@@@ -196,8 -196,8 +196,8 @@@ static int npc_get_ucast_mcam_index(str
        return mcam->nixlf_offset + (max + nixlf) * RSVD_MCAM_ENTRIES_PER_NIXLF;
  }
  
 -static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
 -                                  u16 pcifunc, int nixlf, int type)
 +int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
 +                           u16 pcifunc, int nixlf, int type)
  {
        int pf = rvu_get_pf(pcifunc);
        int index;
@@@ -230,8 -230,8 +230,8 @@@ int npc_get_bank(struct npc_mcam *mcam
        return bank;
  }
  
 -static bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
 -                                int blkaddr, int index)
 +bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
 +                         int blkaddr, int index)
  {
        int bank = npc_get_bank(mcam, index);
        u64 cfg;
@@@ -647,17 -647,13 +647,17 @@@ void rvu_npc_install_ucast_entry(struc
  }
  
  void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 -                                 int nixlf, u64 chan, bool allmulti)
 +                                 int nixlf, u64 chan, u8 chan_cnt,
 +                                 bool allmulti)
  {
        struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 +      struct npc_install_flow_req req = { 0 };
 +      struct npc_install_flow_rsp rsp = { 0 };
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      int blkaddr, ucast_idx, index, kwi;
 -      struct mcam_entry entry = { {0} };
 -      struct nix_rx_action action = { };
 +      int blkaddr, ucast_idx, index;
 +      u8 mac_addr[ETH_ALEN] = { 0 };
 +      struct nix_rx_action action;
 +      u64 relaxed_mask;
  
        /* Only PF or AF VF can add a promiscuous entry */
        if ((pcifunc & RVU_PFVF_FUNC_MASK) && !is_afvf(pcifunc))
        if (blkaddr < 0)
                return;
  
 +      *(u64 *)&action = 0x00;
        index = npc_get_nixlf_mcam_index(mcam, pcifunc,
                                         nixlf, NIXLF_PROMISC_ENTRY);
  
 -      entry.kw[0] = chan;
 -      entry.kw_mask[0] = 0xFFFULL;
 -
 -      if (allmulti) {
 -              kwi = NPC_KEXOF_DMAC / sizeof(u64);
 -              entry.kw[kwi] = BIT_ULL(40); /* LSB bit of 1st byte in DMAC */
 -              entry.kw_mask[kwi] = BIT_ULL(40);
 -      }
 -
 -      ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
 -                                           nixlf, NIXLF_UCAST_ENTRY);
 -
        /* If the corresponding PF's ucast action is RSS,
         * use the same action for promisc also
         */
 +      ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
 +                                           nixlf, NIXLF_UCAST_ENTRY);
        if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx))
                *(u64 *)&action = npc_get_mcam_action(rvu, mcam,
                                                        blkaddr, ucast_idx);
                action.pf_func = pcifunc;
        }
  
 -      entry.action = *(u64 *)&action;
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, index,
 -                            pfvf->nix_rx_intf, &entry, true);
 +      if (allmulti) {
 +              mac_addr[0] = 0x01;     /* LSB bit of 1st byte in DMAC */
 +              ether_addr_copy(req.packet.dmac, mac_addr);
 +              ether_addr_copy(req.mask.dmac, mac_addr);
 +              req.features = BIT_ULL(NPC_DMAC);
 +      }
 +
 +      req.chan_mask = 0xFFFU;
 +      if (chan_cnt > 1) {
 +              if (!is_power_of_2(chan_cnt)) {
 +                      dev_err(rvu->dev,
 +                              "%s: channel count more than 1, must be power of 2\n", __func__);
 +                      return;
 +              }
 +              relaxed_mask = GENMASK_ULL(BITS_PER_LONG_LONG - 1,
 +                                         ilog2(chan_cnt));
 +              req.chan_mask &= relaxed_mask;
 +      }
 +
 +      req.channel = chan;
 +      req.intf = pfvf->nix_rx_intf;
 +      req.entry = index;
 +      req.op = action.op;
 +      req.hdr.pcifunc = 0; /* AF is requester */
 +      req.vf = pcifunc;
 +      req.index = action.index;
 +      req.match_id = action.match_id;
 +      req.flow_key_alg = action.flow_key_alg;
 +
 +      rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
  }
  
  static void npc_enadis_promisc_entry(struct rvu *rvu, u16 pcifunc,
@@@ -750,14 -728,12 +750,14 @@@ void rvu_npc_enable_promisc_entry(struc
  void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
                                       int nixlf, u64 chan)
  {
 +      struct rvu_pfvf *pfvf;
 +      struct npc_install_flow_req req = { 0 };
 +      struct npc_install_flow_rsp rsp = { 0 };
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      struct mcam_entry entry = { {0} };
        struct rvu_hwinfo *hw = rvu->hw;
 -      struct nix_rx_action action;
 -      struct rvu_pfvf *pfvf;
        int blkaddr, index;
 +      u32 req_index = 0;
 +      u8 op;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
        if (blkaddr < 0)
        index = npc_get_nixlf_mcam_index(mcam, pcifunc,
                                         nixlf, NIXLF_BCAST_ENTRY);
  
 -      /* Match ingress channel */
 -      entry.kw[0] = chan;
 -      entry.kw_mask[0] = 0xfffull;
 -
 -      /* Match broadcast MAC address.
 -       * DMAC is extracted at 0th bit of PARSE_KEX::KW1
 -       */
 -      entry.kw[1] = 0xffffffffffffull;
 -      entry.kw_mask[1] = 0xffffffffffffull;
 -
 -      *(u64 *)&action = 0x00;
        if (!hw->cap.nix_rx_multicast) {
                /* Early silicon doesn't support pkt replication,
                 * so install entry with UCAST action, so that PF
                 * receives all broadcast packets.
                 */
 -              action.op = NIX_RX_ACTIONOP_UCAST;
 -              action.pf_func = pcifunc;
 +              op = NIX_RX_ACTIONOP_UCAST;
        } else {
 -              action.index = pfvf->bcast_mce_idx;
 -              action.op = NIX_RX_ACTIONOP_MCAST;
 +              op = NIX_RX_ACTIONOP_MCAST;
 +              req_index = pfvf->bcast_mce_idx;
        }
  
 -      entry.action = *(u64 *)&action;
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, index,
 -                            pfvf->nix_rx_intf, &entry, true);
 +      eth_broadcast_addr((u8 *)&req.packet.dmac);
 +      eth_broadcast_addr((u8 *)&req.mask.dmac);
 +      req.features = BIT_ULL(NPC_DMAC);
 +      req.channel = chan;
 +      req.intf = pfvf->nix_rx_intf;
 +      req.entry = index;
 +      req.op = op;
 +      req.hdr.pcifunc = 0; /* AF is requester */
 +      req.vf = pcifunc;
 +      req.index = req_index;
 +
 +      rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
  }
  
  void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, bool enable)
@@@ -988,7 -967,7 +988,7 @@@ void rvu_npc_disable_mcam_entries(struc
  {
        struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      struct rvu_npc_mcam_rule *rule;
 +      struct rvu_npc_mcam_rule *rule, *tmp;
        int blkaddr;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
        mutex_lock(&mcam->lock);
  
        /* Disable MCAM entries directing traffic to this 'pcifunc' */
 -      list_for_each_entry(rule, &mcam->mcam_rules, list) {
 +      list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) {
                if (is_npc_intf_rx(rule->intf) &&
                    rule->rx_action.pf_func == pcifunc) {
                        npc_enable_mcam_entry(rvu, mcam, blkaddr,
                                              rule->entry, false);
                        rule->enable = false;
                        /* Indicate that default rule is disabled */
 -                      if (rule->default_rule)
 +                      if (rule->default_rule) {
                                pfvf->def_ucast_rule = NULL;
 +                              list_del(&rule->list);
 +                              kfree(rule);
 +                      }
                }
        }
  
@@@ -1698,9 -1674,6 +1698,9 @@@ void rvu_npc_get_mcam_counter_alloc_inf
  static int npc_mcam_verify_entry(struct npc_mcam *mcam,
                                 u16 pcifunc, int entry)
  {
 +      /* verify AF installed entries */
 +      if (is_pffunc_af(pcifunc))
 +              return 0;
        /* Verify if entry is valid and if it is indeed
         * allocated to the requesting PFFUNC.
         */
@@@ -2295,10 -2268,6 +2295,10 @@@ int rvu_mbox_handler_npc_mcam_write_ent
                goto exit;
        }
  
 +      /* For AF installed rules, the nix_intf should be set to target NIX */
 +      if (is_pffunc_af(req->hdr.pcifunc))
 +              nix_intf = req->intf;
 +
        npc_config_mcam_entry(rvu, mcam, blkaddr, req->entry, nix_intf,
                              &req->entry_data, req->enable_entry);
  
@@@ -2521,10 -2490,10 +2521,10 @@@ int rvu_mbox_handler_npc_mcam_free_coun
                index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry);
                if (index >= mcam->bmap_entries)
                        break;
+               entry = index + 1;
                if (mcam->entry2cntr_map[index] != req->cntr)
                        continue;
  
-               entry = index + 1;
                npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr,
                                              index, req->cntr);
        }
@@@ -2761,6 -2730,30 +2761,6 @@@ int rvu_mbox_handler_npc_get_kex_cfg(st
        return 0;
  }
  
 -bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
 -                              u16 pcifunc, u8 intf, struct mcam_entry *entry,
 -                              int *index)
 -{
 -      struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 -      struct npc_mcam *mcam = &rvu->hw->mcam;
 -      bool enable;
 -      u8 nix_intf;
 -
 -      if (is_npc_intf_tx(intf))
 -              nix_intf = pfvf->nix_tx_intf;
 -      else
 -              nix_intf = pfvf->nix_rx_intf;
 -
 -      *index = npc_get_nixlf_mcam_index(mcam, pcifunc,
 -                                        nixlf, NIXLF_UCAST_ENTRY);
 -      /* dont force enable unicast entry  */
 -      enable = is_mcam_entry_enabled(rvu, mcam, blkaddr, *index);
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, *index, nix_intf,
 -                            entry, enable);
 -
 -      return enable;
 -}
 -
  int rvu_mbox_handler_npc_read_base_steer_rule(struct rvu *rvu,
                                              struct msg_req *req,
                                              struct npc_mcam_read_base_rule_rsp *rsp)
@@@ -2806,42 -2799,3 +2806,42 @@@ read_entry
  out:
        return rc;
  }
 +
 +int rvu_mbox_handler_npc_mcam_entry_stats(struct rvu *rvu,
 +                                        struct npc_mcam_get_stats_req *req,
 +                                        struct npc_mcam_get_stats_rsp *rsp)
 +{
 +      struct npc_mcam *mcam = &rvu->hw->mcam;
 +      u16 index, cntr;
 +      int blkaddr;
 +      u64 regval;
 +      u32 bank;
 +
 +      blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
 +      if (blkaddr < 0)
 +              return NPC_MCAM_INVALID_REQ;
 +
 +      mutex_lock(&mcam->lock);
 +
 +      index = req->entry & (mcam->banksize - 1);
 +      bank = npc_get_bank(mcam, req->entry);
 +
 +      /* read MCAM entry STAT_ACT register */
 +      regval = rvu_read64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_STAT_ACT(index, bank));
 +
 +      if (!(regval & BIT_ULL(9))) {
 +              rsp->stat_ena = 0;
 +              mutex_unlock(&mcam->lock);
 +              return 0;
 +      }
 +
 +      cntr = regval & 0x1FF;
 +
 +      rsp->stat_ena = 1;
 +      rsp->stat = rvu_read64(rvu, blkaddr, NPC_AF_MATCH_STATX(cntr));
 +      rsp->stat &= BIT_ULL(48) - 1;
 +
 +      mutex_unlock(&mcam->lock);
 +
 +      return 0;
 +}
@@@ -57,13 -57,10 +57,13 @@@ int otx2_alloc_mcam_entries(struct otx2
                flow_cfg->ntuple_max_flows = rsp->count;
                flow_cfg->ntuple_offset = 0;
                pfvf->flags |= OTX2_FLAG_NTUPLE_SUPPORT;
 +              flow_cfg->tc_max_flows = flow_cfg->ntuple_max_flows;
 +              pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
        } else {
                flow_cfg->vf_vlan_offset = 0;
                flow_cfg->ntuple_offset = flow_cfg->vf_vlan_offset +
                                                vf_vlan_max_flows;
 +              flow_cfg->tc_flower_offset = flow_cfg->ntuple_offset;
                flow_cfg->unicast_offset = flow_cfg->ntuple_offset +
                                                OTX2_MAX_NTUPLE_FLOWS;
                flow_cfg->rx_vlan_offset = flow_cfg->unicast_offset +
@@@ -72,7 -69,6 +72,7 @@@
                pfvf->flags |= OTX2_FLAG_UCAST_FLTR_SUPPORT;
                pfvf->flags |= OTX2_FLAG_RX_VLAN_SUPPORT;
                pfvf->flags |= OTX2_FLAG_VF_VLAN_SUPPORT;
 +              pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
        }
  
        for (i = 0; i < rsp->count; i++)
@@@ -97,7 -93,6 +97,7 @@@ int otx2_mcam_flow_init(struct otx2_ni
        INIT_LIST_HEAD(&pf->flow_cfg->flow_list);
  
        pf->flow_cfg->ntuple_max_flows = OTX2_MAX_NTUPLE_FLOWS;
 +      pf->flow_cfg->tc_max_flows = pf->flow_cfg->ntuple_max_flows;
  
        err = otx2_alloc_mcam_entries(pf);
        if (err)
@@@ -262,17 -257,19 +262,19 @@@ int otx2_get_flow(struct otx2_nic *pfvf
  int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
                       u32 *rule_locs)
  {
+       u32 rule_cnt = nfc->rule_cnt;
        u32 location = 0;
        int idx = 0;
        int err = 0;
  
        nfc->data = pfvf->flow_cfg->ntuple_max_flows;
-       while ((!err || err == -ENOENT) && idx < nfc->rule_cnt) {
+       while ((!err || err == -ENOENT) && idx < rule_cnt) {
                err = otx2_get_flow(pfvf, nfc, location);
                if (!err)
                        rule_locs[idx++] = location;
                location++;
        }
+       nfc->rule_cnt = rule_cnt;
  
        return err;
  }
@@@ -306,35 -303,6 +308,35 @@@ static int otx2_prepare_ipv4_flow(struc
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ipv4_usr_mask->tos) {
 +                      pkt->tos = ipv4_usr_hdr->tos;
 +                      pmask->tos = ipv4_usr_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
 +              if (ipv4_usr_mask->proto) {
 +                      switch (ipv4_usr_hdr->proto) {
 +                      case IPPROTO_ICMP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_ICMP);
 +                              break;
 +                      case IPPROTO_TCP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_TCP);
 +                              break;
 +                      case IPPROTO_UDP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_UDP);
 +                              break;
 +                      case IPPROTO_SCTP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_SCTP);
 +                              break;
 +                      case IPPROTO_AH:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_AH);
 +                              break;
 +                      case IPPROTO_ESP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_ESP);
 +                              break;
 +                      default:
 +                              return -EOPNOTSUPP;
 +                      }
 +              }
                pkt->etype = cpu_to_be16(ETH_P_IP);
                pmask->etype = cpu_to_be16(0xFFFF);
                req->features |= BIT_ULL(NPC_ETYPE);
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ipv4_l4_mask->tos) {
 +                      pkt->tos = ipv4_l4_hdr->tos;
 +                      pmask->tos = ipv4_l4_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
                if (ipv4_l4_mask->psrc) {
                        memcpy(&pkt->sport, &ipv4_l4_hdr->psrc,
                               sizeof(pkt->sport));
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ah_esp_mask->tos) {
 +                      pkt->tos = ah_esp_hdr->tos;
 +                      pmask->tos = ah_esp_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
  
                /* NPC profile doesn't extract AH/ESP header fields */
 -              if ((ah_esp_mask->spi & ah_esp_hdr->spi) ||
 -                  (ah_esp_mask->tos & ah_esp_mask->tos))
 +              if (ah_esp_mask->spi & ah_esp_hdr->spi)
                        return -EOPNOTSUPP;
  
                if (flow_type == AH_V4_FLOW)
@@@ -1672,6 -1672,7 +1672,7 @@@ int otx2_stop(struct net_device *netdev
        struct otx2_nic *pf = netdev_priv(netdev);
        struct otx2_cq_poll *cq_poll = NULL;
        struct otx2_qset *qset = &pf->qset;
+       struct otx2_rss_info *rss;
        int qidx, vec, wrk;
  
        netif_carrier_off(netdev);
        /* First stop packet Rx/Tx */
        otx2_rxtx_enable(pf, false);
  
+       /* Clear RSS enable flag */
+       rss = &pf->hw.rss_info;
+       rss->enable = false;
        /* Cleanup Queue IRQ */
        vec = pci_irq_vector(pf->pdev,
                             pf->hw.nix_msixoff + NIX_LF_QINT_VEC_START);
@@@ -1760,24 -1765,6 +1765,24 @@@ static netdev_tx_t otx2_xmit(struct sk_
        return NETDEV_TX_OK;
  }
  
 +static netdev_features_t otx2_fix_features(struct net_device *dev,
 +                                         netdev_features_t features)
 +{
 +      /* check if n-tuple filters are ON */
 +      if ((features & NETIF_F_HW_TC) && (dev->features & NETIF_F_NTUPLE)) {
 +              netdev_info(dev, "Disabling n-tuple filters\n");
 +              features &= ~NETIF_F_NTUPLE;
 +      }
 +
 +      /* check if tc hw offload is ON */
 +      if ((features & NETIF_F_NTUPLE) && (dev->features & NETIF_F_HW_TC)) {
 +              netdev_info(dev, "Disabling TC hardware offload\n");
 +              features &= ~NETIF_F_HW_TC;
 +      }
 +
 +      return features;
 +}
 +
  static void otx2_set_rx_mode(struct net_device *netdev)
  {
        struct otx2_nic *pf = netdev_priv(netdev);
@@@ -1840,12 -1827,6 +1845,12 @@@ static int otx2_set_features(struct net
        if ((changed & NETIF_F_NTUPLE) && !ntuple)
                otx2_destroy_ntuple_flows(pf);
  
 +      if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) &&
 +          pf->tc_info.num_entries) {
 +              netdev_err(netdev, "Can't disable TC hardware offload while flows are active\n");
 +              return -EBUSY;
 +      }
 +
        return 0;
  }
  
@@@ -2244,7 -2225,6 +2249,7 @@@ static const struct net_device_ops otx2
        .ndo_open               = otx2_open,
        .ndo_stop               = otx2_stop,
        .ndo_start_xmit         = otx2_xmit,
 +      .ndo_fix_features       = otx2_fix_features,
        .ndo_set_mac_address    = otx2_set_mac_address,
        .ndo_change_mtu         = otx2_change_mtu,
        .ndo_set_rx_mode        = otx2_set_rx_mode,
        .ndo_set_vf_mac         = otx2_set_vf_mac,
        .ndo_set_vf_vlan        = otx2_set_vf_vlan,
        .ndo_get_vf_config      = otx2_get_vf_config,
 +      .ndo_setup_tc           = otx2_setup_tc,
  };
  
  static int otx2_wq_init(struct otx2_nic *pf)
@@@ -2475,10 -2454,6 +2480,10 @@@ static int otx2_probe(struct pci_dev *p
                                       NETIF_F_HW_VLAN_STAG_RX;
        netdev->features |= netdev->hw_features;
  
 +      /* HW supports tc offload, but it is mutually exclusive with n-tuple filters */
 +      if (pf->flags & OTX2_FLAG_TC_FLOWER_SUPPORT)
 +              netdev->hw_features |= NETIF_F_HW_TC;
 +
        netdev->gso_max_segs = OTX2_MAX_GSO_SEGS;
        netdev->watchdog_timeo = OTX2_TX_TIMEOUT;
  
  
        otx2_set_ethtool_ops(netdev);
  
 +      err = otx2_init_tc(pf);
 +      if (err)
 +              goto err_mcam_flow_del;
 +
        /* Enable link notifications */
        otx2_cgx_config_linkevents(pf, true);
  
  
        return 0;
  
 +err_mcam_flow_del:
 +      otx2_mcam_flow_del(pf);
  err_unreg_netdev:
        unregister_netdev(netdev);
  err_del_mcam_entries:
@@@ -2682,7 -2651,6 +2687,7 @@@ static void otx2_remove(struct pci_dev 
  
        otx2_ptp_destroy(pf);
        otx2_mcam_flow_del(pf);
 +      otx2_shutdown_tc(pf);
        otx2_detach_resources(&pf->mbox);
        if (pf->hw.lmt_base)
                iounmap(pf->hw.lmt_base);
@@@ -92,14 -92,15 +92,15 @@@ struct page_pool
                                    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
  #define MLX5_MPWRQ_PAGES_PER_WQE              BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
  
- #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
+ #define MLX5_ALIGN_MTTS(mtts)         (ALIGN(mtts, 8))
+ #define MLX5_ALIGNED_MTTS_OCTW(mtts)  ((mtts) / 2)
+ #define MLX5_MTT_OCTW(mtts)           (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts)))
  /* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between
   * WQEs. This page will absorb write overflow by the hardware when
   * receiving packets larger than MTU. These oversize packets are
   * dropped by the driver at a later stage.
   */
- #define MLX5E_REQUIRED_WQE_MTTS               (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8))
- #define MLX5E_LOG_ALIGNED_MPWQE_PPW   (ilog2(MLX5E_REQUIRED_WQE_MTTS))
+ #define MLX5E_REQUIRED_WQE_MTTS               (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1))
  #define MLX5E_REQUIRED_MTTS(wqes)     (wqes * MLX5E_REQUIRED_WQE_MTTS)
  #define MLX5E_MAX_RQ_NUM_MTTS \
        ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
@@@ -880,6 -881,7 +881,6 @@@ struct mlx5e_priv 
  #endif
        struct devlink_health_reporter *tx_reporter;
        struct devlink_health_reporter *rx_reporter;
 -      struct devlink_port            dl_port;
        struct mlx5e_xsk           xsk;
  #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
        struct mlx5e_hv_vhca_stats_agent stats_agent;
@@@ -1173,7 -1175,6 +1174,7 @@@ void mlx5e_detach_netdev(struct mlx5e_p
  void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
  int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
                                const struct mlx5e_profile *new_profile, void *new_ppriv);
 +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv);
  void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
  void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu);
  void mlx5e_build_rq_params(struct mlx5_core_dev *mdev,
@@@ -695,7 -695,7 +695,7 @@@ mlx5_tc_ct_entry_add_rule(struct mlx5_t
  
        zone_rule->nat = nat;
  
 -      spec = kzalloc(sizeof(*spec), GFP_KERNEL);
 +      spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec)
                return -ENOMEM;
  
  
        zone_rule->attr = attr;
  
 -      kfree(spec);
 +      kvfree(spec);
        ct_dbg("Offloaded ct entry rule in zone %d", entry->tuple.zone);
  
        return 0;
@@@ -749,7 -749,7 +749,7 @@@ err_rule
  err_mod_hdr:
        kfree(attr);
  err_attr:
 -      kfree(spec);
 +      kvfree(spec);
        return err;
  }
  
@@@ -1181,7 -1181,8 +1181,8 @@@ int mlx5_tc_ct_add_no_trk_match(struct 
  
        mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG,
                                        &ctstate, &ctstate_mask);
-       if (ctstate_mask)
+       if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT)
                return -EOPNOTSUPP;
  
        ctstate_mask |= MLX5_CT_STATE_TRK_BIT;
@@@ -1539,14 -1540,6 +1540,14 @@@ mlx5_tc_ct_free_pre_ct_tables(struct ml
        mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct);
  }
  
 +/* To avoid a false lock dependency warning, give the ct_entries_ht lock a
 + * class different from the lock class of the ht used when deleting the last
 + * flow from a group and then deleting the group: that path reaches
 + * del_sw_flow_group(), which calls rhashtable_destroy() on fg->ftes_hash and
 + * takes its ht->mutex, which is different from the ht->mutex here.
 + */
 +static struct lock_class_key ct_entries_ht_lock_key;
 +
  static struct mlx5_ct_ft *
  mlx5_tc_ct_add_ft_cb(struct mlx5_tc_ct_priv *ct_priv, u16 zone,
                     struct nf_flowtable *nf_ft)
        if (err)
                goto err_init;
  
 +      lockdep_set_class(&ft->ct_entries_ht.mutex, &ct_entries_ht_lock_key);
 +
        err = rhashtable_insert_fast(&ct_priv->zone_ht, &ft->node,
                                     zone_params);
        if (err)
@@@ -1684,10 -1675,10 +1685,10 @@@ __mlx5_tc_ct_flow_offload(struct mlx5_t
        struct mlx5_ct_ft *ft;
        u32 fte_id = 1;
  
 -      post_ct_spec = kzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
 +      post_ct_spec = kvzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
        ct_flow = kzalloc(sizeof(*ct_flow), GFP_KERNEL);
        if (!post_ct_spec || !ct_flow) {
 -              kfree(post_ct_spec);
 +              kvfree(post_ct_spec);
                kfree(ct_flow);
                return ERR_PTR(-ENOMEM);
        }
        ct_flow->post_ct_attr->prio = 0;
        ct_flow->post_ct_attr->ft = ct_priv->post_ct;
  
 +      /* Splits were handled before CT */
 +      if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB)
 +              ct_flow->post_ct_attr->esw_attr->split_count = 0;
 +
        ct_flow->post_ct_attr->inner_match_level = MLX5_MATCH_NONE;
        ct_flow->post_ct_attr->outer_match_level = MLX5_MATCH_NONE;
        ct_flow->post_ct_attr->action &= ~(MLX5_FLOW_CONTEXT_ACTION_DECAP);
  
        attr->ct_attr.ct_flow = ct_flow;
        dealloc_mod_hdr_actions(&pre_mod_acts);
 -      kfree(post_ct_spec);
 +      kvfree(post_ct_spec);
  
        return rule;
  
@@@ -1847,7 -1834,7 +1848,7 @@@ err_alloc_pre
  err_idr:
        mlx5_tc_ct_del_ft_cb(ct_priv, ft);
  err_ft:
 -      kfree(post_ct_spec);
 +      kvfree(post_ct_spec);
        kfree(ct_flow);
        netdev_warn(priv->netdev, "Failed to offload ct flow, err %d\n", err);
        return ERR_PTR(err);
@@@ -2,7 -2,6 +2,7 @@@
  /* Copyright (c) 2021 Mellanox Technologies. */
  
  #include <net/fib_notifier.h>
 +#include <net/nexthop.h>
  #include "tc_tun_encap.h"
  #include "en_tc.h"
  #include "tc_tun.h"
@@@ -90,6 -89,7 +90,7 @@@ int mlx5e_tc_set_attr_rx_tun(struct mlx
         * required to establish routing.
         */
        flow_flag_set(flow, TUN_RX);
+       flow->attr->tun_ip_version = ip_version;
        return 0;
  }
  
@@@ -1092,7 -1092,7 +1093,7 @@@ int mlx5e_attach_decap_route(struct mlx
        if (err || !esw_attr->rx_tun_attr->decap_vport)
                goto out;
  
-       key.ip_version = attr->ip_version;
+       key.ip_version = attr->tun_ip_version;
        if (key.ip_version == 4)
                key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
        else
@@@ -302,7 -302,7 +302,7 @@@ static int mlx5e_create_umr_mkey(struc
        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
        mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 -      MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
 +      MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
        MLX5_SET64(mkc, mkc, len, npages << page_shift);
        MLX5_SET(mkc, mkc, translations_octword_size,
                 MLX5_MTT_OCTW(npages));
@@@ -334,9 -334,9 +334,9 @@@ static int mlx5e_create_rq_umr_mkey(str
                                     rq->wqe_overflow.addr);
  }
  
- static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
+ static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix)
  {
-       return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
+       return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT;
  }
  
  static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
@@@ -577,7 -577,7 +577,7 @@@ static int mlx5e_alloc_rq(struct mlx5e_
                                mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
                        u32 byte_count =
                                rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-                       u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
+                       u64 dma_offset = mlx5e_get_mpwqe_offset(i);
  
                        wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
                        wqe->data[0].byte_count = cpu_to_be32(byte_count);
@@@ -1019,7 -1019,7 +1019,7 @@@ static int mlx5e_alloc_xdpsq(struct mlx
        sq->pdev      = c->pdev;
        sq->mkey_be   = c->mkey_be;
        sq->channel   = c;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
        sq->min_inline_mode = params->tx_min_inline_mode;
        sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
        sq->xsk_pool  = xsk_pool;
@@@ -1090,7 -1090,7 +1090,7 @@@ static int mlx5e_alloc_icosq(struct mlx
        int err;
  
        sq->channel   = c;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
  
        param->wq.db_numa_node = cpu_to_node(c->cpu);
        err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@@ -1174,7 -1174,7 +1174,7 @@@ static int mlx5e_alloc_txqsq(struct mlx
        sq->priv      = c->priv;
        sq->ch_ix     = c->ix;
        sq->txq_ix    = txq_ix;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
        sq->min_inline_mode = params->tx_min_inline_mode;
        sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
        INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
@@@ -1257,7 -1257,7 +1257,7 @@@ static int mlx5e_create_sq(struct mlx5_
        MLX5_SET(sqc,  sqc, flush_in_error_en, 1);
  
        MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
 -      MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.bfreg.index);
 +      MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.hw_objs.bfreg.index);
        MLX5_SET(wq,   wq, log_wq_pg_sz,  csp->wq_ctrl->buf.page_shift -
                                          MLX5_ADAPTER_PAGE_SHIFT);
        MLX5_SET64(wq, wq, dbr_addr,      csp->wq_ctrl->db.dma);
@@@ -2032,7 -2032,7 +2032,7 @@@ static int mlx5e_open_channel(struct ml
        c->cpu      = cpu;
        c->pdev     = mlx5_core_dma_dev(priv->mdev);
        c->netdev   = priv->netdev;
 -      c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
 +      c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
        c->num_tc   = params->num_tc;
        c->xdp      = !!params->xdp_prog;
        c->stats    = &priv->channel_stats[ix].ch;
@@@ -2217,7 -2217,7 +2217,7 @@@ void mlx5e_build_rq_param(struct mlx5e_
        MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
        MLX5_SET(wq, wq, log_wq_stride,
                 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 -      MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
 +      MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.hw_objs.pdn);
        MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
        MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
        MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
@@@ -2248,7 -2248,7 +2248,7 @@@ void mlx5e_build_sq_param_common(struc
        void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
  
        MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 -      MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.pdn);
 +      MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.hw_objs.pdn);
  
        param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev));
  }
@@@ -2368,8 -2368,9 +2368,9 @@@ static u8 mlx5e_build_icosq_log_wq_sz(s
  {
        switch (params->rq_wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-               return order_base_2(MLX5E_UMR_WQEBBS) +
-                       mlx5e_get_rq_log_wq_sz(rqp->rqc);
+               return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE,
+                            order_base_2(MLX5E_UMR_WQEBBS) +
+                            mlx5e_get_rq_log_wq_sz(rqp->rqc));
        default: /* MLX5_WQ_TYPE_CYCLIC */
                return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
        }
@@@ -2502,8 -2503,10 +2503,10 @@@ void mlx5e_close_channels(struct mlx5e_
  {
        int i;
  
-       if (chs->port_ptp)
+       if (chs->port_ptp) {
                mlx5e_port_ptp_close(chs->port_ptp);
+               chs->port_ptp = NULL;
+       }
  
        for (i = 0; i < chs->num; i++)
                mlx5e_close_channel(chs->c[i]);
@@@ -3421,10 -3424,10 +3424,10 @@@ int mlx5e_create_tis(struct mlx5_core_d
  {
        void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
  
 -      MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
 +      MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
  
        if (MLX5_GET(tisc, tisc, tls_en))
 -              MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.pdn);
 +              MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn);
  
        if (mlx5_lag_is_lacp_owner(mdev))
                MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
@@@ -3494,7 -3497,7 +3497,7 @@@ static void mlx5e_cleanup_nic_tx(struc
  static void mlx5e_build_indir_tir_ctx_common(struct mlx5e_priv *priv,
                                             u32 rqtn, u32 *tirc)
  {
 -      MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
 +      MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.hw_objs.td.tdn);
        MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
        MLX5_SET(tirc, tirc, indirect_table, rqtn);
        MLX5_SET(tirc, tirc, tunneled_offload_en,
@@@ -3769,16 -3772,8 +3772,16 @@@ static int mlx5e_setup_tc(struct net_de
                          void *type_data)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
 +      bool tc_unbind = false;
        int err;
  
 +      if (type == TC_SETUP_BLOCK &&
 +          ((struct flow_block_offload *)type_data)->command == FLOW_BLOCK_UNBIND)
 +              tc_unbind = true;
 +
 +      if (!netif_device_present(dev) && !tc_unbind)
 +              return -ENODEV;
 +
        switch (type) {
        case TC_SETUP_BLOCK: {
                struct flow_block_offload *f = type_data;
@@@ -3818,6 -3813,15 +3821,15 @@@ void mlx5e_fold_sw_stats64(struct mlx5e
                for (j = 0; j < priv->max_opened_tc; j++) {
                        struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
  
+                       s->tx_packets    += sq_stats->packets;
+                       s->tx_bytes      += sq_stats->bytes;
+                       s->tx_dropped    += sq_stats->dropped;
+               }
+       }
+       if (priv->port_ptp_opened) {
+               for (i = 0; i < priv->max_opened_tc; i++) {
+                       struct mlx5e_sq_stats *sq_stats = &priv->port_ptp_stats.sq[i];
                        s->tx_packets    += sq_stats->packets;
                        s->tx_bytes      += sq_stats->bytes;
                        s->tx_dropped    += sq_stats->dropped;
@@@ -3831,9 -3835,6 +3843,9 @@@ mlx5e_get_stats(struct net_device *dev
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5e_pport_stats *pstats = &priv->stats.pport;
  
 +      if (!netif_device_present(dev))
 +              return;
 +
        /* In switchdev mode, the monitor counters don't monitor
         * rx/tx stats of 802_3. The update stats mechanism
         * should keep the 802_3 layout counters updated
        }
  
        if (mlx5e_is_uplink_rep(priv)) {
+               struct mlx5e_vport_stats *vstats = &priv->stats.vport;
                stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok);
                stats->rx_bytes   = PPORT_802_3_GET(pstats, a_octets_received_ok);
                stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
                stats->tx_bytes   = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
+               /* vport multicast also counts packets that are dropped due to steering
+                * or rx out of buffer
+                */
+               stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
        } else {
                mlx5e_fold_sw_stats64(priv, stats);
        }
        stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
  }
  
 +static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
 +{
 +      if (mlx5e_is_uplink_rep(priv))
 +              return; /* no rx mode for uplink rep */
 +
 +      queue_work(priv->wq, &priv->set_rx_mode_work);
 +}
 +
  static void mlx5e_set_rx_mode(struct net_device *dev)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  }
  
  static int mlx5e_set_mac(struct net_device *netdev, void *addr)
        ether_addr_copy(netdev->dev_addr, saddr->sa_data);
        netif_addr_unlock_bh(netdev);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        return 0;
  }
@@@ -4433,9 -4433,6 +4452,9 @@@ static int mlx5e_set_vf_link_state(stru
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5_core_dev *mdev = priv->mdev;
  
 +      if (mlx5e_is_uplink_rep(priv))
 +              return -EOPNOTSUPP;
 +
        return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1,
                                            mlx5_ifla_link2vport(link_state));
  }
@@@ -4447,9 -4444,6 +4466,9 @@@ int mlx5e_get_vf_config(struct net_devi
        struct mlx5_core_dev *mdev = priv->mdev;
        int err;
  
 +      if (!netif_device_present(dev))
 +              return -EOPNOTSUPP;
 +
        err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi);
        if (err)
                return err;
@@@ -4466,32 -4460,6 +4485,32 @@@ int mlx5e_get_vf_stats(struct net_devic
        return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
                                            vf_stats);
  }
 +
 +static bool
 +mlx5e_has_offload_stats(const struct net_device *dev, int attr_id)
 +{
 +      struct mlx5e_priv *priv = netdev_priv(dev);
 +
 +      if (!netif_device_present(dev))
 +              return false;
 +
 +      if (!mlx5e_is_uplink_rep(priv))
 +              return false;
 +
 +      return mlx5e_rep_has_offload_stats(dev, attr_id);
 +}
 +
 +static int
 +mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
 +                      void *sp)
 +{
 +      struct mlx5e_priv *priv = netdev_priv(dev);
 +
 +      if (!mlx5e_is_uplink_rep(priv))
 +              return -EOPNOTSUPP;
 +
 +      return mlx5e_rep_get_offload_stats(attr_id, dev, sp);
 +}
  #endif
  
  static bool mlx5e_tunnel_proto_supported_tx(struct mlx5_core_dev *mdev, u8 proto_type)
@@@ -4734,8 -4702,10 +4753,10 @@@ static int mlx5e_xdp_set(struct net_dev
                struct mlx5e_channel *c = priv->channels.c[i];
  
                mlx5e_rq_replace_xdp_prog(&c->rq, prog);
-               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) {
+                       bpf_prog_inc(prog);
                        mlx5e_rq_replace_xdp_prog(&c->xskrq, prog);
+               }
        }
  
  unlock:
@@@ -4848,8 -4818,6 +4869,8 @@@ const struct net_device_ops mlx5e_netde
        .ndo_get_vf_config       = mlx5e_get_vf_config,
        .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
        .ndo_get_vf_stats        = mlx5e_get_vf_stats,
 +      .ndo_has_offload_stats   = mlx5e_has_offload_stats,
 +      .ndo_get_offload_stats   = mlx5e_get_offload_stats,
  #endif
        .ndo_get_devlink_port    = mlx5e_get_devlink_port,
  };
@@@ -5011,6 -4979,11 +5032,11 @@@ void mlx5e_build_nic_params(struct mlx5
                                     priv->max_nch);
        params->num_tc       = 1;
  
+       /* Set an initial non-zero value, so that mlx5e_select_queue won't
+        * divide by zero if called before first activating channels.
+        */
+       priv->num_tc_x_num_ch = params->num_channels * params->num_tc;
        /* SQ */
        params->log_sq_size = is_kdump_kernel() ?
                MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
@@@ -5306,6 -5279,10 +5332,6 @@@ static int mlx5e_nic_init(struct mlx5_c
        if (err)
                mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
  
 -      err = mlx5e_devlink_port_register(priv);
 -      if (err)
 -              mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
 -
        mlx5e_health_create_reporters(priv);
  
        return 0;
  static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
  {
        mlx5e_health_destroy_reporters(priv);
 -      mlx5e_devlink_port_unregister(priv);
        mlx5e_tls_cleanup(priv);
        mlx5e_ipsec_cleanup(priv);
  }
@@@ -5453,7 -5431,7 +5479,7 @@@ static void mlx5e_nic_enable(struct mlx
                return;
        mlx5e_dcbnl_init_app(priv);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        rtnl_lock();
        if (netif_running(netdev))
@@@ -5476,7 -5454,7 +5502,7 @@@ static void mlx5e_nic_disable(struct ml
        netif_device_detach(priv->netdev);
        rtnl_unlock();
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        mlx5e_hv_vhca_stats_destroy(priv);
        if (mlx5e_monitor_counter_supported(priv))
@@@ -5522,8 -5500,6 +5548,6 @@@ int mlx5e_priv_init(struct mlx5e_priv *
                    struct net_device *netdev,
                    struct mlx5_core_dev *mdev)
  {
-       memset(priv, 0, sizeof(*priv));
        /* priv init */
        priv->mdev        = mdev;
        priv->netdev      = netdev;
@@@ -5556,12 -5532,18 +5580,18 @@@ void mlx5e_priv_cleanup(struct mlx5e_pr
  {
        int i;
  
+       /* bail if change profile failed and also rollback failed */
+       if (!priv->mdev)
+               return;
        destroy_workqueue(priv->wq);
        free_cpumask_var(priv->scratchpad.cpumask);
  
        for (i = 0; i < priv->htb.max_qos_sqs; i++)
                kfree(priv->htb.qos_sq_stats[i]);
        kvfree(priv->htb.qos_sq_stats);
+       memset(priv, 0, sizeof(*priv));
  }
  
  struct net_device *
@@@ -5678,11 -5660,10 +5708,10 @@@ void mlx5e_detach_netdev(struct mlx5e_p
  }
  
  static int
- mlx5e_netdev_attach_profile(struct mlx5e_priv *priv,
+ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev,
                            const struct mlx5e_profile *new_profile, void *new_ppriv)
  {
-       struct net_device *netdev = priv->netdev;
-       struct mlx5_core_dev *mdev = priv->mdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
        int err;
  
        err = mlx5e_priv_init(priv, netdev, mdev);
        priv->ppriv = new_ppriv;
        err = new_profile->init(priv->mdev, priv->netdev);
        if (err)
-               return err;
+               goto priv_cleanup;
        err = mlx5e_attach_netdev(priv);
        if (err)
-               new_profile->cleanup(priv);
+               goto profile_cleanup;
+       return err;
+ profile_cleanup:
+       new_profile->cleanup(priv);
+ priv_cleanup:
+       mlx5e_priv_cleanup(priv);
        return err;
  }
  
@@@ -5707,13 -5694,14 +5742,14 @@@ int mlx5e_netdev_change_profile(struct 
  {
        unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile);
        const struct mlx5e_profile *orig_profile = priv->profile;
+       struct net_device *netdev = priv->netdev;
+       struct mlx5_core_dev *mdev = priv->mdev;
        void *orig_ppriv = priv->ppriv;
        int err, rollback_err;
  
        /* sanity */
        if (new_max_nch != priv->max_nch) {
-               netdev_warn(priv->netdev,
-                           "%s: Replacing profile with different max channels\n",
+               netdev_warn(netdev, "%s: Replacing profile with different max channels\n",
                            __func__);
                return -EINVAL;
        }
        priv->profile->cleanup(priv);
        mlx5e_priv_cleanup(priv);
  
-       err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv);
+       err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv);
        if (err) { /* roll back to original profile */
-               netdev_warn(priv->netdev, "%s: new profile init failed, %d\n",
-                           __func__, err);
+               netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err);
                goto rollback;
        }
  
        return 0;
  
  rollback:
-       rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv);
-       if (rollback_err) {
-               netdev_err(priv->netdev,
-                          "%s: failed to rollback to orig profile, %d\n",
+       rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv);
+       if (rollback_err)
+               netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n",
                           __func__, rollback_err);
-       }
        return err;
  }
  
 +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv)
 +{
 +      mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL);
 +}
 +
  void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
  {
        struct net_device *netdev = priv->netdev;
@@@ -5829,17 -5809,10 +5862,17 @@@ static int mlx5e_probe(struct auxiliary
  
        priv->profile = profile;
        priv->ppriv = NULL;
 +
 +      err = mlx5e_devlink_port_register(priv);
 +      if (err) {
 +              mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
 +              goto err_destroy_netdev;
 +      }
 +
        err = profile->init(mdev, netdev);
        if (err) {
                mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err);
 -              goto err_destroy_netdev;
 +              goto err_devlink_cleanup;
        }
  
        err = mlx5e_resume(adev);
        mlx5e_devlink_port_type_eth_set(priv);
  
        mlx5e_dcbnl_init_app(priv);
 +      mlx5_uplink_netdev_set(mdev, netdev);
        return 0;
  
  err_resume:
        mlx5e_suspend(adev, state);
  err_profile_cleanup:
        profile->cleanup(priv);
 +err_devlink_cleanup:
 +      mlx5e_devlink_port_unregister(priv);
  err_destroy_netdev:
        mlx5e_destroy_netdev(priv);
        return err;
@@@ -5880,7 -5850,6 +5913,7 @@@ static void mlx5e_remove(struct auxilia
        unregister_netdev(priv->netdev);
        mlx5e_suspend(adev, state);
        priv->profile->cleanup(priv);
 +      mlx5e_devlink_port_unregister(priv);
        mlx5e_destroy_netdev(priv);
  }
  
@@@ -5906,18 -5875,18 +5939,18 @@@ int mlx5e_init(void
  
        mlx5e_ipsec_build_inverse_table();
        mlx5e_build_ptys2ethtool_map();
 -      ret = mlx5e_rep_init();
 +      ret = auxiliary_driver_register(&mlx5e_driver);
        if (ret)
                return ret;
  
 -      ret = auxiliary_driver_register(&mlx5e_driver);
 +      ret = mlx5e_rep_init();
        if (ret)
 -              mlx5e_rep_cleanup();
 +              auxiliary_driver_unregister(&mlx5e_driver);
        return ret;
  }
  
  void mlx5e_cleanup(void)
  {
 -      auxiliary_driver_unregister(&mlx5e_driver);
        mlx5e_rep_cleanup();
 +      auxiliary_driver_unregister(&mlx5e_driver);
  }
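
The reordering above makes module init and cleanup symmetric: the auxiliary driver is registered first and the rep support second, the failure path undoes only the earlier step, and mlx5e_cleanup() tears the two down in reverse order. A small stand-alone sketch of that register/unregister symmetry (placeholder functions, not the real mlx5e calls):

#include <stdio.h>

/* Placeholder registration steps, standing in for the two real calls. */
static int register_driver(void)    { puts("register driver");   return 0; }
static void unregister_driver(void) { puts("unregister driver"); }
static int register_reps(void)      { puts("register reps");     return 0; }
static void unregister_reps(void)   { puts("unregister reps"); }

static int init_sketch(void)
{
	int ret;

	ret = register_driver();
	if (ret)
		return ret;

	ret = register_reps();
	if (ret)
		unregister_driver();	/* failure undoes the earlier step */
	return ret;
}

static void cleanup_sketch(void)
{
	/* tear down in reverse registration order */
	unregister_reps();
	unregister_driver();
}

int main(void)
{
	if (!init_sketch())
		cleanup_sketch();
	return 0;
}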
@@@ -52,7 -52,6 +52,7 @@@
  #include "en/health.h"
  #include "en/params.h"
  #include "devlink.h"
 +#include "en/devlink.h"
  
  static struct sk_buff *
  mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
@@@ -501,7 -500,6 +501,6 @@@ static int mlx5e_alloc_rx_mpwqe(struct 
        struct mlx5e_icosq *sq = rq->icosq;
        struct mlx5_wq_cyc *wq = &sq->wq;
        struct mlx5e_umr_wqe *umr_wqe;
-       u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
        u16 pi;
        int err;
        int i;
        umr_wqe->ctrl.opmod_idx_opcode =
                cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
                            MLX5_OPCODE_UMR);
-       umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
+       umr_wqe->uctrl.xlt_offset =
+               cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix)));
  
        sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
                .wqe_type   = MLX5E_ICOSQ_WQE_UMR_RX,
@@@ -670,7 -669,6 +670,7 @@@ int mlx5e_poll_ico_cq(struct mlx5e_cq *
                                                 get_cqe_opcode(cqe));
                                mlx5e_dump_error_cqe(&sq->cq, sq->sqn,
                                                     (struct mlx5_err_cqe *)cqe);
 +                              mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs);
                                if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
                                        queue_work(cq->priv->wq, &sq->recover_work);
                                break;
@@@ -1824,7 -1822,6 +1824,7 @@@ static void mlx5e_trap_handle_rx_cqe(st
        struct mlx5e_priv *priv = netdev_priv(rq->netdev);
        struct mlx5_wq_cyc *wq = &rq->wqe.wq;
        struct mlx5e_wqe_frag_info *wi;
 +      struct devlink_port *dl_port;
        struct sk_buff *skb;
        u32 cqe_bcnt;
        u16 trap_id;
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
        skb_push(skb, ETH_HLEN);
  
 -      mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port);
 +      dl_port = mlx5e_devlink_get_dl_port(priv);
 +      mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port);
        dev_kfree_skb_any(skb);
  
  free_wqe:
@@@ -445,16 -445,12 +445,16 @@@ static void mlx5e_hairpin_destroy_trans
        mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
  }
  
 -static void mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
 +static int mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
  {
 -      u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE], rqn;
 +      u32 *indirection_rqt, rqn;
        struct mlx5e_priv *priv = hp->func_priv;
        int i, ix, sz = MLX5E_INDIR_RQT_SIZE;
  
 +      indirection_rqt = kcalloc(sz, sizeof(*indirection_rqt), GFP_KERNEL);
 +      if (!indirection_rqt)
 +              return -ENOMEM;
 +
        mlx5e_build_default_indir_rqt(indirection_rqt, sz,
                                      hp->num_channels);
  
                rqn = hp->pair->rqn[ix];
                MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
        }
 +
 +      kfree(indirection_rqt);
 +      return 0;
  }
  
  static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
        MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
        MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
  
 -      mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
 +      err = mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
 +      if (err)
 +              goto out;
  
        err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn);
        if (!err)
                hp->indir_rqt.enabled = true;
  
 +out:
        kvfree(in);
        return err;
  }
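
The hairpin change above moves the MLX5E_INDIR_RQT_SIZE-entry indirection table off the stack onto the heap and lets the fill helper return -ENOMEM, which the caller turns into an early exit before creating the RQT. A userspace sketch of the same count-times-element-size allocation plus a default round-robin spread of channels across the table (ENTRIES, NUM_CHANNELS and build_default_indir() are stand-ins, not the mlx5e API):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES      256	/* stand-in for MLX5E_INDIR_RQT_SIZE */
#define NUM_CHANNELS 6

/* Spread channel indices round-robin across the indirection table. */
static void build_default_indir(uint32_t *tbl, int len, int num_channels)
{
	for (int i = 0; i < len; i++)
		tbl[i] = i % num_channels;
}

int main(void)
{
	/* count * element-size allocation, the same shape as kcalloc() */
	uint32_t *tbl = calloc(ENTRIES, sizeof(*tbl));

	if (!tbl)
		return 1;	/* analogous to returning -ENOMEM */

	build_default_indir(tbl, ENTRIES, NUM_CHANNELS);
	printf("entry[0]=%u entry[255]=%u\n", tbl[0], tbl[ENTRIES - 1]);
	free(tbl);
	return 0;
}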
@@@ -1087,23 -1077,19 +1087,23 @@@ mlx5e_tc_offload_fdb_rules(struct mlx5_
        if (flow_flag_test(flow, CT)) {
                mod_hdr_acts = &attr->parse_attr->mod_hdr_acts;
  
 -              return mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
 +              rule = mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
                                               flow, spec, attr,
                                               mod_hdr_acts);
 +      } else {
 +              rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
        }
  
 -      rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
        if (IS_ERR(rule))
                return rule;
  
        if (attr->esw_attr->split_count) {
                flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
                if (IS_ERR(flow->rule[1])) {
 -                      mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
 +                      if (flow_flag_test(flow, CT))
 +                              mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
 +                      else
 +                              mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
                        return flow->rule[1];
                }
        }
@@@ -1961,10 -1947,6 +1961,10 @@@ static int __parse_cls_flower(struct ml
                                    misc_parameters);
        void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                                    misc_parameters);
 +      void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
 +                                  misc_parameters_3);
 +      void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 +                                  misc_parameters_3);
        struct flow_rule *rule = flow_cls_offload_flow_rule(f);
        struct flow_dissector *dissector = rule->match.dissector;
        u16 addr_type = 0;
              BIT(FLOW_DISSECTOR_KEY_CT) |
              BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
              BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
 +            BIT(FLOW_DISSECTOR_KEY_ICMP) |
              BIT(FLOW_DISSECTOR_KEY_MPLS))) {
                NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
                netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n",
                if (match.mask->flags)
                        *match_level = MLX5_MATCH_L4;
        }
 +      if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) {
 +              struct flow_match_icmp match;
  
 +              flow_rule_match_icmp(rule, &match);
 +              switch (ip_proto) {
 +              case IPPROTO_ICMP:
 +                      if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
 +                            MLX5_FLEX_PROTO_ICMP))
 +                              return -EOPNOTSUPP;
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type,
 +                               match.mask->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type,
 +                               match.key->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code,
 +                               match.mask->code);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code,
 +                               match.key->code);
 +                      break;
 +              case IPPROTO_ICMPV6:
 +                      if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
 +                            MLX5_FLEX_PROTO_ICMPV6))
 +                              return -EOPNOTSUPP;
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type,
 +                               match.mask->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type,
 +                               match.key->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code,
 +                               match.mask->code);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code,
 +                               match.key->code);
 +                      break;
 +              default:
 +                      NL_SET_ERR_MSG_MOD(extack,
 +                                         "Code and type matching only with ICMP and ICMPv6");
 +                      netdev_err(priv->netdev,
 +                                 "Code and type matching only with ICMP and ICMPv6\n");
 +                      return -EINVAL;
 +              }
 +              if (match.mask->code || match.mask->type) {
 +                      *match_level = MLX5_MATCH_L4;
 +                      spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
 +              }
 +      }
+       /* Currently supported only for MPLS over UDP */
+       if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
+           !netif_is_bareudp(filter_dev)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Matching on MPLS is supported only for MPLS over UDP");
+               netdev_err(priv->netdev,
+                          "Matching on MPLS is supported only for MPLS over UDP\n");
+               return -EOPNOTSUPP;
+       }
        return 0;
  }
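
The new FLOW_DISSECTOR_KEY_ICMP handling above only offloads ICMP/ICMPv6 type and code matches when the device advertises the matching flex-parser capability, and rejects type/code matching for any other IP protocol. A self-contained sketch of just that gating decision (the constants are simplified stand-ins for the MLX5_FLEX_PROTO_* capability bits):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the flex-parser capability bits and protocols. */
#define FLEX_PROTO_ICMP    (1u << 8)
#define FLEX_PROTO_ICMPV6  (1u << 9)
#define PROTO_ICMP   1
#define PROTO_ICMPV6 58

/* 0 when a type/code match can be offloaded for this protocol with these
 * capabilities, negative otherwise, mirroring the decisions in the hunk.
 */
static int icmp_match_supported(uint8_t ip_proto, uint32_t flex_caps)
{
	switch (ip_proto) {
	case PROTO_ICMP:
		return (flex_caps & FLEX_PROTO_ICMP) ? 0 : -95;	/* -EOPNOTSUPP */
	case PROTO_ICMPV6:
		return (flex_caps & FLEX_PROTO_ICMPV6) ? 0 : -95;
	default:
		return -22;	/* -EINVAL: type/code only exist for ICMP/ICMPv6 */
	}
}

int main(void)
{
	printf("icmp with cap:      %d\n",
	       icmp_match_supported(PROTO_ICMP, FLEX_PROTO_ICMP));
	printf("icmpv6 without cap: %d\n",
	       icmp_match_supported(PROTO_ICMPV6, FLEX_PROTO_ICMP));
	printf("tcp:                %d\n",
	       icmp_match_supported(6, FLEX_PROTO_ICMP | FLEX_PROTO_ICMPV6));
	return 0;
}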
  
@@@ -2960,6 -2909,37 +2970,37 @@@ static int is_action_keys_supported(con
        return 0;
  }
  
+ static bool modify_tuple_supported(bool modify_tuple, bool ct_clear,
+                                  bool ct_flow, struct netlink_ext_ack *extack,
+                                  struct mlx5e_priv *priv,
+                                  struct mlx5_flow_spec *spec)
+ {
+       if (!modify_tuple || ct_clear)
+               return true;
+       if (ct_flow) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with non-clear ct()");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with non-clear ct()");
+               return false;
+       }
+       /* Add ct_state=-trk match so it will be offloaded for non ct flows
+        * (or after clear action), as otherwise, since the tuple is changed,
+        * we can't restore ct state
+        */
+       if (mlx5_tc_ct_add_no_trk_match(spec)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with ct matches and no ct(clear) action");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with ct matches and no ct(clear) action");
+               return false;
+       }
+       return true;
+ }
+
  static bool modify_header_match_supported(struct mlx5e_priv *priv,
                                          struct mlx5_flow_spec *spec,
                                          struct flow_action *flow_action,
                        return err;
        }
  
-       /* Add ct_state=-trk match so it will be offloaded for non ct flows
-        * (or after clear action), as otherwise, since the tuple is changed,
-        *  we can't restore ct state
-        */
-       if (!ct_clear && modify_tuple &&
-           mlx5_tc_ct_add_no_trk_match(spec)) {
-               NL_SET_ERR_MSG_MOD(extack,
-                                  "can't offload tuple modify header with ct matches");
-               netdev_info(priv->netdev,
-                           "can't offload tuple modify header with ct matches");
+       if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack,
+                                   priv, spec))
                return false;
-       }
  
        ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
        if (modify_ip_header && ip_proto != IPPROTO_TCP &&
@@@ -3040,8 -3011,7 +3072,8 @@@ static bool actions_match_supported(str
        actions = flow->attr->action;
  
        if (mlx5e_is_eswitch_flow(flow)) {
 -              if (flow->attr->esw_attr->split_count && ct_flow) {
 +              if (flow->attr->esw_attr->split_count && ct_flow &&
 +                  !MLX5_CAP_GEN(flow->attr->esw_attr->in_mdev, reg_c_preserve)) {
                        /* All registers used by ct are cleared when using
                         * split rules.
                         */
@@@ -3841,7 -3811,6 +3873,7 @@@ static int parse_tc_fdb_actions(struct 
                                return err;
  
                        flow_flag_set(flow, CT);
 +                      esw_attr->split_count = esw_attr->out_count;
                        break;
                default:
                        NL_SET_ERR_MSG_MOD(extack, "The offload action is not supported");
                        return -EOPNOTSUPP;
                }
  
 -              if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 -                      NL_SET_ERR_MSG_MOD(extack,
 -                                         "Mirroring goto chain rules isn't supported");
 -                      return -EOPNOTSUPP;
 -              }
                attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
        }
  
@@@ -4323,11 -4297,6 +4355,11 @@@ int mlx5e_configure_flower(struct net_d
        struct mlx5e_tc_flow *flow;
        int err = 0;
  
 +      if (!mlx5_esw_hold(priv->mdev))
 +              return -EAGAIN;
 +
 +      mlx5_esw_get(priv->mdev);
 +
        rcu_read_lock();
        flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params);
        if (flow) {
@@@ -4365,14 -4334,11 +4397,14 @@@ rcu_unlock
        if (err)
                goto err_free;
  
 +      mlx5_esw_release(priv->mdev);
        return 0;
  
  err_free:
        mlx5e_flow_put(priv, flow);
  out:
 +      mlx5_esw_put(priv->mdev);
 +      mlx5_esw_release(priv->mdev);
        return err;
  }
  
@@@ -4412,7 -4378,6 +4444,7 @@@ int mlx5e_delete_flower(struct net_devi
        trace_mlx5e_delete_flower(f);
        mlx5e_flow_put(priv, flow);
  
 +      mlx5_esw_put(priv->mdev);
        return 0;
  
  errout:
@@@ -4512,7 -4477,8 +4544,8 @@@ static int apply_police_params(struct m
         */
        if (rate) {
                rate = (rate * BITS_PER_BYTE) + 500000;
-               rate_mbps = max_t(u64, do_div(rate, 1000000), 1);
+               do_div(rate, 1000000);
+               rate_mbps = max_t(u32, rate, 1);
        }
  
        err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps);
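
The police-rate hunk above matters because do_div() divides its first argument in place and evaluates to the remainder, so the old max_t(u64, do_div(rate, 1000000), 1) fed the remainder rather than the Mbit/s quotient into the clamp. A userspace sketch with a local stand-in for the kernel macro:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel do_div(): divides *n in place by base and
 * returns the remainder, which is exactly why its return value must not be
 * treated as the quotient.
 */
static uint32_t sketch_do_div(uint64_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}

int main(void)
{
	uint64_t bytes_ps = 125000000ULL;		/* 1 Gbit/s expressed in bytes/s */
	uint64_t rate = bytes_ps * 8 + 500000;		/* bits/s plus rounding term */
	uint64_t old_style = rate;
	uint32_t wrong, right;

	/* old pattern: the remainder ends up being used as the rate */
	wrong = sketch_do_div(&old_style, 1000000);
	wrong = wrong ? wrong : 1;

	/* fixed pattern: divide first, then clamp the quotient to at least 1 */
	sketch_do_div(&rate, 1000000);
	right = rate ? (uint32_t)rate : 1;

	printf("wrong=%u Mbit/s  right=%u Mbit/s\n", wrong, right);
	return 0;
}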
@@@ -4547,10 -4513,6 +4580,10 @@@ static int scan_tc_matchall_fdb_actions
        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                case FLOW_ACTION_POLICE:
 +                      if (act->police.rate_pkt_ps) {
 +                              NL_SET_ERR_MSG_MOD(extack, "QoS offload not support packets per second");
 +                              return -EOPNOTSUPP;
 +                      }
                        err = apply_police_params(priv, act->police.rate_bytes_ps, extack);
                        if (err)
                                return err;
@@@ -4717,6 -4679,10 +4750,6 @@@ int mlx5e_tc_nic_init(struct mlx5e_pri
  
        tc->ct = mlx5_tc_ct_init(priv, tc->chains, &priv->fs.tc.mod_hdr,
                                 MLX5_FLOW_NAMESPACE_KERNEL);
 -      if (IS_ERR(tc->ct)) {
 -              err = PTR_ERR(tc->ct);
 -              goto err_ct;
 -      }
  
        tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event;
        err = register_netdevice_notifier_dev_net(priv->netdev,
  
  err_reg:
        mlx5_tc_ct_clean(tc->ct);
 -err_ct:
        mlx5_chains_destroy(tc->chains);
  err_chains:
        rhashtable_destroy(&tc->ht);
@@@ -4790,6 -4757,8 +4823,6 @@@ int mlx5e_tc_esw_init(struct rhashtabl
                                               esw_chains(esw),
                                               &esw->offloads.mod_hdr,
                                               MLX5_FLOW_NAMESPACE_FDB);
 -      if (IS_ERR(uplink_priv->ct_priv))
 -              goto err_ct;
  
        mapping = mapping_create(sizeof(struct tunnel_match_key),
                                 TUNNEL_INFO_BITS_MASK, true);
@@@ -4829,6 -4798,7 +4862,6 @@@ err_enc_opts_mapping
        mapping_destroy(uplink_priv->tunnel_mapping);
  err_tun_mapping:
        mlx5_tc_ct_clean(uplink_priv->ct_priv);
 -err_ct:
        netdev_warn(priv->netdev,
                    "Failed to initialize tc (eswitch), err: %d", err);
        return err;
@@@ -4901,17 -4871,9 +4934,17 @@@ static int mlx5e_setup_tc_cls_flower(st
  int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
                            void *cb_priv)
  {
 -      unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
 +      unsigned long flags = MLX5_TC_FLAG(INGRESS);
        struct mlx5e_priv *priv = cb_priv;
  
 +      if (!priv->netdev || !netif_device_present(priv->netdev))
 +              return -EOPNOTSUPP;
 +
 +      if (mlx5e_is_uplink_rep(priv))
 +              flags |= MLX5_TC_FLAG(ESW_OFFLOAD);
 +      else
 +              flags |= MLX5_TC_FLAG(NIC_OFFLOAD);
 +
        switch (type) {
        case TC_SETUP_CLSFLOWER:
                return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
@@@ -40,6 -40,7 +40,6 @@@
  #include "eswitch.h"
  #include "esw/indir_table.h"
  #include "esw/acl/ofld.h"
 -#include "esw/indir_table.h"
  #include "rdma.h"
  #include "en.h"
  #include "fs_core.h"
@@@ -550,7 -551,8 +550,8 @@@ esw_setup_dests(struct mlx5_flow_destin
  
        if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
            MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) &&
-           mlx5_eswitch_vport_match_metadata_enabled(esw))
+           mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+           MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level))
                attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
  
        if (attr->dest_ft) {
@@@ -1445,7 -1447,7 +1446,7 @@@ esw_add_restore_rule(struct mlx5_eswitc
        if (!mlx5_eswitch_reg_c1_loopback_supported(esw))
                return ERR_PTR(-EOPNOTSUPP);
  
 -      spec = kzalloc(sizeof(*spec), GFP_KERNEL);
 +      spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec)
                return ERR_PTR(-ENOMEM);
  
        dest.ft = esw->offloads.ft_offloads;
  
        flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
 -      kfree(spec);
 +      kvfree(spec);
  
        if (IS_ERR(flow_rule))
                esw_warn(esw->dev,
@@@ -1853,7 -1855,6 +1854,7 @@@ static void esw_destroy_offloads_fdb_ta
        /* Holds true only as long as DMFS is the default */
        mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns,
                                     MLX5_FLOW_STEERING_MODE_DMFS);
 +      atomic64_set(&esw->user_count, 0);
  }
  
  static int esw_create_offloads_table(struct mlx5_eswitch *esw)
@@@ -2259,11 -2260,9 +2260,11 @@@ int esw_offloads_load_rep(struct mlx5_e
        if (esw->mode != MLX5_ESWITCH_OFFLOADS)
                return 0;
  
 -      err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
 -      if (err)
 -              return err;
 +      if (vport_num != MLX5_VPORT_UPLINK) {
 +              err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
 +              if (err)
 +                      return err;
 +      }
  
        err = mlx5_esw_offloads_rep_load(esw, vport_num);
        if (err)
        return err;
  
  load_err:
 -      mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
 +      if (vport_num != MLX5_VPORT_UPLINK)
 +              mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
        return err;
  }
  
@@@ -2282,9 -2280,7 +2283,9 @@@ void esw_offloads_unload_rep(struct mlx
                return;
  
        mlx5_esw_offloads_rep_unload(esw, vport_num);
 -      mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
 +
 +      if (vport_num != MLX5_VPORT_UPLINK)
 +              mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
  }
  
  #define ESW_OFFLOADS_DEVCOM_PAIR      (0)
@@@ -2559,9 -2555,6 +2560,9 @@@ static int esw_create_uplink_offloads_a
        struct mlx5_vport *vport;
  
        vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
 +      if (IS_ERR(vport))
 +              return PTR_ERR(vport);
 +
        return esw_vport_create_offloads_acl_tables(esw, vport);
  }
  
@@@ -2570,9 -2563,6 +2571,9 @@@ static void esw_destroy_uplink_offloads
        struct mlx5_vport *vport;
  
        vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
 +      if (IS_ERR(vport))
 +              return;
 +
        esw_vport_destroy_offloads_acl_tables(esw, vport);
  }
  
@@@ -2584,7 -2574,6 +2585,7 @@@ static int esw_offloads_steering_init(s
        memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb));
        mutex_init(&esw->fdb_table.offloads.vports.lock);
        hash_init(esw->fdb_table.offloads.vports.table);
 +      atomic64_set(&esw->user_count, 0);
  
        indir = mlx5_esw_indir_table_init();
        if (IS_ERR(indir)) {
@@@ -2926,14 -2915,8 +2927,14 @@@ int mlx5_devlink_eswitch_mode_set(struc
        if (esw_mode_from_devlink(mode, &mlx5_mode))
                return -EINVAL;
  
 -      mutex_lock(&esw->mode_lock);
 -      cur_mlx5_mode = esw->mode;
 +      err = mlx5_esw_try_lock(esw);
 +      if (err < 0) {
 +              NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
 +              return err;
 +      }
 +      cur_mlx5_mode = err;
 +      err = 0;
 +
        if (cur_mlx5_mode == mlx5_mode)
                goto unlock;
  
                err = -EINVAL;
  
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      mlx5_esw_unlock(esw);
        return err;
  }
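
The devlink mode-set hunk above replaces an unconditional mutex_lock() with mlx5_esw_try_lock(), which hands back the current eswitch mode when the lock is taken and a negative error when the eswitch is busy, so a mode change fails fast instead of blocking. A minimal sketch of that return-value convention (the names below are placeholders, not the mlx5 API):

#include <stdio.h>

#define MODE_LEGACY    0
#define MODE_SWITCHDEV 1
#define ERR_BUSY       (-16)

struct esw_sketch {
	int mode;
	int busy;	/* models outstanding users that block a mode change */
};

/* Either "take the lock" and return the current (non-negative) mode, or
 * report busy with a negative error, like the convention used above.
 */
static int try_lock_sketch(struct esw_sketch *esw)
{
	if (esw->busy)
		return ERR_BUSY;
	return esw->mode;
}

static int set_mode_sketch(struct esw_sketch *esw, int new_mode)
{
	int cur = try_lock_sketch(esw);

	if (cur < 0)
		return cur;		/* tell the caller to retry later */
	if (cur != new_mode)
		esw->mode = new_mode;	/* the actual switch happens under the lock */
	return 0;			/* the unlock would happen here */
}

int main(void)
{
	struct esw_sketch esw = { .mode = MODE_LEGACY, .busy = 0 };

	printf("switch: %d (mode now %d)\n",
	       set_mode_sketch(&esw, MODE_SWITCHDEV), esw.mode);
	esw.busy = 1;
	printf("busy:   %d\n", set_mode_sketch(&esw, MODE_LEGACY));
	return 0;
}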
  
@@@ -2958,14 -2941,14 +2959,14 @@@ int mlx5_devlink_eswitch_mode_get(struc
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        err = esw_mode_to_devlink(esw->mode, mode);
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -2981,7 -2964,7 +2982,7 @@@ int mlx5_devlink_eswitch_inline_mode_se
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto out;
        }
  
        esw->offloads.inline_mode = mlx5_mode;
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return 0;
  
  revert_inline_mode:
                                                 vport,
                                                 esw->offloads.inline_mode);
  out:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3043,14 -3026,14 +3044,14 @@@ int mlx5_devlink_eswitch_inline_mode_ge
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3066,7 -3049,7 +3067,7 @@@ int mlx5_devlink_eswitch_encap_mode_set
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
        }
  
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3127,14 -3110,14 +3128,14 @@@ int mlx5_devlink_eswitch_encap_mode_get
                return PTR_ERR(esw);
  
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        *encap = esw->offloads.encap;
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return 0;
  }
  
@@@ -233,6 -233,7 +233,7 @@@ int mlx5i_create_underlay_qp(struct mlx
        }
  
        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev));
        MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
        MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
@@@ -694,6 -695,7 +695,7 @@@ static int mlx5i_check_required_hca_cap
  static void mlx5_rdma_netdev_free(struct net_device *netdev)
  {
        struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+       struct mlx5_core_dev *mdev = priv->mdev;
        struct mlx5i_priv *ipriv = priv->ppriv;
        const struct mlx5e_profile *profile = priv->profile;
  
  
        if (!ipriv->sub_interface) {
                mlx5i_pkey_qpn_ht_cleanup(netdev);
-               mlx5e_destroy_mdev_resources(priv->mdev);
+               mlx5e_destroy_mdev_resources(mdev);
        }
  }
  
  static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
  {
 -      return mdev->mlx5e_res.pdn != 0;
 +      return mdev->mlx5e_res.hw_objs.pdn != 0;
  }
  
  static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
@@@ -5,7 -5,8 +5,9 @@@
  #include "priv.h"
  #include "sf.h"
  #include "mlx5_ifc_vhca_event.h"
 +#include "ecpf.h"
+ #include "vhca_event.h"
+ #include "mlx5_core.h"
  
  struct mlx5_sf_hw {
        u32 usr_sfnum;
@@@ -17,7 -18,6 +19,6 @@@ struct mlx5_sf_hw_table 
        struct mlx5_core_dev *dev;
        struct mlx5_sf_hw *sfs;
        int max_local_functions;
-       u8 ecpu: 1;
        struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */
        struct notifier_block vhca_nb;
  };
@@@ -63,7 -63,7 +64,7 @@@ int mlx5_sf_hw_table_sf_alloc(struct ml
        }
        if (sw_id == -ENOSPC) {
                err = -ENOSPC;
-               goto err;
+               goto exist_err;
        }
  
        hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id);
@@@ -71,7 -71,7 +72,7 @@@
        if (err)
                goto err;
  
-       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum);
+       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum);
        if (err)
                goto vhca_err;
  
@@@ -117,7 -117,7 +118,7 @@@ void mlx5_sf_hw_table_sf_deferred_free(
  
        hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id);
        mutex_lock(&table->table_lock);
-       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, table->ecpu, out, sizeof(out));
+       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out));
        if (err)
                goto err;
        state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state);
@@@ -163,7 -163,6 +164,6 @@@ int mlx5_sf_hw_table_init(struct mlx5_c
        table->dev = dev;
        table->sfs = sfs;
        table->max_local_functions = max_functions;
-       table->ecpu = mlx5_read_embedded_cpu(dev);
        dev->priv.sf_hw_table = table;
        mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions);
        return 0;
@@@ -264,8 -264,8 +264,8 @@@ static void dr_ste_v1_set_miss_addr(u8 
  static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p)
  {
        u64 index =
-               (MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
-                MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32) << 26);
+               ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
+                ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26);
  
        return index << 6;
  }
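
The dr_ste_v1_get_miss_addr() fix above is needed because the two MLX5_GET() reads are 32-bit values: OR-ing the high byte shifted left by 26 in 32-bit arithmetic drops the upper bits before the result is widened to u64. A small userspace illustration with made-up register values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical register contents: low 26 bits and high 8 bits of a
	 * 40-bit miss address, each read back as a 32-bit value.
	 */
	unsigned int lo26 = 0x2abcdef;
	unsigned int hi8  = 0xff;

	/* Without casts the shift and the OR happen in 32-bit arithmetic,
	 * so the high byte is lost before the result reaches the u64.
	 */
	uint64_t truncated = lo26 | (hi8 << 26);
	uint64_t correct   = (uint64_t)lo26 | ((uint64_t)hi8 << 26);

	printf("truncated=0x%llx correct=0x%llx\n",
	       (unsigned long long)(truncated << 6),
	       (unsigned long long)(correct << 6));
	return 0;
}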
@@@ -437,6 -437,21 +437,6 @@@ static void dr_ste_v1_set_rx_decap(u8 *
        dr_ste_v1_set_reparse(hw_ste_p);
  }
  
 -static void dr_ste_v1_set_rx_decap_l3(u8 *hw_ste_p,
 -                                    u8 *s_action,
 -                                    u16 decap_actions,
 -                                    u32 decap_index)
 -{
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, action_id,
 -               DR_STE_V1_ACTION_ID_MODIFY_LIST);
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, num_of_modify_actions,
 -               decap_actions);
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, modify_actions_ptr,
 -               decap_index);
 -
 -      dr_ste_v1_set_reparse(hw_ste_p);
 -}
 -
  static void dr_ste_v1_set_rewrite_actions(u8 *hw_ste_p,
                                          u8 *s_action,
                                          u16 num_of_actions,
@@@ -556,6 -571,9 +556,6 @@@ static void dr_ste_v1_set_actions_rx(st
        bool allow_ctr = true;
  
        if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
 -              dr_ste_v1_set_rx_decap_l3(last_ste, action,
 -                                        attr->decap_actions,
 -                                        attr->decap_index);
                dr_ste_v1_set_rewrite_actions(last_ste, action,
                                              attr->decap_actions,
                                              attr->decap_index);
@@@ -1514,7 -1532,6 +1514,7 @@@ static void dr_ste_v1_build_src_gvmi_qp
  
        DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_gvmi, misc_mask, source_port);
        DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_qp, misc_mask, source_sqn);
 +      misc_mask->source_eswitch_owner_vhca_id = 0;
  }
  
  static int dr_ste_v1_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value,
  #include "ionic_lif.h"
  #include "ionic_txrx.h"
  
 -static void ionic_rx_clean(struct ionic_queue *q,
 -                         struct ionic_desc_info *desc_info,
 -                         struct ionic_cq_info *cq_info,
 -                         void *cb_arg);
 -
 -static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
  
  static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
  
@@@ -34,149 -40,72 +34,149 @@@ static inline struct netdev_queue *q_to
        return netdev_get_tx_queue(q->lif->netdev, q->index);
  }
  
 -static struct sk_buff *ionic_rx_skb_alloc(struct ionic_queue *q,
 -                                        unsigned int len, bool frags)
 +static void ionic_rx_buf_reset(struct ionic_buf_info *buf_info)
 +{
 +      buf_info->page = NULL;
 +      buf_info->page_offset = 0;
 +      buf_info->dma_addr = 0;
 +}
 +
 +static int ionic_rx_page_alloc(struct ionic_queue *q,
 +                             struct ionic_buf_info *buf_info)
  {
 -      struct ionic_lif *lif = q->lif;
 +      struct net_device *netdev = q->lif->netdev;
        struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
 -      struct sk_buff *skb;
 +      struct device *dev;
  
 -      netdev = lif->netdev;
 -      stats = &q->lif->rxqstats[q->index];
 +      dev = q->dev;
 +      stats = q_to_rx_stats(q);
  
 -      if (frags)
 -              skb = napi_get_frags(&q_to_qcq(q)->napi);
 -      else
 -              skb = netdev_alloc_skb_ip_align(netdev, len);
 +      if (unlikely(!buf_info)) {
 +              net_err_ratelimited("%s: %s invalid buf_info in alloc\n",
 +                                  netdev->name, q->name);
 +              return -EINVAL;
 +      }
  
 -      if (unlikely(!skb)) {
 -              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 -                                   netdev->name, q->name);
 +      buf_info->page = alloc_pages(IONIC_PAGE_GFP_MASK, 0);
 +      if (unlikely(!buf_info->page)) {
 +              net_err_ratelimited("%s: %s page alloc failed\n",
 +                                  netdev->name, q->name);
                stats->alloc_err++;
 -              return NULL;
 +              return -ENOMEM;
        }
 +      buf_info->page_offset = 0;
  
 -      return skb;
 +      buf_info->dma_addr = dma_map_page(dev, buf_info->page, buf_info->page_offset,
 +                                        IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +      if (unlikely(dma_mapping_error(dev, buf_info->dma_addr))) {
 +              __free_pages(buf_info->page, 0);
 +              ionic_rx_buf_reset(buf_info);
 +              net_err_ratelimited("%s: %s dma map failed\n",
 +                                  netdev->name, q->name);
 +              stats->dma_map_err++;
 +              return -EIO;
 +      }
 +
 +      return 0;
 +}
 +
 +static void ionic_rx_page_free(struct ionic_queue *q,
 +                             struct ionic_buf_info *buf_info)
 +{
 +      struct net_device *netdev = q->lif->netdev;
 +      struct device *dev = q->dev;
 +
 +      if (unlikely(!buf_info)) {
 +              net_err_ratelimited("%s: %s invalid buf_info in free\n",
 +                                  netdev->name, q->name);
 +              return;
 +      }
 +
 +      if (!buf_info->page)
 +              return;
 +
 +      dma_unmap_page(dev, buf_info->dma_addr, IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +      __free_pages(buf_info->page, 0);
 +      ionic_rx_buf_reset(buf_info);
 +}
 +
 +static bool ionic_rx_buf_recycle(struct ionic_queue *q,
 +                               struct ionic_buf_info *buf_info, u32 used)
 +{
 +      u32 size;
 +
 +      /* don't re-use pages allocated in low-mem condition */
 +      if (page_is_pfmemalloc(buf_info->page))
 +              return false;
 +
 +      /* don't re-use buffers from non-local numa nodes */
 +      if (page_to_nid(buf_info->page) != numa_mem_id())
 +              return false;
 +
 +      size = ALIGN(used, IONIC_PAGE_SPLIT_SZ);
 +      buf_info->page_offset += size;
 +      if (buf_info->page_offset >= IONIC_PAGE_SIZE)
 +              return false;
 +
 +      get_page(buf_info->page);
 +
 +      return true;
  }
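
The recycle helper above implements the new page-splitting scheme: a received buffer's page offset is advanced by the used length rounded up to IONIC_PAGE_SPLIT_SZ, and the page is reused with an extra reference until the offset runs past IONIC_PAGE_SIZE (pfmemalloc and remote-NUMA pages are never reused). A toy model of the offset-advance decision, with made-up sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ       4096u	/* stand-in for IONIC_PAGE_SIZE */
#define PAGE_SPLIT_SZ 2048u	/* stand-in for IONIC_PAGE_SPLIT_SZ */

struct buf {
	uint32_t page_offset;
	int refs;
};

/* Round x up to the next multiple of a power-of-two 'a', like ALIGN(). */
static uint32_t align_up(uint32_t x, uint32_t a)
{
	return (x + a - 1) & ~(a - 1);
}

/* Advance into the page by the aligned used size; recycle (take a ref)
 * only while the next chunk still fits inside the page.
 */
static bool buf_recycle(struct buf *b, uint32_t used)
{
	b->page_offset += align_up(used, PAGE_SPLIT_SZ);
	if (b->page_offset >= PAGE_SZ)
		return false;	/* page exhausted, caller unmaps and refills */
	b->refs++;		/* models get_page() on the recycled half */
	return true;
}

int main(void)
{
	struct buf b = { .page_offset = 0, .refs = 1 };

	printf("first  use: recycle=%d offset=%u\n", buf_recycle(&b, 1500), b.page_offset);
	printf("second use: recycle=%d offset=%u\n", buf_recycle(&b, 1500), b.page_offset);
	return 0;
}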
  
  static struct sk_buff *ionic_rx_frags(struct ionic_queue *q,
                                      struct ionic_desc_info *desc_info,
 -                                    struct ionic_cq_info *cq_info)
 +                                    struct ionic_rxq_comp *comp)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      struct ionic_page_info *page_info;
 +      struct net_device *netdev = q->lif->netdev;
 +      struct ionic_buf_info *buf_info;
 +      struct ionic_rx_stats *stats;
 +      struct device *dev = q->dev;
        struct sk_buff *skb;
        unsigned int i;
        u16 frag_len;
        u16 len;
  
 -      page_info = &desc_info->pages[0];
 +      stats = q_to_rx_stats(q);
 +
 +      buf_info = &desc_info->bufs[0];
        len = le16_to_cpu(comp->len);
  
 -      prefetch(page_address(page_info->page) + NET_IP_ALIGN);
 +      prefetch(buf_info->page);
  
 -      skb = ionic_rx_skb_alloc(q, len, true);
 -      if (unlikely(!skb))
 +      skb = napi_get_frags(&q_to_qcq(q)->napi);
 +      if (unlikely(!skb)) {
 +              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 +                                   netdev->name, q->name);
 +              stats->alloc_err++;
                return NULL;
 +      }
  
        i = comp->num_sg_elems + 1;
        do {
 -              if (unlikely(!page_info->page)) {
 -                      struct napi_struct *napi = &q_to_qcq(q)->napi;
 -
 -                      napi->skb = NULL;
 +              if (unlikely(!buf_info->page)) {
                        dev_kfree_skb(skb);
                        return NULL;
                }
  
 -              frag_len = min(len, (u16)PAGE_SIZE);
 +              frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
                len -= frag_len;
  
 -              dma_unmap_page(dev, dma_unmap_addr(page_info, dma_addr),
 -                             PAGE_SIZE, DMA_FROM_DEVICE);
 +              dma_sync_single_for_cpu(dev,
 +                                      buf_info->dma_addr + buf_info->page_offset,
 +                                      frag_len, DMA_FROM_DEVICE);
 +
                skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 -                              page_info->page, 0, frag_len, PAGE_SIZE);
 -              page_info->page = NULL;
 -              page_info++;
 +                              buf_info->page, buf_info->page_offset, frag_len,
 +                              IONIC_PAGE_SIZE);
 +
 +              if (!ionic_rx_buf_recycle(q, buf_info, frag_len)) {
 +                      dma_unmap_page(dev, buf_info->dma_addr,
 +                                     IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +                      ionic_rx_buf_reset(buf_info);
 +              }
 +
 +              buf_info++;
 +
                i--;
        } while (i > 0);
  
  
  static struct sk_buff *ionic_rx_copybreak(struct ionic_queue *q,
                                          struct ionic_desc_info *desc_info,
 -                                        struct ionic_cq_info *cq_info)
 +                                        struct ionic_rxq_comp *comp)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      struct ionic_page_info *page_info;
 +      struct net_device *netdev = q->lif->netdev;
 +      struct ionic_buf_info *buf_info;
 +      struct ionic_rx_stats *stats;
 +      struct device *dev = q->dev;
        struct sk_buff *skb;
        u16 len;
  
 -      page_info = &desc_info->pages[0];
 +      stats = q_to_rx_stats(q);
 +
 +      buf_info = &desc_info->bufs[0];
        len = le16_to_cpu(comp->len);
  
 -      skb = ionic_rx_skb_alloc(q, len, false);
 -      if (unlikely(!skb))
 +      skb = napi_alloc_skb(&q_to_qcq(q)->napi, len);
 +      if (unlikely(!skb)) {
 +              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 +                                   netdev->name, q->name);
 +              stats->alloc_err++;
                return NULL;
 +      }
  
 -      if (unlikely(!page_info->page)) {
 +      if (unlikely(!buf_info->page)) {
                dev_kfree_skb(skb);
                return NULL;
        }
  
 -      dma_sync_single_for_cpu(dev, dma_unmap_addr(page_info, dma_addr),
 +      dma_sync_single_for_cpu(dev, buf_info->dma_addr + buf_info->page_offset,
                                len, DMA_FROM_DEVICE);
 -      skb_copy_to_linear_data(skb, page_address(page_info->page), len);
 -      dma_sync_single_for_device(dev, dma_unmap_addr(page_info, dma_addr),
 +      skb_copy_to_linear_data(skb, page_address(buf_info->page) + buf_info->page_offset, len);
 +      dma_sync_single_for_device(dev, buf_info->dma_addr + buf_info->page_offset,
                                   len, DMA_FROM_DEVICE);
  
        skb_put(skb, len);
@@@ -229,13 -151,14 +229,13 @@@ static void ionic_rx_clean(struct ionic
                           struct ionic_cq_info *cq_info,
                           void *cb_arg)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 +      struct ionic_rxq_comp *comp = cq_info->rxcq;
 +      struct net_device *netdev = q->lif->netdev;
        struct ionic_qcq *qcq = q_to_qcq(q);
        struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
        struct sk_buff *skb;
  
        stats = q_to_rx_stats(q);
 -      netdev = q->lif->netdev;
  
        if (comp->status) {
                stats->dropped++;
        stats->bytes += le16_to_cpu(comp->len);
  
        if (le16_to_cpu(comp->len) <= q->lif->rx_copybreak)
 -              skb = ionic_rx_copybreak(q, desc_info, cq_info);
 +              skb = ionic_rx_copybreak(q, desc_info, comp);
        else
 -              skb = ionic_rx_frags(q, desc_info, cq_info);
 +              skb = ionic_rx_frags(q, desc_info, comp);
  
        if (unlikely(!skb)) {
                stats->dropped++;
  
  static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 +      struct ionic_rxq_comp *comp = cq_info->rxcq;
        struct ionic_queue *q = cq->bound_q;
        struct ionic_desc_info *desc_info;
  
        return true;
  }
  
 -static int ionic_rx_page_alloc(struct ionic_queue *q,
 -                             struct ionic_page_info *page_info)
 -{
 -      struct ionic_lif *lif = q->lif;
 -      struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
 -      struct device *dev;
 -
 -      netdev = lif->netdev;
 -      dev = lif->ionic->dev;
 -      stats = q_to_rx_stats(q);
 -
 -      if (unlikely(!page_info)) {
 -              net_err_ratelimited("%s: %s invalid page_info in alloc\n",
 -                                  netdev->name, q->name);
 -              return -EINVAL;
 -      }
 -
 -      page_info->page = dev_alloc_page();
 -      if (unlikely(!page_info->page)) {
 -              net_err_ratelimited("%s: %s page alloc failed\n",
 -                                  netdev->name, q->name);
 -              stats->alloc_err++;
 -              return -ENOMEM;
 -      }
 -
 -      page_info->dma_addr = dma_map_page(dev, page_info->page, 0, PAGE_SIZE,
 -                                         DMA_FROM_DEVICE);
 -      if (unlikely(dma_mapping_error(dev, page_info->dma_addr))) {
 -              put_page(page_info->page);
 -              page_info->dma_addr = 0;
 -              page_info->page = NULL;
 -              net_err_ratelimited("%s: %s dma map failed\n",
 -                                  netdev->name, q->name);
 -              stats->dma_map_err++;
 -              return -EIO;
 -      }
 -
 -      return 0;
 -}
 -
 -static void ionic_rx_page_free(struct ionic_queue *q,
 -                             struct ionic_page_info *page_info)
 -{
 -      struct ionic_lif *lif = q->lif;
 -      struct net_device *netdev;
 -      struct device *dev;
 -
 -      netdev = lif->netdev;
 -      dev = lif->ionic->dev;
 -
 -      if (unlikely(!page_info)) {
 -              net_err_ratelimited("%s: %s invalid page_info in free\n",
 -                                  netdev->name, q->name);
 -              return;
 -      }
 -
 -      if (unlikely(!page_info->page)) {
 -              net_err_ratelimited("%s: %s invalid page in free\n",
 -                                  netdev->name, q->name);
 -              return;
 -      }
 -
 -      dma_unmap_page(dev, page_info->dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
 -
 -      put_page(page_info->page);
 -      page_info->dma_addr = 0;
 -      page_info->page = NULL;
 -}
 -
  void ionic_rx_fill(struct ionic_queue *q)
  {
        struct net_device *netdev = q->lif->netdev;
        struct ionic_desc_info *desc_info;
 -      struct ionic_page_info *page_info;
        struct ionic_rxq_sg_desc *sg_desc;
        struct ionic_rxq_sg_elem *sg_elem;
 +      struct ionic_buf_info *buf_info;
        struct ionic_rxq_desc *desc;
        unsigned int remain_len;
 -      unsigned int seg_len;
 +      unsigned int frag_len;
        unsigned int nfrags;
        unsigned int i, j;
        unsigned int len;
  
        len = netdev->mtu + ETH_HLEN + VLAN_HLEN;
 -      nfrags = round_up(len, PAGE_SIZE) / PAGE_SIZE;
  
        for (i = ionic_q_space_avail(q); i; i--) {
 +              nfrags = 0;
                remain_len = len;
                desc_info = &q->info[q->head_idx];
                desc = desc_info->desc;
 -              sg_desc = desc_info->sg_desc;
 -              page_info = &desc_info->pages[0];
 +              buf_info = &desc_info->bufs[0];
  
 -              if (page_info->page) { /* recycle the buffer */
 -                      ionic_rxq_post(q, false, ionic_rx_clean, NULL);
 -                      continue;
 -              }
 -
 -              /* fill main descriptor - pages[0] */
 -              desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
 -                                            IONIC_RXQ_DESC_OPCODE_SIMPLE;
 -              desc_info->npages = nfrags;
 -              if (unlikely(ionic_rx_page_alloc(q, page_info))) {
 -                      desc->addr = 0;
 -                      desc->len = 0;
 -                      return;
 +              if (!buf_info->page) { /* alloc a new buffer? */
 +                      if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
 +                              desc->addr = 0;
 +                              desc->len = 0;
 +                              return;
 +                      }
                }
 -              desc->addr = cpu_to_le64(page_info->dma_addr);
 -              seg_len = min_t(unsigned int, PAGE_SIZE, len);
 -              desc->len = cpu_to_le16(seg_len);
 -              remain_len -= seg_len;
 -              page_info++;
  
 -              /* fill sg descriptors - pages[1..n] */
 -              for (j = 0; j < nfrags - 1; j++) {
 -                      if (page_info->page) /* recycle the sg buffer */
 -                              continue;
 +              /* fill main descriptor - buf[0] */
 +              desc->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
 +              frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
 +              desc->len = cpu_to_le16(frag_len);
 +              remain_len -= frag_len;
 +              buf_info++;
 +              nfrags++;
  
 +              /* fill sg descriptors - buf[1..n] */
 +              sg_desc = desc_info->sg_desc;
 +              for (j = 0; remain_len > 0 && j < q->max_sg_elems; j++) {
                        sg_elem = &sg_desc->elems[j];
 -                      if (unlikely(ionic_rx_page_alloc(q, page_info))) {
 -                              sg_elem->addr = 0;
 -                              sg_elem->len = 0;
 -                              return;
 +                      if (!buf_info->page) { /* alloc a new sg buffer? */
 +                              if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
 +                                      sg_elem->addr = 0;
 +                                      sg_elem->len = 0;
 +                                      return;
 +                              }
                        }
 -                      sg_elem->addr = cpu_to_le64(page_info->dma_addr);
 -                      seg_len = min_t(unsigned int, PAGE_SIZE, remain_len);
 -                      sg_elem->len = cpu_to_le16(seg_len);
 -                      remain_len -= seg_len;
 -                      page_info++;
 +
 +                      sg_elem->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
 +                      frag_len = min_t(u16, remain_len, IONIC_PAGE_SIZE - buf_info->page_offset);
 +                      sg_elem->len = cpu_to_le16(frag_len);
 +                      remain_len -= frag_len;
 +                      buf_info++;
 +                      nfrags++;
 +              }
 +
 +              /* clear end sg element as a sentinel */
 +              if (j < q->max_sg_elems) {
 +                      sg_elem = &sg_desc->elems[j];
 +                      memset(sg_elem, 0, sizeof(*sg_elem));
                }
  
 +              desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
 +                                            IONIC_RXQ_DESC_OPCODE_SIMPLE;
 +              desc_info->nbufs = nfrags;
 +
                ionic_rxq_post(q, false, ionic_rx_clean, NULL);
        }
  
  void ionic_rx_empty(struct ionic_queue *q)
  {
        struct ionic_desc_info *desc_info;
 -      struct ionic_page_info *page_info;
 +      struct ionic_buf_info *buf_info;
        unsigned int i, j;
  
        for (i = 0; i < q->num_descs; i++) {
                desc_info = &q->info[i];
                for (j = 0; j < IONIC_RX_MAX_SG_ELEMS + 1; j++) {
 -                      page_info = &desc_info->pages[j];
 -                      if (page_info->page)
 -                              ionic_rx_page_free(q, page_info);
 +                      buf_info = &desc_info->bufs[j];
 +                      if (buf_info->page)
 +                              ionic_rx_page_free(q, buf_info);
                }
  
 -              desc_info->npages = 0;
 +              desc_info->nbufs = 0;
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        }
 +
 +      q->head_idx = 0;
 +      q->tail_idx = 0;
  }
  
  static void ionic_dim_update(struct ionic_qcq *qcq)
@@@ -542,7 -525,7 +542,7 @@@ int ionic_txrx_napi(struct napi_struct 
        idev = &lif->ionic->idev;
        txcq = &lif->txqcqs[qi]->cq;
  
 -      tx_work_done = ionic_cq_service(txcq, lif->tx_budget,
 +      tx_work_done = ionic_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT,
                                        ionic_tx_service, NULL, NULL);
  
        rx_work_done = ionic_cq_service(rxcq, budget,
@@@ -575,7 -558,7 +575,7 @@@ static dma_addr_t ionic_tx_map_single(s
                                      void *data, size_t len)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 +      struct device *dev = q->dev;
        dma_addr_t dma_addr;
  
        dma_addr = dma_map_single(dev, data, len, DMA_TO_DEVICE);
@@@ -593,7 -576,7 +593,7 @@@ static dma_addr_t ionic_tx_map_frag(str
                                    size_t offset, size_t len)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 +      struct device *dev = q->dev;
        dma_addr_t dma_addr;
  
        dma_addr = skb_frag_dma_map(dev, frag, offset, len, DMA_TO_DEVICE);
        return dma_addr;
  }
  
 +static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb,
 +                          struct ionic_desc_info *desc_info)
 +{
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
 +      struct device *dev = q->dev;
 +      dma_addr_t dma_addr;
 +      unsigned int nfrags;
 +      skb_frag_t *frag;
 +      int frag_idx;
 +
 +      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 +      if (dma_mapping_error(dev, dma_addr))
 +              return -EIO;
 +      buf_info->dma_addr = dma_addr;
 +      buf_info->len = skb_headlen(skb);
 +      buf_info++;
 +
 +      frag = skb_shinfo(skb)->frags;
 +      nfrags = skb_shinfo(skb)->nr_frags;
 +      for (frag_idx = 0; frag_idx < nfrags; frag_idx++, frag++) {
 +              dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag));
 +              if (dma_mapping_error(dev, dma_addr))
 +                      goto dma_fail;
 +              buf_info->dma_addr = dma_addr;
 +              buf_info->len = skb_frag_size(frag);
 +              buf_info++;
 +      }
 +
 +      desc_info->nbufs = 1 + nfrags;
 +
 +      return 0;
 +
 +dma_fail:
 +      /* unwind the frag mappings and the head mapping */
 +      while (frag_idx > 0) {
 +              frag_idx--;
 +              buf_info--;
 +              dma_unmap_page(dev, buf_info->dma_addr,
 +                             buf_info->len, DMA_TO_DEVICE);
 +      }
 +      dma_unmap_single(dev, buf_info->dma_addr, buf_info->len, DMA_TO_DEVICE);
 +      return -EIO;
 +}
 +
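
ionic_tx_map_skb() above maps the skb head and then each fragment, records the DMA address and length of each in the descriptor's buf_info[] array, and on a mapping failure unwinds only the mappings that already succeeded. A simplified stand-alone analog of that map-or-unwind flow (toy_map/toy_unmap are placeholders for the DMA mapping calls):

#include <stdio.h>

#define MAX_BUFS 8

/* Toy mapping table standing in for the per-descriptor buf_info[] array. */
struct toy_buf { int mapped; };

static int toy_map(struct toy_buf *b, int should_fail)
{
	if (should_fail)
		return -1;
	b->mapped = 1;
	return 0;
}

static void toy_unmap(struct toy_buf *b)
{
	b->mapped = 0;
}

/* Map the head plus nfrags fragments; on failure unwind only what was
 * actually mapped, a simplified analog of the dma_fail: path above.
 */
static int map_skb_sketch(struct toy_buf *bufs, int nfrags, int fail_at)
{
	int i;

	if (toy_map(&bufs[0], fail_at == 0))	/* head */
		return -1;

	for (i = 0; i < nfrags; i++) {
		if (toy_map(&bufs[1 + i], fail_at == 1 + i))
			goto unwind;
	}
	return 1 + nfrags;	/* nbufs recorded in the descriptor */

unwind:
	while (i-- > 0)
		toy_unmap(&bufs[1 + i]);	/* fragments, newest first */
	toy_unmap(&bufs[0]);			/* then the head mapping */
	return -1;
}

int main(void)
{
	struct toy_buf bufs[MAX_BUFS] = { 0 };

	printf("ok path:   %d\n", map_skb_sketch(bufs, 3, -1));
	printf("fail path: %d\n", map_skb_sketch(bufs, 3, 2));
	return 0;
}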
  static void ionic_tx_clean(struct ionic_queue *q,
                           struct ionic_desc_info *desc_info,
                           struct ionic_cq_info *cq_info,
                           void *cb_arg)
  {
 -      struct ionic_txq_sg_desc *sg_desc = desc_info->sg_desc;
 -      struct ionic_txq_sg_elem *elem = sg_desc->elems;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct ionic_txq_desc *desc = desc_info->desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      u8 opcode, flags, nsge;
 +      struct device *dev = q->dev;
        u16 queue_index;
        unsigned int i;
 -      u64 addr;
 -
 -      decode_txq_desc_cmd(le64_to_cpu(desc->cmd),
 -                          &opcode, &flags, &nsge, &addr);
 -
 -      /* use unmap_single only if either this is not TSO,
 -       * or this is first descriptor of a TSO
 -       */
 -      if (opcode != IONIC_TXQ_DESC_OPCODE_TSO ||
 -          flags & IONIC_TXQ_DESC_FLAG_TSO_SOT)
 -              dma_unmap_single(dev, (dma_addr_t)addr,
 -                               le16_to_cpu(desc->len), DMA_TO_DEVICE);
 -      else
 -              dma_unmap_page(dev, (dma_addr_t)addr,
 -                             le16_to_cpu(desc->len), DMA_TO_DEVICE);
  
 -      for (i = 0; i < nsge; i++, elem++)
 -              dma_unmap_page(dev, (dma_addr_t)le64_to_cpu(elem->addr),
 -                             le16_to_cpu(elem->len), DMA_TO_DEVICE);
 +      if (desc_info->nbufs) {
 +              dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
 +                               buf_info->len, DMA_TO_DEVICE);
 +              buf_info++;
 +              for (i = 1; i < desc_info->nbufs; i++, buf_info++)
 +                      dma_unmap_page(dev, (dma_addr_t)buf_info->dma_addr,
 +                                     buf_info->len, DMA_TO_DEVICE);
 +      }
  
        if (cb_arg) {
                struct sk_buff *skb = cb_arg;
 -              u32 len = skb->len;
  
                queue_index = skb_get_queue_mapping(skb);
                if (unlikely(__netif_subqueue_stopped(q->lif->netdev,
                        netif_wake_subqueue(q->lif->netdev, queue_index);
                        q->wake++;
                }
 -              dev_kfree_skb_any(skb);
 +
 +              desc_info->bytes = skb->len;
                stats->clean++;
 -              netdev_tx_completed_queue(q_to_ndq(q), 1, len);
 +
 +              dev_consume_skb_any(skb);
        }
  }
  
  static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
  {
 -      struct ionic_txq_comp *comp = cq_info->cq_desc;
 +      struct ionic_txq_comp *comp = cq_info->txcq;
        struct ionic_queue *q = cq->bound_q;
        struct ionic_desc_info *desc_info;
 +      int bytes = 0;
 +      int pkts = 0;
        u16 index;
  
        if (!color_match(comp->color, cq->done_color))
         */
        do {
                desc_info = &q->info[q->tail_idx];
 +              desc_info->bytes = 0;
                index = q->tail_idx;
                q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
                ionic_tx_clean(q, desc_info, cq_info, desc_info->cb_arg);
 +              if (desc_info->cb_arg) {
 +                      pkts++;
 +                      bytes += desc_info->bytes;
 +              }
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        } while (index != le16_to_cpu(comp->comp_index));
  
 +      if (pkts && bytes)
 +              netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
 +
        return true;
  }
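
The completion path above batches byte-queue-limit accounting: each cleaned skb's length is stashed in desc_info->bytes and a single netdev_tx_completed_queue() call reports the whole batch, pairing with the netdev_tx_sent_queue() done at post time. A toy model of keeping those sent/completed counters balanced (plain integers in place of the real BQL state):

#include <stdio.h>

/* Toy BQL state: what was queued to hardware vs. what completed. */
struct bql_sketch {
	unsigned long sent_bytes;
	unsigned long completed_bytes;
};

static void tx_sent(struct bql_sketch *b, unsigned int bytes)
{
	b->sent_bytes += bytes;		/* analogous to netdev_tx_sent_queue() */
}

static void tx_completed(struct bql_sketch *b, unsigned int pkts, unsigned int bytes)
{
	(void)pkts;
	b->completed_bytes += bytes;	/* analogous to netdev_tx_completed_queue() */
}

int main(void)
{
	struct bql_sketch b = { 0, 0 };
	unsigned int lens[] = { 1514, 60, 9000 };
	unsigned int pkts = 0, bytes = 0;

	for (unsigned int i = 0; i < 3; i++)
		tx_sent(&b, lens[i]);

	/* completion handler: accumulate per-skb bytes, report once per batch */
	for (unsigned int i = 0; i < 3; i++) {
		pkts++;
		bytes += lens[i];
	}
	if (pkts && bytes)
		tx_completed(&b, pkts, bytes);

	printf("in flight: %lu bytes\n", b.sent_bytes - b.completed_bytes);
	return 0;
}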
  
@@@ -736,25 -677,15 +736,25 @@@ void ionic_tx_flush(struct ionic_cq *cq
  void ionic_tx_empty(struct ionic_queue *q)
  {
        struct ionic_desc_info *desc_info;
 +      int bytes = 0;
 +      int pkts = 0;
  
        /* walk the not completed tx entries, if any */
        while (q->head_idx != q->tail_idx) {
                desc_info = &q->info[q->tail_idx];
 +              desc_info->bytes = 0;
                q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
                ionic_tx_clean(q, desc_info, NULL, desc_info->cb_arg);
 +              if (desc_info->cb_arg) {
 +                      pkts++;
 +                      bytes += desc_info->bytes;
 +              }
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        }
 +
 +      if (pkts && bytes)
 +              netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
  }
  
  static int ionic_tx_tcp_inner_pseudo_csum(struct sk_buff *skb)
@@@ -825,33 -756,50 +825,33 @@@ static void ionic_tx_tso_post(struct io
        desc->hdr_len = cpu_to_le16(hdrlen);
        desc->mss = cpu_to_le16(mss);
  
 -      if (done) {
 +      if (start) {
                skb_tx_timestamp(skb);
                netdev_tx_sent_queue(q_to_ndq(q), skb->len);
 -              ionic_txq_post(q, !netdev_xmit_more(), ionic_tx_clean, skb);
 +              ionic_txq_post(q, false, ionic_tx_clean, skb);
        } else {
 -              ionic_txq_post(q, false, ionic_tx_clean, NULL);
 +              ionic_txq_post(q, done, NULL, NULL);
        }
  }
  
 -static struct ionic_txq_desc *ionic_tx_tso_next(struct ionic_queue *q,
 -                                              struct ionic_txq_sg_elem **elem)
 -{
 -      struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 -
 -      *elem = sg_desc->elems;
 -      return desc;
 -}
 -
  static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct ionic_desc_info *rewind_desc_info;
 -      struct device *dev = q->lif->ionic->dev;
 +      struct ionic_desc_info *desc_info;
 +      struct ionic_buf_info *buf_info;
        struct ionic_txq_sg_elem *elem;
        struct ionic_txq_desc *desc;
 -      unsigned int frag_left = 0;
 -      unsigned int offset = 0;
 -      u16 abort = q->head_idx;
 -      unsigned int len_left;
 +      unsigned int chunk_len;
 +      unsigned int frag_rem;
 +      unsigned int tso_rem;
 +      unsigned int seg_rem;
        dma_addr_t desc_addr;
 +      dma_addr_t frag_addr;
        unsigned int hdrlen;
 -      unsigned int nfrags;
 -      unsigned int seglen;
 -      u64 total_bytes = 0;
 -      u64 total_pkts = 0;
 -      u16 rewind = abort;
 -      unsigned int left;
        unsigned int len;
        unsigned int mss;
 -      skb_frag_t *frag;
        bool start, done;
        bool outer_csum;
 -      dma_addr_t addr;
        bool has_vlan;
        u16 desc_len;
        u8 desc_nsge;
        bool encap;
        int err;
  
 +      desc_info = &q->info[q->head_idx];
 +      buf_info = desc_info->bufs;
 +
 +      if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
 +              return -EIO;
 +
 +      len = skb->len;
        mss = skb_shinfo(skb)->gso_size;
 -      nfrags = skb_shinfo(skb)->nr_frags;
 -      len_left = skb->len - skb_headlen(skb);
        outer_csum = (skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM) ||
                     (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
        has_vlan = !!skb_vlan_tag_present(skb);
        else
                hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
  
 -      seglen = hdrlen + mss;
 -      left = skb_headlen(skb);
 +      tso_rem = len;
 +      seg_rem = min(tso_rem, hdrlen + mss);
  
 -      desc = ionic_tx_tso_next(q, &elem);
 -      start = true;
 +      frag_addr = 0;
 +      frag_rem = 0;
  
 -      /* Chop skb->data up into desc segments */
 +      start = true;
  
 -      while (left > 0) {
 -              len = min(seglen, left);
 -              frag_left = seglen - len;
 -              desc_addr = ionic_tx_map_single(q, skb->data + offset, len);
 -              if (dma_mapping_error(dev, desc_addr))
 -                      goto err_out_abort;
 -              desc_len = len;
 +      while (tso_rem > 0) {
 +              desc = NULL;
 +              elem = NULL;
 +              desc_addr = 0;
 +              desc_len = 0;
                desc_nsge = 0;
 -              left -= len;
 -              offset += len;
 -              if (nfrags > 0 && frag_left > 0)
 -                      continue;
 -              done = (nfrags == 0 && left == 0);
 -              ionic_tx_tso_post(q, desc, skb,
 -                                desc_addr, desc_nsge, desc_len,
 -                                hdrlen, mss,
 -                                outer_csum,
 -                                vlan_tci, has_vlan,
 -                                start, done);
 -              total_pkts++;
 -              total_bytes += start ? len : len + hdrlen;
 -              desc = ionic_tx_tso_next(q, &elem);
 -              start = false;
 -              seglen = mss;
 -      }
 -
 -      /* Chop skb frags into desc segments */
 -
 -      for (frag = skb_shinfo(skb)->frags; len_left; frag++) {
 -              offset = 0;
 -              left = skb_frag_size(frag);
 -              len_left -= left;
 -              nfrags--;
 -              stats->frags++;
 -
 -              while (left > 0) {
 -                      if (frag_left > 0) {
 -                              len = min(frag_left, left);
 -                              frag_left -= len;
 -                              addr = ionic_tx_map_frag(q, frag, offset, len);
 -                              if (dma_mapping_error(dev, addr))
 -                                      goto err_out_abort;
 -                              elem->addr = cpu_to_le64(addr);
 -                              elem->len = cpu_to_le16(len);
 +              /* use fragments until we have enough to post a single descriptor */
 +              while (seg_rem > 0) {
 +                      /* if the fragment is exhausted then move to the next one */
 +                      if (frag_rem == 0) {
 +                              /* grab the next fragment */
 +                              frag_addr = buf_info->dma_addr;
 +                              frag_rem = buf_info->len;
 +                              buf_info++;
 +                      }
 +                      chunk_len = min(frag_rem, seg_rem);
 +                      if (!desc) {
 +                              /* fill main descriptor */
 +                              desc = desc_info->txq_desc;
 +                              elem = desc_info->txq_sg_desc->elems;
 +                              desc_addr = frag_addr;
 +                              desc_len = chunk_len;
 +                      } else {
 +                              /* fill sg descriptor */
 +                              elem->addr = cpu_to_le64(frag_addr);
 +                              elem->len = cpu_to_le16(chunk_len);
                                elem++;
                                desc_nsge++;
 -                              left -= len;
 -                              offset += len;
 -                              if (nfrags > 0 && frag_left > 0)
 -                                      continue;
 -                              done = (nfrags == 0 && left == 0);
 -                              ionic_tx_tso_post(q, desc, skb, desc_addr,
 -                                                desc_nsge, desc_len,
 -                                                hdrlen, mss, outer_csum,
 -                                                vlan_tci, has_vlan,
 -                                                start, done);
 -                              total_pkts++;
 -                              total_bytes += start ? len : len + hdrlen;
 -                              desc = ionic_tx_tso_next(q, &elem);
 -                              start = false;
 -                      } else {
 -                              len = min(mss, left);
 -                              frag_left = mss - len;
 -                              desc_addr = ionic_tx_map_frag(q, frag,
 -                                                            offset, len);
 -                              if (dma_mapping_error(dev, desc_addr))
 -                                      goto err_out_abort;
 -                              desc_len = len;
 -                              desc_nsge = 0;
 -                              left -= len;
 -                              offset += len;
 -                              if (nfrags > 0 && frag_left > 0)
 -                                      continue;
 -                              done = (nfrags == 0 && left == 0);
 -                              ionic_tx_tso_post(q, desc, skb, desc_addr,
 -                                                desc_nsge, desc_len,
 -                                                hdrlen, mss, outer_csum,
 -                                                vlan_tci, has_vlan,
 -                                                start, done);
 -                              total_pkts++;
 -                              total_bytes += start ? len : len + hdrlen;
 -                              desc = ionic_tx_tso_next(q, &elem);
 -                              start = false;
                        }
 +                      frag_addr += chunk_len;
 +                      frag_rem -= chunk_len;
 +                      tso_rem -= chunk_len;
 +                      seg_rem -= chunk_len;
                }
 +              seg_rem = min(tso_rem, mss);
 +              done = (tso_rem == 0);
 +              /* post descriptor */
 +              ionic_tx_tso_post(q, desc, skb,
 +                                desc_addr, desc_nsge, desc_len,
 +                                hdrlen, mss, outer_csum, vlan_tci, has_vlan,
 +                                start, done);
 +              start = false;
 +              /* Buffer information is stored with the first tso descriptor */
 +              desc_info = &q->info[q->head_idx];
 +              desc_info->nbufs = 0;
        }
  
 -      stats->pkts += total_pkts;
 -      stats->bytes += total_bytes;
 +      stats->pkts += DIV_ROUND_UP(len - hdrlen, mss);
 +      stats->bytes += len;
        stats->tso++;
 -      stats->tso_bytes += total_bytes;
 +      stats->tso_bytes = len;
  
        return 0;
 -
 -err_out_abort:
 -      while (rewind != q->head_idx) {
 -              rewind_desc_info = &q->info[rewind];
 -              ionic_tx_clean(q, rewind_desc_info, NULL, NULL);
 -              rewind = (rewind + 1) & (q->num_descs - 1);
 -      }
 -      q->head_idx = abort;
 -
 -      return -ENOMEM;
  }
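/*
 * A minimal sketch of the segment/fragment remainder arithmetic used by the
 * rewritten TSO loop above, with plain integers instead of mapped buffers.
 * Assumes the hypothetical frag_len[] entries are non-zero and sum to at
 * least total.
 */
static unsigned int example_count_tso_descs(const unsigned int *frag_len,
					    unsigned int total,
					    unsigned int hdrlen,
					    unsigned int mss)
{
	unsigned int seg_rem = min(total, hdrlen + mss); /* first seg carries the headers */
	unsigned int tso_rem = total;
	unsigned int frag_rem = 0;
	unsigned int descs = 0;
	unsigned int chunk;
	int i = 0;

	while (tso_rem > 0) {
		while (seg_rem > 0) {
			if (frag_rem == 0)		/* fragment exhausted, take the next one */
				frag_rem = frag_len[i++];
			chunk = min(frag_rem, seg_rem);
			frag_rem -= chunk;
			seg_rem -= chunk;
			tso_rem -= chunk;
		}
		seg_rem = min(tso_rem, mss);	/* later segments are at most mss long */
		descs++;			/* one descriptor posted per segment */
	}

	return descs;
}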
  
 -static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
 +                            struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 +      struct ionic_txq_desc *desc = desc_info->txq_desc;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
        bool has_vlan;
        u8 flags = 0;
        bool encap;
        has_vlan = !!skb_vlan_tag_present(skb);
        encap = skb->encapsulation;
  
 -      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 -      if (dma_mapping_error(dev, dma_addr))
 -              return -ENOMEM;
 -
        flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
        flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
  
        cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_PARTIAL,
 -                                flags, skb_shinfo(skb)->nr_frags, dma_addr);
 +                                flags, skb_shinfo(skb)->nr_frags,
 +                                buf_info->dma_addr);
        desc->cmd = cpu_to_le64(cmd);
 -      desc->len = cpu_to_le16(skb_headlen(skb));
 -      desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
 -      desc->csum_offset = cpu_to_le16(skb->csum_offset);
 +      desc->len = cpu_to_le16(buf_info->len);
        if (has_vlan) {
                desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
                stats->vlan_inserted++;
 +      } else {
 +              desc->vlan_tci = 0;
        }
 +      desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
 +      desc->csum_offset = cpu_to_le16(skb->csum_offset);
  
        if (skb_csum_is_sctp(skb))
                stats->crc32_csum++;
        return 0;
  }
  
 -static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
 +                               struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 +      struct ionic_txq_desc *desc = desc_info->txq_desc;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
        bool has_vlan;
        u8 flags = 0;
        bool encap;
        has_vlan = !!skb_vlan_tag_present(skb);
        encap = skb->encapsulation;
  
 -      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 -      if (dma_mapping_error(dev, dma_addr))
 -              return -ENOMEM;
 -
        flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
        flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
  
        cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_NONE,
 -                                flags, skb_shinfo(skb)->nr_frags, dma_addr);
 +                                flags, skb_shinfo(skb)->nr_frags,
 +                                buf_info->dma_addr);
        desc->cmd = cpu_to_le64(cmd);
 -      desc->len = cpu_to_le16(skb_headlen(skb));
 +      desc->len = cpu_to_le16(buf_info->len);
        if (has_vlan) {
                desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
                stats->vlan_inserted++;
 +      } else {
 +              desc->vlan_tci = 0;
        }
 +      desc->csum_start = 0;
 +      desc->csum_offset = 0;
  
        stats->csum_none++;
  
        return 0;
  }
  
 -static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
 +                            struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
 -      unsigned int len_left = skb->len - skb_headlen(skb);
 +      struct ionic_txq_sg_desc *sg_desc = desc_info->txq_sg_desc;
 +      struct ionic_buf_info *buf_info = &desc_info->bufs[1];
        struct ionic_txq_sg_elem *elem = sg_desc->elems;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
 -      skb_frag_t *frag;
 -      u16 len;
 +      unsigned int i;
  
 -      for (frag = skb_shinfo(skb)->frags; len_left; frag++, elem++) {
 -              len = skb_frag_size(frag);
 -              elem->len = cpu_to_le16(len);
 -              dma_addr = ionic_tx_map_frag(q, frag, 0, len);
 -              if (dma_mapping_error(dev, dma_addr))
 -                      return -ENOMEM;
 -              elem->addr = cpu_to_le64(dma_addr);
 -              len_left -= len;
 -              stats->frags++;
 +      for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, buf_info++, elem++) {
 +              elem->addr = cpu_to_le64(buf_info->dma_addr);
 +              elem->len = cpu_to_le16(buf_info->len);
        }
  
 +      stats->frags += skb_shinfo(skb)->nr_frags;
 +
        return 0;
  }
  
  static int ionic_tx(struct ionic_queue *q, struct sk_buff *skb)
  {
 +      struct ionic_desc_info *desc_info = &q->info[q->head_idx];
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
        int err;
  
 +      if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
 +              return -EIO;
 +
        /* set up the initial descriptor */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
 -              err = ionic_tx_calc_csum(q, skb);
 +              err = ionic_tx_calc_csum(q, skb, desc_info);
        else
 -              err = ionic_tx_calc_no_csum(q, skb);
 +              err = ionic_tx_calc_no_csum(q, skb, desc_info);
        if (err)
                return err;
  
        /* add frags */
 -      err = ionic_tx_skb_frags(q, skb);
 +      err = ionic_tx_skb_frags(q, skb, desc_info);
        if (err)
                return err;
  
  
  static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb)
  {
 -      int sg_elems = q->lif->qtype_info[IONIC_QTYPE_TXQ].max_sg_elems;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
+       int ndescs;
        int err;
  
-       /* If TSO, need roundup(skb->len/mss) descs */
+       /* Each desc is mss long max, so a descriptor for each gso_seg */
        if (skb_is_gso(skb))
-               return (skb->len / skb_shinfo(skb)->gso_size) + 1;
+               ndescs = skb_shinfo(skb)->gso_segs;
+       else
+               ndescs = 1;
  
 -      if (skb_shinfo(skb)->nr_frags <= sg_elems)
 +      /* If non-TSO, just need 1 desc and nr_frags sg elems */
 +      if (skb_shinfo(skb)->nr_frags <= q->max_sg_elems)
-               return 1;
+               return ndescs;
  
        /* Too many frags, so linearize */
        err = skb_linearize(skb);
  
        stats->linearize++;
  
-       /* Need 1 desc and zero sg elems */
-       return 1;
+       return ndescs;
  }
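/*
 * Illustrative arithmetic only: gso_segs is precomputed by the stack as
 * roughly DIV_ROUND_UP(payload, mss), so using it directly replaces the
 * "skb->len / mss + 1" estimate removed above.
 */
static unsigned int example_tso_desc_count(unsigned int payload, unsigned int mss)
{
	return DIV_ROUND_UP(payload, mss);	/* what gso_segs reports */
}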
  
  static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs)
@@@ -1586,10 -1586,12 +1586,10 @@@ DECLARE_RTL_COND(rtl_counters_cond
  
  static void rtl8169_do_counters(struct rtl8169_private *tp, u32 counter_cmd)
  {
 -      dma_addr_t paddr = tp->counters_phys_addr;
 -      u32 cmd;
 +      u32 cmd = lower_32_bits(tp->counters_phys_addr);
  
 -      RTL_W32(tp, CounterAddrHigh, (u64)paddr >> 32);
 +      RTL_W32(tp, CounterAddrHigh, upper_32_bits(tp->counters_phys_addr));
        rtl_pci_commit(tp);
 -      cmd = (u64)paddr & DMA_BIT_MASK(32);
        RTL_W32(tp, CounterAddrLow, cmd);
        RTL_W32(tp, CounterAddrLow, cmd | counter_cmd);
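/*
 * A minimal sketch (hypothetical helper): upper_32_bits()/lower_32_bits()
 * replace the open-coded ">> 32" shift and DMA_BIT_MASK(32) masking removed
 * in the hunk above.
 */
static void example_split_dma_addr(dma_addr_t paddr, u32 *hi, u32 *lo)
{
	*hi = upper_32_bits(paddr);	/* bits 63:32, zero for 32-bit addresses */
	*lo = lower_32_bits(paddr);	/* bits 31:0 */
}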
  
@@@ -1901,15 -1903,6 +1901,15 @@@ static int rtl8169_set_eee(struct net_d
        return ret;
  }
  
 +static void rtl8169_get_ringparam(struct net_device *dev,
 +                                struct ethtool_ringparam *data)
 +{
 +      data->rx_max_pending = NUM_RX_DESC;
 +      data->rx_pending = NUM_RX_DESC;
 +      data->tx_max_pending = NUM_TX_DESC;
 +      data->tx_pending = NUM_TX_DESC;
 +}
 +
  static const struct ethtool_ops rtl8169_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
                                     ETHTOOL_COALESCE_MAX_FRAMES,
        .set_eee                = rtl8169_set_eee,
        .get_link_ksettings     = phy_ethtool_get_link_ksettings,
        .set_link_ksettings     = phy_ethtool_set_link_ksettings,
 +      .get_ringparam          = rtl8169_get_ringparam,
  };
  
  static void rtl_enable_eee(struct rtl8169_private *tp)
@@@ -4654,6 -4646,9 +4654,9 @@@ static void rtl8169_down(struct rtl8169
  
        rtl8169_update_counters(tp);
  
+       pci_clear_master(tp->pci_dev);
+       rtl_pci_commit(tp);
        rtl8169_cleanup(tp, true);
  
        rtl_prepare_power_down(tp);
  
  static void rtl8169_up(struct rtl8169_private *tp)
  {
+       pci_set_master(tp->pci_dev);
        phy_resume(tp->phydev);
        rtl8169_init_phy(tp);
        napi_enable(&tp->napi);
@@@ -5315,8 -5311,6 +5319,6 @@@ static int rtl_init_one(struct pci_dev 
  
        rtl_hw_reset(tp);
  
-       pci_set_master(pdev);
        rc = rtl_alloc_irq(tp);
        if (rc < 0) {
                dev_err(&pdev->dev, "Can't allocate interrupt\n");
@@@ -71,12 -71,13 +71,12 @@@ struct ipa_cmd_hw_hdr_init_local 
  
  /* IPA_CMD_REGISTER_WRITE */
  
 -/* For IPA v4.0+, this opcode gets modified with pipeline clear options */
 -
 +/* For IPA v4.0+, the pipeline clear options are encoded in the opcode */
  #define REGISTER_WRITE_OPCODE_SKIP_CLEAR_FMASK                GENMASK(8, 8)
  #define REGISTER_WRITE_OPCODE_CLEAR_OPTION_FMASK      GENMASK(10, 9)
  
  struct ipa_cmd_register_write {
 -      __le16 flags;           /* Unused/reserved for IPA v3.5.1 */
 +      __le16 flags;           /* Unused/reserved prior to IPA v4.0 */
        __le16 offset;
        __le32 value;
        __le32 value_mask;
  };
  
  /* Field masks for ipa_cmd_register_write structure fields */
 -/* The next field is present for IPA v4.0 and above */
 +/* The next field is present for IPA v4.0+ */
  #define REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK                GENMASK(14, 11)
 -/* The next field is present for IPA v3.5.1 only */
 +/* The next field is not present for IPA v4.0+ */
  #define REGISTER_WRITE_FLAGS_SKIP_CLEAR_FMASK         GENMASK(15, 15)
  
 -/* The next field and its values are present for IPA v3.5.1 only */
 +/* The next field and its values are not present for IPA v4.0+ */
  #define REGISTER_WRITE_CLEAR_OPTIONS_FMASK            GENMASK(1, 0)
  
  /* IPA_CMD_IP_PACKET_INIT */
@@@ -122,7 -123,7 +122,7 @@@ struct ipa_cmd_hw_dma_mem_mem 
  
  /* Field masks for ipa_cmd_hw_dma_mem_mem structure fields */
  #define DMA_SHARED_MEM_FLAGS_DIRECTION_FMASK          GENMASK(0, 0)
 -/* The next two fields are present for IPA v3.5.1 only. */
 +/* The next two fields are not present for IPA v4.0+ */
  #define DMA_SHARED_MEM_FLAGS_SKIP_CLEAR_FMASK         GENMASK(1, 1)
  #define DMA_SHARED_MEM_FLAGS_CLEAR_OPTIONS_FMASK      GENMASK(3, 2)
  
@@@ -174,21 -175,23 +174,23 @@@ bool ipa_cmd_table_valid(struct ipa *ip
                            : field_max(IP_FLTRT_FLAGS_NHASH_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             ipa->mem_offset, mem->offset, offset_max);
+               dev_err(dev, "IPv%c %s%s table region offset too large\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
                return false;
        }
  
        if (mem->offset > ipa->mem_size ||
            mem->size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             mem->offset, mem->size, ipa->mem_size);
+               dev_err(dev, "IPv%c %s%s table region out of range\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, mem->size, ipa->mem_size);
                return false;
        }
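/*
 * A minimal sketch (hypothetical names) of the overflow-safe bounds-check
 * pattern used above: "a > max || b > max - a" cannot wrap around, unlike a
 * naive "a + b > max".
 */
static bool example_range_fits(u32 base, u32 offset, u32 limit)
{
	return base <= limit && offset <= limit - base;
}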
  
@@@ -204,22 -207,36 +206,36 @@@ static bool ipa_cmd_header_valid(struc
        u32 size_max;
        u32 size;
  
+       /* In ipa_cmd_hdr_init_local_add() we record the offset and size
+        * of the header table memory area.  Make sure the offset and size
+        * fit in the fields that need to hold them, and that the entire
+        * range is within the overall IPA memory range.
+        */
        offset_max = field_max(HDR_INIT_LOCAL_FLAGS_HDR_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "header table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipa->mem_offset + mem->offset, offset_max);
+               dev_err(dev, "header table region offset too large\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
                return false;
        }
  
        size_max = field_max(HDR_INIT_LOCAL_FLAGS_TABLE_SIZE_FMASK);
        size = ipa->mem[IPA_MEM_MODEM_HEADER].size;
        size += ipa->mem[IPA_MEM_AP_HEADER].size;
-       if (mem->offset > ipa->mem_size || size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "header table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             mem->offset, size, ipa->mem_size);
+       if (size > size_max) {
+               dev_err(dev, "header table region size too large\n");
+               dev_err(dev, "    (0x%04x > 0x%08x)\n", size, size_max);
+               return false;
+       }
+       if (size > ipa->mem_size || mem->offset > ipa->mem_size - size) {
+               dev_err(dev, "header table region out of range\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, size, ipa->mem_size);
                return false;
        }
  
@@@ -236,12 -253,11 +252,12 @@@ static bool ipa_cmd_register_write_offs
        u32 bit_count;
  
        /* The maximum offset in a register_write immediate command depends
 -       * on the version of IPA.  IPA v3.5.1 supports a 16 bit offset, but
 -       * newer versions allow some additional high-order bits.
 +       * on the version of IPA.  A 16 bit offset is always supported,
 +       * but starting with IPA v4.0 some additional high-order bits are
 +       * allowed.
         */
        bit_count = BITS_PER_BYTE * sizeof(payload->offset);
 -      if (ipa->version != IPA_VERSION_3_5_1)
 +      if (ipa->version >= IPA_VERSION_4_0)
                bit_count += hweight32(REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK);
        BUILD_BUG_ON(bit_count > 32);
        offset_max = ~0U >> (32 - bit_count);
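/*
 * Illustrative arithmetic, assuming a 16-bit offset field plus the 4
 * high-order bits of REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK: the computation
 * above yields a 16-bit maximum offset before IPA v4.0 and a 20-bit maximum
 * from IPA v4.0 on.
 */
static u32 example_register_write_offset_max(bool v4_or_later)
{
	u32 bit_count = 16;			/* BITS_PER_BYTE * sizeof(__le16) */

	if (v4_or_later)
		bit_count += 4;			/* hweight32(GENMASK(14, 11)) */

	return ~0U >> (32 - bit_count);		/* 0xffff or 0xfffff */
}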
@@@ -440,11 -456,7 +456,11 @@@ void ipa_cmd_register_write_add(struct 
        /* pipeline_clear_src_grp is not used */
        clear_option = clear_full ? pipeline_clear_full : pipeline_clear_hps;
  
 -      if (ipa->version != IPA_VERSION_3_5_1) {
 +      /* IPA v4.0+ represents the pipeline clear options in the opcode.  It
 +       * also supports a larger offset by encoding additional high-order
 +       * bits in the payload flags field.
 +       */
 +      if (ipa->version >= IPA_VERSION_4_0) {
                u16 offset_high;
                u32 val;
  
@@@ -249,6 -249,7 +249,7 @@@ static const struct qmi_msg_handler ipa
                .decoded_size   = IPA_QMI_DRIVER_INIT_COMPLETE_REQ_SZ,
                .fn             = ipa_server_driver_init_complete,
        },
+       { },
  };
  
  /* Handle an INIT_DRIVER response message from the modem. */
@@@ -269,6 -270,7 +270,7 @@@ static const struct qmi_msg_handler ipa
                .decoded_size   = IPA_QMI_INIT_DRIVER_RSP_SZ,
                .fn             = ipa_client_init_driver,
        },
+       { },
  };
  
  /* Return a pointer to an init modem driver request structure, which contains
@@@ -377,8 -379,8 +379,8 @@@ init_modem_driver_req(struct ipa_qmi *i
  
        /* None of the stats fields are valid (IPA v4.0 and above) */
  
 -      if (ipa->version != IPA_VERSION_3_5_1) {
 -              mem = &ipa->mem[IPA_MEM_STATS_QUOTA];
 +      if (ipa->version >= IPA_VERSION_4_0) {
 +              mem = &ipa->mem[IPA_MEM_STATS_QUOTA_MODEM];
                if (mem->size) {
                        req.hw_stats_quota_base_addr_valid = 1;
                        req.hw_stats_quota_base_addr =
@@@ -271,9 -271,8 +271,9 @@@ static int phylink_parse_mode(struct ph
                pl->cfg_link_an_mode = MLO_AN_FIXED;
        fwnode_handle_put(dn);
  
 -      if (fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
 -          strcmp(managed, "in-band-status") == 0) {
 +      if ((fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
 +           strcmp(managed, "in-band-status") == 0) ||
 +          pl->config->ovr_an_inband) {
                if (pl->cfg_link_an_mode == MLO_AN_FIXED) {
                        phylink_err(pl,
                                    "can't use both fixed-link and in-band-status\n");
@@@ -477,7 -476,7 +477,7 @@@ static void phylink_major_config(struc
                err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode,
                                              state->interface);
                if (err < 0)
-                       phylink_err(pl, "mac_prepare failed: %pe\n",
+                       phylink_err(pl, "mac_finish failed: %pe\n",
                                    ERR_PTR(err));
        }
  }
diff --combined include/linux/bpf.h
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/capability.h>
  #include <linux/sched/mm.h>
  #include <linux/slab.h>
+ #include <linux/percpu-refcount.h>
  
  struct bpf_verifier_env;
  struct bpf_verifier_log;
@@@ -39,7 -40,6 +40,7 @@@ struct bpf_local_storage
  struct bpf_local_storage_map;
  struct kobject;
  struct mem_cgroup;
 +struct bpf_func_state;
  
  extern struct idr btf_idr;
  extern spinlock_t btf_idr_lock;
@@@ -118,9 -118,6 +119,9 @@@ struct bpf_map_ops 
                                           void *owner, u32 size);
        struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
  
 +      /* Misc helpers.*/
 +      int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags);
 +
        /* map_meta_equal must be implemented for maps that can be
         * used as an inner map.  It is a runtime check to ensure
         * an inner map can be inserted to an outer map.
        bool (*map_meta_equal)(const struct bpf_map *meta0,
                               const struct bpf_map *meta1);
  
 +
 +      int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 +                                            struct bpf_func_state *caller,
 +                                            struct bpf_func_state *callee);
 +      int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
 +                                   void *callback_ctx, u64 flags);
 +
        /* BTF name and id of struct allocated by map_alloc */
        const char * const map_btf_name;
        int *map_btf_id;
@@@ -306,8 -296,6 +307,8 @@@ enum bpf_arg_type 
        ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,  /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
        ARG_PTR_TO_PERCPU_BTF_ID,       /* pointer to in-kernel percpu type */
 +      ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
 +      ARG_PTR_TO_STACK_OR_NULL,       /* pointer to stack or NULL */
        __BPF_ARG_TYPE_MAX,
  };
  
@@@ -424,8 -412,6 +425,8 @@@ enum bpf_reg_type 
        PTR_TO_RDWR_BUF,         /* reg points to a read/write buffer */
        PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
        PTR_TO_PERCPU_BTF_ID,    /* reg points to a percpu kernel variable */
 +      PTR_TO_FUNC,             /* reg points to a bpf program function */
 +      PTR_TO_MAP_KEY,          /* reg points to a map element key */
  };
  
  /* The information passed from prog-specific *_is_valid_access
@@@ -521,11 -507,6 +522,11 @@@ enum bpf_cgroup_storage_type 
   */
  #define MAX_BPF_FUNC_ARGS 12
  
 +/* The maximum number of arguments passed through registers
 + * a single function may have.
 + */
 +#define MAX_BPF_FUNC_REG_ARGS 5
 +
  struct btf_func_model {
        u8 ret_size;
        u8 nr_args;
@@@ -576,7 -557,8 +577,8 @@@ struct bpf_tramp_progs 
   *      fentry = a set of program to run before calling original function
   *      fexit = a set of program to run after original function
   */
- int arch_prepare_bpf_trampoline(void *image, void *image_end,
+ struct bpf_tramp_image;
+ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_progs *tprogs,
                                void *orig_call);
@@@ -585,6 -567,8 +587,8 @@@ u64 notrace __bpf_prog_enter(struct bpf
  void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
  u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
  void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+ void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
+ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
  
  struct bpf_ksym {
        unsigned long            start;
@@@ -603,6 -587,18 +607,18 @@@ enum bpf_tramp_prog_type 
        BPF_TRAMP_REPLACE, /* more than MAX */
  };
  
+ struct bpf_tramp_image {
+       void *image;
+       struct bpf_ksym ksym;
+       struct percpu_ref pcref;
+       void *ip_after_call;
+       void *ip_epilogue;
+       union {
+               struct rcu_head rcu;
+               struct work_struct work;
+       };
+ };
  struct bpf_trampoline {
        /* hlist for trampoline_table */
        struct hlist_node hlist;
        /* Number of attached programs. A counter per kind. */
        int progs_cnt[BPF_TRAMP_MAX];
        /* Executable image of trampoline */
-       void *image;
+       struct bpf_tramp_image *cur_image;
        u64 selector;
-       struct bpf_ksym ksym;
  };
  
  struct bpf_attach_target_info {
@@@ -711,6 -706,8 +726,8 @@@ void bpf_image_ksym_add(void *data, str
  void bpf_image_ksym_del(struct bpf_ksym *ksym);
  void bpf_ksym_add(struct bpf_ksym *ksym);
  void bpf_ksym_del(struct bpf_ksym *ksym);
+ int bpf_jit_charge_modmem(u32 pages);
+ void bpf_jit_uncharge_modmem(u32 pages);
  #else
  static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
                                           struct bpf_trampoline *tr)
@@@ -807,7 -804,6 +824,6 @@@ struct bpf_prog_aux 
        bool func_proto_unreliable;
        bool sleepable;
        bool tail_call_reachable;
-       enum bpf_tramp_prog_type trampoline_prog_type;
        struct hlist_node tramp_hlist;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
@@@ -1113,7 -1109,7 +1129,7 @@@ int bpf_prog_array_copy(struct bpf_prog
                _ret;                                                   \
         })
  
- #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)        \
+ #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage)        \
        ({                                              \
                struct bpf_prog_array_item *_item;      \
                struct bpf_prog *_prog;                 \
                        goto _out;                      \
                _item = &_array->items[0];              \
                while ((_prog = READ_ONCE(_item->prog))) {              \
-                       bpf_cgroup_storage_set(_item->cgroup_storage);  \
+                       if (set_cg_storage)             \
+                               bpf_cgroup_storage_set(_item->cgroup_storage);  \
                        _ret &= func(_prog, ctx);       \
                        _item++;                        \
                }                                       \
@@@ -1173,10 -1170,10 +1190,10 @@@ _out:                                                        
        })
  
  #define BPF_PROG_RUN_ARRAY(array, ctx, func)          \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
  
  #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)    \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
  
  #ifdef CONFIG_BPF_SYSCALL
  DECLARE_PER_CPU(int, bpf_prog_active);
@@@ -1400,10 -1397,6 +1417,10 @@@ void bpf_iter_map_show_fdinfo(const str
  int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
                                struct bpf_link_info *info);
  
 +int map_set_for_each_callback_args(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee);
 +
  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
  int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
@@@ -1453,9 -1446,9 +1470,9 @@@ struct btf *bpf_get_btf_vmlinux(void)
  /* Map specifics */
  struct xdp_buff;
  struct sk_buff;
 +struct bpf_dtab_netdev;
 +struct bpf_cpu_map_entry;
  
 -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
  void __dev_flush(void);
  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@@ -1465,6 -1458,7 +1482,6 @@@ int dev_map_generic_redirect(struct bpf
                             struct bpf_prog *xdp_prog);
  bool dev_map_can_have_prog(struct bpf_map *map);
  
 -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
  void __cpu_map_flush(void);
  int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@@ -1493,9 -1487,6 +1510,9 @@@ int bpf_prog_test_run_flow_dissector(st
  int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
                             const union bpf_attr *kattr,
                             union bpf_attr __user *uattr);
 +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 +                              const union bpf_attr *kattr,
 +                              union bpf_attr __user *uattr);
  bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);
@@@ -1525,7 -1516,6 +1542,7 @@@ struct bpf_prog *bpf_prog_by_id(u32 id)
  struct bpf_link *bpf_link_by_id(u32 id);
  
  const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 +void bpf_task_storage_free(struct task_struct *task);
  #else /* !CONFIG_BPF_SYSCALL */
  static inline struct bpf_prog *bpf_prog_get(u32 ufd)
  {
@@@ -1595,6 -1585,17 +1612,6 @@@ static inline int bpf_obj_get_user(cons
        return -EOPNOTSUPP;
  }
  
 -static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 -                                                     u32 key)
 -{
 -      return NULL;
 -}
 -
 -static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
 -                                                           u32 key)
 -{
 -      return NULL;
 -}
  static inline bool dev_map_can_have_prog(struct bpf_map *map)
  {
        return false;
@@@ -1606,7 -1607,6 +1623,7 @@@ static inline void __dev_flush(void
  
  struct xdp_buff;
  struct bpf_dtab_netdev;
 +struct bpf_cpu_map_entry;
  
  static inline
  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
@@@ -1631,6 -1631,12 +1648,6 @@@ static inline int dev_map_generic_redir
        return 0;
  }
  
 -static inline
 -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 -{
 -      return NULL;
 -}
 -
  static inline void __cpu_map_flush(void)
  {
  }
@@@ -1681,13 -1687,6 +1698,13 @@@ static inline int bpf_prog_test_run_flo
        return -ENOTSUPP;
  }
  
 +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 +                                            const union bpf_attr *kattr,
 +                                            union bpf_attr __user *uattr)
 +{
 +      return -ENOTSUPP;
 +}
 +
  static inline void bpf_map_put(struct bpf_map *map)
  {
  }
@@@ -1702,10 -1701,6 +1719,10 @@@ bpf_base_func_proto(enum bpf_func_id fu
  {
        return NULL;
  }
 +
 +static inline void bpf_task_storage_free(struct task_struct *task)
 +{
 +}
  #endif /* CONFIG_BPF_SYSCALL */
  
  void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@@ -1790,24 -1785,22 +1807,24 @@@ static inline void bpf_map_offload_map_
  }
  #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
  
 -#if defined(CONFIG_BPF_STREAM_PARSER)
 -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 -                       struct bpf_prog *old, u32 which);
 +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
  int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
  void sock_map_unhash(struct sock *sk);
  void sock_map_close(struct sock *sk, long timeout);
 +
 +void bpf_sk_reuseport_detach(struct sock *sk);
 +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
 +                                     void *value);
 +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
 +                                     void *value, u64 map_flags);
  #else
 -static inline int sock_map_prog_update(struct bpf_map *map,
 -                                     struct bpf_prog *prog,
 -                                     struct bpf_prog *old, u32 which)
 +static inline void bpf_sk_reuseport_detach(struct sock *sk)
  {
 -      return -EOPNOTSUPP;
  }
  
 +#ifdef CONFIG_BPF_SYSCALL
  static inline int sock_map_get_from_fd(const union bpf_attr *attr,
                                       struct bpf_prog *prog)
  {
@@@ -1825,7 -1818,20 +1842,7 @@@ static inline int sock_map_update_elem_
  {
        return -EOPNOTSUPP;
  }
 -#endif /* CONFIG_BPF_STREAM_PARSER */
  
 -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 -void bpf_sk_reuseport_detach(struct sock *sk);
 -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
 -                                     void *value);
 -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
 -                                     void *value, u64 map_flags);
 -#else
 -static inline void bpf_sk_reuseport_detach(struct sock *sk)
 -{
 -}
 -
 -#ifdef CONFIG_BPF_SYSCALL
  static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
                                                     void *key, void *value)
  {
@@@ -1897,9 -1903,6 +1914,9 @@@ extern const struct bpf_func_proto bpf_
  extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
  extern const struct bpf_func_proto bpf_sock_from_file_proto;
  extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 +extern const struct bpf_func_proto bpf_task_storage_get_proto;
 +extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 +extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
  
  const struct bpf_func_proto *bpf_tracing_func_proto(
        enum bpf_func_id func_id, const struct bpf_prog *prog);
@@@ -360,6 -360,7 +360,7 @@@ enum 
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
        NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
+       NAPI_STATE_SCHED_THREADED,      /* Napi is currently scheduled in threaded mode */
  };
  
  enum {
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
+       NAPIF_STATE_SCHED_THREADED      = BIT(NAPI_STATE_SCHED_THREADED),
  };
  
  enum gro_result {
@@@ -754,13 -756,6 +756,13 @@@ struct rx_queue_attribute 
                         const char *buf, size_t len);
  };
  
 +/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
 +enum xps_map_type {
 +      XPS_CPUS = 0,
 +      XPS_RXQS,
 +      XPS_MAPS_MAX,
 +};
 +
  #ifdef CONFIG_XPS
  /*
   * This structure holds an XPS map which can be of variable length.  The
@@@ -778,19 -773,9 +780,19 @@@ struct xps_map 
  
  /*
   * This structure holds all XPS maps for device.  Maps are indexed by CPU.
 + *
 + * We keep track of the number of CPUs/RXQs in use when the struct is
 + * allocated, in nr_ids. This helps avoid out-of-bounds accesses into the maps.
 + *
 + * We also keep track of the number of traffic classes in use when the struct
 + * is allocated, in num_tc. This is used when navigating the maps, to ensure
 + * we never cross their upper bound, since dev->num_tc can be updated in the
 + * meantime.
   */
  struct xps_dev_maps {
        struct rcu_head rcu;
 +      unsigned int nr_ids;
 +      s16 num_tc;
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
  };
  
@@@ -848,59 -833,6 +850,59 @@@ typedef u16 (*select_queue_fallback_t)(
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);
  
 +enum net_device_path_type {
 +      DEV_PATH_ETHERNET = 0,
 +      DEV_PATH_VLAN,
 +      DEV_PATH_BRIDGE,
 +      DEV_PATH_PPPOE,
 +      DEV_PATH_DSA,
 +};
 +
 +struct net_device_path {
 +      enum net_device_path_type       type;
 +      const struct net_device         *dev;
 +      union {
 +              struct {
 +                      u16             id;
 +                      __be16          proto;
 +                      u8              h_dest[ETH_ALEN];
 +              } encap;
 +              struct {
 +                      enum {
 +                              DEV_PATH_BR_VLAN_KEEP,
 +                              DEV_PATH_BR_VLAN_TAG,
 +                              DEV_PATH_BR_VLAN_UNTAG,
 +                              DEV_PATH_BR_VLAN_UNTAG_HW,
 +                      }               vlan_mode;
 +                      u16             vlan_id;
 +                      __be16          vlan_proto;
 +              } bridge;
 +              struct {
 +                      int port;
 +                      u16 proto;
 +              } dsa;
 +      };
 +};
 +
 +#define NET_DEVICE_PATH_STACK_MAX     5
 +#define NET_DEVICE_PATH_VLAN_MAX      2
 +
 +struct net_device_path_stack {
 +      int                     num_paths;
 +      struct net_device_path  path[NET_DEVICE_PATH_STACK_MAX];
 +};
 +
 +struct net_device_path_ctx {
 +      const struct net_device *dev;
 +      const u8                *daddr;
 +
 +      int                     num_vlans;
 +      struct {
 +              u16             id;
 +              __be16          proto;
 +      } vlan[NET_DEVICE_PATH_VLAN_MAX];
 +};
 +
  enum tc_setup_type {
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
@@@ -1335,8 -1267,6 +1337,8 @@@ struct netdev_net_notifier 
   * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
   *    If a device is paired with a peer device, return the peer instance.
   *    The caller must be under RCU read context.
 + * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 + *     Get the forwarding path to reach the real device from the HW destination address
   */
  struct net_device_ops {
        int                     (*ndo_init)(struct net_device *dev);
        int                     (*ndo_tunnel_ctl)(struct net_device *dev,
                                                  struct ip_tunnel_parm *p, int cmd);
        struct net_device *     (*ndo_get_peer_dev)(struct net_device *dev);
 +      int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
 +                                                         struct net_device_path *path);
  };
  
  /**
   * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
   * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
   * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
 + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
 + *    skb_headlen(skb) == 0 (data starts from frag0)
   */
  enum netdev_priv_flags {
        IFF_802_1Q_VLAN                 = 1<<0,
        IFF_FAILOVER_SLAVE              = 1<<28,
        IFF_L3MDEV_RX_HANDLER           = 1<<29,
        IFF_LIVE_RENAME_OK              = 1<<30,
 +      IFF_TX_SKB_NO_LINEAR            = 1<<31,
  };
  
  #define IFF_802_1Q_VLAN                       IFF_802_1Q_VLAN
  #define IFF_L3MDEV_SLAVE              IFF_L3MDEV_SLAVE
  #define IFF_TEAM                      IFF_TEAM
  #define IFF_RXFH_CONFIGURED           IFF_RXFH_CONFIGURED
 +#define IFF_PHONY_HEADROOM            IFF_PHONY_HEADROOM
  #define IFF_MACSEC                    IFF_MACSEC
  #define IFF_NO_RX_HANDLER             IFF_NO_RX_HANDLER
  #define IFF_FAILOVER                  IFF_FAILOVER
  #define IFF_FAILOVER_SLAVE            IFF_FAILOVER_SLAVE
  #define IFF_L3MDEV_RX_HANDLER         IFF_L3MDEV_RX_HANDLER
  #define IFF_LIVE_RENAME_OK            IFF_LIVE_RENAME_OK
 +#define IFF_TX_SKB_NO_LINEAR          IFF_TX_SKB_NO_LINEAR
  
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
   *    @tx_queue_len:          Max frames per queue allowed
   *    @tx_global_lock:        XXX: need comments on this one
   *    @xdp_bulkq:             XDP device bulk queue
 - *    @xps_cpus_map:          all CPUs map for XPS device
 - *    @xps_rxqs_map:          all RXQs map for XPS device
 + *    @xps_maps:              all CPUs/RXQs maps for XPS device
   *
   *    @xps_maps:      XXX: need comments on this one
   *    @miniq_egress:          clsact qdisc specific data for
   *
   *    @proto_down_reason:     reason a netdev interface is held down
   *    @pcpu_refcnt:           Number of references to this device
 + *    @dev_refcnt:            Number of references to this device
   *    @todo_list:             Delayed register/unregister
   *    @link_watch_list:       XXX: need comments on this one
   *
@@@ -2134,7 -2057,8 +2136,7 @@@ struct net_device 
        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
  
  #ifdef CONFIG_XPS
 -      struct xps_dev_maps __rcu *xps_cpus_map;
 -      struct xps_dev_maps __rcu *xps_rxqs_map;
 +      struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
  #endif
  #ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu *miniq_egress;
        u32                     proto_down_reason;
  
        struct list_head        todo_list;
 +
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        int __percpu            *pcpu_refcnt;
 +#else
 +      refcount_t              dev_refcnt;
 +#endif
  
        struct list_head        link_watch_list;
  
@@@ -2927,8 -2846,6 +2929,8 @@@ void dev_remove_offload(struct packet_o
  
  int dev_get_iflink(const struct net_device *dev);
  int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 +                        struct net_device_path_stack *stack);
  struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
  struct net_device *dev_get_by_name(struct net *net, const char *name);
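/*
 * A minimal sketch of a hypothetical caller: dev_fill_forward_path() fills a
 * net_device_path_stack, which can then be walked from the outermost device
 * down to the lowest (real) transmitting device.  Assumes a zero return
 * means a complete path was resolved.
 */
static const struct net_device *example_resolve_real_dev(const struct net_device *dev,
							 const u8 *daddr)
{
	struct net_device_path_stack stack;
	int i;

	if (dev_fill_forward_path(dev, daddr, &stack))
		return NULL;

	for (i = 0; i < stack.num_paths; i++)
		dev = stack.path[i].dev;	/* last entry is the lowest device */

	return dev;
}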
@@@ -3506,24 -3423,6 +3508,24 @@@ netif_xmit_frozen_or_drv_stopped(const 
        return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
  }
  
 +/**
 + *    netdev_queue_set_dql_min_limit - set dql minimum limit
 + *    @dev_queue: pointer to transmit queue
 + *    @min_limit: dql minimum limit
 + *
 + * Forces xmit_more() to return true until the minimum threshold
 + * defined by @min_limit is reached (or until the tx queue is
 + * empty). Warning: to be used with care; misuse will impact
 + * latency.
 + */
 +static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
 +                                                unsigned int min_limit)
 +{
 +#ifdef CONFIG_BQL
 +      dev_queue->dql.min_limit = min_limit;
 +#endif
 +}
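/*
 * A minimal sketch of a hypothetical driver using the new helper: raising
 * the DQL floor on every TX queue at open time so xmit_more() keeps
 * batching until min_bytes are queued.  The byte threshold is made up.
 */
static void example_enable_tx_batching(struct net_device *dev,
				       unsigned int min_bytes)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++)
		netdev_queue_set_dql_min_limit(netdev_get_tx_queue(dev, i),
					       min_bytes);
}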
 +
  /**
   *    netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
   *    @dev_queue: pointer to transmit queue
@@@ -3789,7 -3688,7 +3791,7 @@@ static inline void netif_wake_subqueue(
  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 -                        u16 index, bool is_rxqs_map);
 +                        u16 index, enum xps_map_type type);
  
  /**
   *    netif_attr_test_mask - Test a CPU or Rx queue set in a mask
@@@ -3884,7 -3783,7 +3886,7 @@@ static inline int netif_set_xps_queue(s
  
  static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
 -                                      u16 index, bool is_rxqs_map)
 +                                      u16 index, enum xps_map_type type)
  {
        return 0;
  }
@@@ -4127,11 -4026,7 +4129,11 @@@ void netdev_run_todo(void)
   */
  static inline void dev_put(struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_dec(*dev->pcpu_refcnt);
 +#else
 +      refcount_dec(&dev->dev_refcnt);
 +#endif
  }
  
  /**
   */
  static inline void dev_hold(struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_inc(*dev->pcpu_refcnt);
 +#else
 +      refcount_inc(&dev->dev_refcnt);
 +#endif
  }
  
  /* Carrier loss detection, dial on demand. The functions netif_carrier_on
@@@ -4281,7 -4172,7 +4283,7 @@@ static inline bool netif_oper_up(const 
   *
   * Check if device has not been removed from system.
   */
 -static inline bool netif_device_present(struct net_device *dev)
 +static inline bool netif_device_present(const struct net_device *dev)
  {
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
  }
@@@ -4720,7 -4611,6 +4722,7 @@@ void dev_get_tstats64(struct net_devic
  
  extern int            netdev_max_backlog;
  extern int            netdev_tstamp_prequeue;
 +extern int            netdev_unregister_timeout_secs;
  extern int            weight_p;
  extern int            dev_weight_rx_bias;
  extern int            dev_weight_tx_bias;
@@@ -5397,9 -5287,6 +5399,9 @@@ do {                                                            
  #define PTYPE_HASH_SIZE       (16)
  #define PTYPE_HASH_MASK       (PTYPE_HASH_SIZE - 1)
  
 +extern struct list_head ptype_all __read_mostly;
 +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 +
  extern struct net_device *blackhole_netdev;
  
  #endif        /* _LINUX_NETDEVICE_H */
diff --combined include/linux/skbuff.h
@@@ -285,6 -285,7 +285,7 @@@ struct nf_bridge_info 
  struct tc_skb_ext {
        __u32 chain;
        __u16 mru;
+       bool post_ct;
  };
  #endif
  
@@@ -656,7 -657,6 +657,7 @@@ typedef unsigned char *sk_buff_data_t
   *    @protocol: Packet protocol from driver
   *    @destructor: Destruct function
   *    @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 + *    @_sk_redir: socket redirection information for skmsg
   *    @_nfct: Associated connection, if any (with nfctinfo bits)
   *    @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
   *    @skb_iif: ifindex of device we arrived on
@@@ -756,9 -756,6 +757,9 @@@ struct sk_buff 
                        void            (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
 +#ifdef CONFIG_NET_SOCK_MSG
 +              unsigned long           _sk_redir;
 +#endif
        };
  
  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@@ -1140,7 -1137,7 +1141,7 @@@ static inline bool skb_fclone_busy(cons
  
        return skb->fclone == SKB_FCLONE_ORIG &&
               refcount_read(&fclones->fclone_ref) > 1 &&
 -             fclones->skb2.sk == sk;
 +             READ_ONCE(fclones->skb2.sk) == sk;
  }
  
  /**
@@@ -1292,10 -1289,10 +1293,10 @@@ __skb_set_sw_hash(struct sk_buff *skb, 
  void __skb_get_hash(struct sk_buff *skb);
  u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
  u32 skb_get_poff(const struct sk_buff *skb);
 -u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 +u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen);
  __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 -                          void *data, int hlen_proto);
 +                          const void *data, int hlen_proto);
  
  static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
                                        int thoff, u8 ip_proto)
@@@ -1314,8 -1311,9 +1315,8 @@@ bool bpf_flow_dissect(struct bpf_prog *
  bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container,
 -                      void *data, __be16 proto, int nhoff, int hlen,
 -                      unsigned int flags);
 +                      void *target_container, const void *data,
 +                      __be16 proto, int nhoff, int hlen, unsigned int flags);
  
  static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
@@@ -1337,9 -1335,9 +1338,9 @@@ static inline bool skb_flow_dissect_flo
  static inline bool
  skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
 -                               struct flow_keys_basic *flow, void *data,
 -                               __be16 proto, int nhoff, int hlen,
 -                               unsigned int flags)
 +                               struct flow_keys_basic *flow,
 +                               const void *data, __be16 proto,
 +                               int nhoff, int hlen, unsigned int flags)
  {
        memset(flow, 0, sizeof(*flow));
        return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
@@@ -3677,13 -3675,14 +3678,13 @@@ __wsum skb_checksum(const struct sk_buf
                    __wsum csum);
  
  static inline void * __must_check
 -__skb_header_pointer(const struct sk_buff *skb, int offset,
 -                   int len, void *data, int hlen, void *buffer)
 +__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
 +                   const void *data, int hlen, void *buffer)
  {
 -      if (hlen - offset >= len)
 -              return data + offset;
 +      if (likely(hlen - offset >= len))
 +              return (void *)data + offset;
  
 -      if (!skb ||
 -          skb_copy_bits(skb, offset, buffer, len) < 0)
 +      if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
                return NULL;
  
        return buffer;
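
The constified __skb_header_pointer() above keeps the usual contract: return a pointer into the linear data when the requested bytes are already there, otherwise copy them into the caller's buffer, or return NULL when the packet is too short. A minimal, hedged sketch of the common calling pattern through the skb_header_pointer() wrapper (the example_ function is invented for illustration):

/* Illustrative only; assumes <linux/skbuff.h> and <linux/tcp.h>. */
static bool example_is_tcp_syn(const struct sk_buff *skb, int thoff)
{
	struct tcphdr _tcph;
	const struct tcphdr *th;

	/* Pointer into skb data if the header is linear, otherwise a copy
	 * placed in _tcph; NULL when the packet is shorter than requested.
	 */
	th = skb_header_pointer(skb, thoff, sizeof(_tcph), &_tcph);
	if (!th)
		return false;

	return th->syn && !th->ack;
}
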
@@@ -1498,16 -1498,13 +1498,16 @@@ struct nft_trans_chain 
  
  struct nft_trans_table {
        bool                            update;
 -      bool                            enable;
 +      u8                              state;
 +      u32                             flags;
  };
  
  #define nft_trans_table_update(trans) \
        (((struct nft_trans_table *)trans->data)->update)
 -#define nft_trans_table_enable(trans) \
 -      (((struct nft_trans_table *)trans->data)->enable)
 +#define nft_trans_table_state(trans)  \
 +      (((struct nft_trans_table *)trans->data)->state)
 +#define nft_trans_table_flags(trans)  \
 +      (((struct nft_trans_table *)trans->data)->flags)
  
  struct nft_trans_elem {
        struct nft_set                  *set;
@@@ -1539,6 -1536,7 +1539,7 @@@ struct nft_trans_flowtable 
        struct nft_flowtable            *flowtable;
        bool                            update;
        struct list_head                hook_list;
+       u32                             flags;
  };
  
  #define nft_trans_flowtable(trans)    \
        (((struct nft_trans_flowtable *)trans->data)->flowtable)
  #define nft_trans_flowtable_hooks(trans)      \
        (((struct nft_trans_flowtable *)trans->data)->hook_list)
+ #define nft_trans_flowtable_flags(trans)      \
+       (((struct nft_trans_flowtable *)trans->data)->flags)
  
  int __init nft_chain_filter_init(void);
  void nft_chain_filter_fini(void);
diff --combined include/net/nexthop.h
@@@ -40,12 -40,6 +40,12 @@@ struct nh_config 
  
        struct nlattr   *nh_grp;
        u16             nh_grp_type;
 +      u16             nh_grp_res_num_buckets;
 +      unsigned long   nh_grp_res_idle_timer;
 +      unsigned long   nh_grp_res_unbalanced_timer;
 +      bool            nh_grp_res_has_num_buckets;
 +      bool            nh_grp_res_has_idle_timer;
 +      bool            nh_grp_res_has_unbalanced_timer;
  
        struct nlattr   *nh_encap;
        u16             nh_encap_type;
@@@ -69,32 -63,6 +69,32 @@@ struct nh_info 
        };
  };
  
 +struct nh_res_bucket {
 +      struct nh_grp_entry __rcu *nh_entry;
 +      atomic_long_t           used_time;
 +      unsigned long           migrated_time;
 +      bool                    occupied;
 +      u8                      nh_flags;
 +};
 +
 +struct nh_res_table {
 +      struct net              *net;
 +      u32                     nhg_id;
 +      struct delayed_work     upkeep_dw;
 +
 +      /* List of NHGEs that have too few buckets ("uw" for underweight).
 +       * Reclaimed buckets will be given to entries in this list.
 +       */
 +      struct list_head        uw_nh_entries;
 +      unsigned long           unbalanced_since;
 +
 +      u32                     idle_timer;
 +      u32                     unbalanced_timer;
 +
 +      u16                     num_nh_buckets;
 +      struct nh_res_bucket    nh_buckets[];
 +};
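
The table above gives each resilient group a fixed set of buckets, so a flow hash keeps landing on the same nexthop even while group members are added or removed. A rough sketch of the data-structure relationship under RCU (the example_ function is invented; the real selection and upkeep logic lives in net/ipv4/nexthop.c and also records bucket activity):

/* Illustrative only: map a flow hash to a nexthop via the bucket table. */
static struct nexthop *example_res_select(struct nh_res_table *res_table,
					  u32 flow_hash)
{
	u16 idx = flow_hash % res_table->num_nh_buckets;
	struct nh_res_bucket *bucket = &res_table->nh_buckets[idx];
	struct nh_grp_entry *nhge;

	/* Caller is assumed to hold the RCU read lock; upkeep_dw migrates
	 * buckets between entries so each entry's share tracks its weight.
	 */
	nhge = rcu_dereference(bucket->nh_entry);
	return nhge ? nhge->nh : NULL;
}
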
 +
  struct nh_grp_entry {
        struct nexthop  *nh;
        u8              weight;
                struct {
                        atomic_t        upper_bound;
                } mpath;
 +              struct {
 +                      /* Member on uw_nh_entries. */
 +                      struct list_head        uw_nh_entry;
 +
 +                      u16                     count_buckets;
 +                      u16                     wants_buckets;
 +              } res;
        };
  
        struct list_head nh_list;
  struct nh_group {
        struct nh_group         *spare; /* spare group for removals */
        u16                     num_nh;
 +      bool                    is_multipath;
        bool                    mpath;
 +      bool                    resilient;
        bool                    fdb_nh;
        bool                    has_v4;
 +
 +      struct nh_res_table __rcu *res_table;
        struct nh_grp_entry     nh_entries[];
  };
  
@@@ -155,15 -112,11 +155,15 @@@ struct nexthop 
  enum nexthop_event_type {
        NEXTHOP_EVENT_DEL,
        NEXTHOP_EVENT_REPLACE,
 +      NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
 +      NEXTHOP_EVENT_BUCKET_REPLACE,
  };
  
  enum nh_notifier_info_type {
        NH_NOTIFIER_INFO_TYPE_SINGLE,
        NH_NOTIFIER_INFO_TYPE_GRP,
 +      NH_NOTIFIER_INFO_TYPE_RES_TABLE,
 +      NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
  };
  
  struct nh_notifier_single_info {
@@@ -190,19 -143,6 +190,19 @@@ struct nh_notifier_grp_info 
        struct nh_notifier_grp_entry_info nh_entries[];
  };
  
 +struct nh_notifier_res_bucket_info {
 +      u16 bucket_index;
 +      unsigned int idle_timer_ms;
 +      bool force;
 +      struct nh_notifier_single_info old_nh;
 +      struct nh_notifier_single_info new_nh;
 +};
 +
 +struct nh_notifier_res_table_info {
 +      u16 num_nh_buckets;
 +      struct nh_notifier_single_info nhs[];
 +};
 +
  struct nh_notifier_info {
        struct net *net;
        struct netlink_ext_ack *extack;
        union {
                struct nh_notifier_single_info *nh;
                struct nh_notifier_grp_info *nh_grp;
 +              struct nh_notifier_res_table_info *nh_res_table;
 +              struct nh_notifier_res_bucket_info *nh_res_bucket;
        };
  };
  
@@@ -220,10 -158,6 +220,10 @@@ int register_nexthop_notifier(struct ne
                              struct netlink_ext_ack *extack);
  int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
  void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
 +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
 +                               bool offload, bool trap);
 +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
 +                                   unsigned long *activity);
  
  /* caller is holding rcu or rtnl; no reference taken to nexthop */
  struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
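
The new RES_TABLE/RES_BUCKET notifier payloads and nexthop_bucket_set_hw_flags() let a driver mirror individual bucket replacements into hardware. A hedged sketch of a driver-side handler for NEXTHOP_EVENT_BUCKET_REPLACE (example_hw_write_bucket() is invented; info->id is the group id carried in nh_notifier_info, and a real driver may honour or veto the replacement based on hardware activity and the force flag):

static int example_nexthop_event(struct notifier_block *nb,
				 unsigned long event, void *ptr)
{
	struct nh_notifier_info *info = ptr;
	struct nh_notifier_res_bucket_info *binfo;

	if (event != NEXTHOP_EVENT_BUCKET_REPLACE)
		return NOTIFY_DONE;

	binfo = info->nh_res_bucket;

	/* Hypothetical driver hook: program new_nh into the HW bucket.
	 * binfo->force says the core insists even if the bucket is busy.
	 */
	example_hw_write_bucket(binfo->bucket_index, &binfo->new_nh);

	/* Report the bucket as offloaded back to the nexthop core. */
	nexthop_bucket_set_hw_flags(info->net, info->id, binfo->bucket_index,
				    true, false);
	return NOTIFY_OK;
}
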
@@@ -278,7 -212,7 +278,7 @@@ static inline bool nexthop_is_multipath
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              return nh_grp->mpath;
 +              return nh_grp->is_multipath;
        }
        return false;
  }
@@@ -293,7 -227,7 +293,7 @@@ static inline unsigned int nexthop_num_
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              if (nh_grp->mpath)
 +              if (nh_grp->is_multipath)
                        rc = nh_grp->num_nh;
        }
  
@@@ -374,7 -308,7 +374,7 @@@ struct fib_nh_common *nexthop_fib_nhc(s
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              if (nh_grp->mpath) {
 +              if (nh_grp->is_multipath) {
                        nh = nexthop_mpath_select(nh_grp, nhsel);
                        if (!nh)
                                return NULL;
@@@ -476,6 -410,7 +476,7 @@@ static inline struct fib_nh *fib_info_n
  int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);
  
+ /* Caller should either hold rcu_read_lock(), or RTNL. */
  static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
  {
        struct nh_info *nhi;
        return NULL;
  }
  
+ /* Variant of nexthop_fib6_nh().
+  * Caller should either hold rcu_read_lock_bh(), or RTNL.
+  */
+ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
+ {
+       struct nh_info *nhi;
+
+       if (nh->is_group) {
+               struct nh_group *nh_grp;
+
+               nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+               nh = nexthop_mpath_select(nh_grp, 0);
+               if (!nh)
+                       return NULL;
+       }
+
+       nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+       if (nhi->family == AF_INET6)
+               return &nhi->fib6_nh;
+
+       return NULL;
+ }
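
A small illustrative caller, assuming softirq context where only rcu_read_lock_bh() is held (the example_ function is made up):

static struct net_device *example_nh6_dev_bh(struct nexthop *nh)
{
	struct net_device *dev = NULL;
	struct fib6_nh *fib6_nh;

	rcu_read_lock_bh();
	fib6_nh = nexthop_fib6_nh_bh(nh);
	if (fib6_nh)
		dev = fib6_nh->fib_nh_dev;
	rcu_read_unlock_bh();

	return dev;
}
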
  static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
  {
        struct fib6_nh *fib6_nh;
diff --combined include/uapi/linux/bpf.h
@@@ -93,717 -93,7 +93,717 @@@ union bpf_iter_link_info 
        } map;
  };
  
 -/* BPF syscall commands, see bpf(2) man-page for details. */
 +/* BPF syscall commands, see bpf(2) man-page for more details. */
 +/**
 + * DOC: eBPF Syscall Preamble
 + *
 + * The operation to be performed by the **bpf**\ () system call is determined
 + * by the *cmd* argument. Each operation takes an accompanying argument,
 + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
 + * below). The size argument is the size of the union pointed to by *attr*.
 + */
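
glibc provides no bpf() wrapper, so the cmd/attr/size convention described above is normally driven through syscall(2). A minimal, hedged userspace sketch (the sys_bpf() name is just a local helper, mirroring what tools/lib/bpf does internally):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* One command, one attr union, and the size of that union. */
static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}
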
 +/**
 + * DOC: eBPF Syscall Commands
 + *
 + * BPF_MAP_CREATE
 + *    Description
 + *            Create a map and return a file descriptor that refers to the
 + *            map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
 + *            is automatically enabled for the new file descriptor.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_MAP_CREATE** will delete the map (but see NOTES).
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_MAP_LOOKUP_ELEM
 + *    Description
 + *            Look up an element with a given *key* in the map referred to
 + *            by the file descriptor *map_fd*.
 + *
 + *            The *flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_UPDATE_ELEM
 + *    Description
 + *            Create or update an element (key/value pair) in a specified map.
 + *
 + *            The *flags* argument should be specified as one of the
 + *            following:
 + *
 + *            **BPF_ANY**
 + *                    Create a new element or update an existing element.
 + *            **BPF_NOEXIST**
 + *                    Create a new element only if it did not exist.
 + *            **BPF_EXIST**
 + *                    Update an existing element.
 + *            **BPF_F_LOCK**
 + *                    Update a spin_lock-ed map element.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
 + *            **E2BIG**, **EEXIST**, or **ENOENT**.
 + *
 + *            **E2BIG**
 + *                    The number of elements in the map reached the
 + *                    *max_entries* limit specified at map creation time.
 + *            **EEXIST**
 + *                    If *flags* specifies **BPF_NOEXIST** and the element
 + *                    with *key* already exists in the map.
 + *            **ENOENT**
 + *                    If *flags* specifies **BPF_EXIST** and the element with
 + *                    *key* does not exist in the map.
 + *
 + * BPF_MAP_DELETE_ELEM
 + *    Description
 + *            Look up and delete an element by key in a specified map.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_GET_NEXT_KEY
 + *    Description
 + *            Look up an element by key in a specified map and return the key
 + *            of the next element. Can be used to iterate over all elements
 + *            in the map.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            The following cases can be used to iterate over all elements of
 + *            the map:
 + *
 + *            * If *key* is not found, the operation returns zero and sets
 + *              the *next_key* pointer to the key of the first element.
 + *            * If *key* is found, the operation returns zero and sets the
 + *              *next_key* pointer to the key of the next element.
 + *            * If *key* is the last element, returns -1 and *errno* is set
 + *              to **ENOENT**.
 + *
 + *            May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
 + *            **EINVAL** on error.
 + *
 + * BPF_PROG_LOAD
 + *    Description
 + *            Verify and load an eBPF program, returning a new file
 + *            descriptor associated with the program.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
 + *
 + *            The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
 + *            automatically enabled for the new file descriptor.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_OBJ_PIN
 + *    Description
 + *            Pin an eBPF program or map referred by the specified *bpf_fd*
 + *            to the provided *pathname* on the filesystem.
 + *
 + *            The *pathname* argument must not contain a dot (".").
 + *
 + *            On success, *pathname* retains a reference to the eBPF object,
 + *            preventing deallocation of the object when the original
 + *            *bpf_fd* is closed. This allows the eBPF object to live beyond
 + *            **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
 + *            process.
 + *
 + *            Applying **unlink**\ (2) or similar calls to the *pathname*
 + *            unpins the object from the filesystem, removing the reference.
 + *            If no other file descriptors or filesystem nodes refer to the
 + *            same object, it will be deallocated (see NOTES).
 + *
 + *            The filesystem type for the parent directory of *pathname* must
 + *            be **BPF_FS_MAGIC**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_OBJ_GET
 + *    Description
 + *            Open a file descriptor for the eBPF object pinned to the
 + *            specified *pathname*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_PROG_ATTACH
 + *    Description
 + *            Attach an eBPF program to a *target_fd* at the specified
 + *            *attach_type* hook.
 + *
 + *            The *attach_type* specifies the eBPF attachment point to
 + *            attach the program to, and must be one of *bpf_attach_type*
 + *            (see below).
 + *
 + *            The *attach_bpf_fd* must be a valid file descriptor for a
 + *            loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
 + *            or sock_ops type corresponding to the specified *attach_type*.
 + *
 + *            The *target_fd* must be a valid file descriptor for a kernel
 + *            object which depends on the attach type of *attach_bpf_fd*:
 + *
 + *            **BPF_PROG_TYPE_CGROUP_DEVICE**,
 + *            **BPF_PROG_TYPE_CGROUP_SKB**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
 + *            **BPF_PROG_TYPE_CGROUP_SYSCTL**,
 + *            **BPF_PROG_TYPE_SOCK_OPS**
 + *
 + *                    Control Group v2 hierarchy with the eBPF controller
 + *                    enabled. Requires the kernel to be compiled with
 + *                    **CONFIG_CGROUP_BPF**.
 + *
 + *            **BPF_PROG_TYPE_FLOW_DISSECTOR**
 + *
 + *                    Network namespace (eg /proc/self/ns/net).
 + *
 + *            **BPF_PROG_TYPE_LIRC_MODE2**
 + *
 + *                    LIRC device path (eg /dev/lircN). Requires the kernel
 + *                    to be compiled with **CONFIG_BPF_LIRC_MODE2**.
 + *
 + *            **BPF_PROG_TYPE_SK_SKB**,
 + *            **BPF_PROG_TYPE_SK_MSG**
 + *
 + *                    eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_DETACH
 + *    Description
 + *            Detach the eBPF program associated with the *target_fd* at the
 + *            hook specified by *attach_type*. The program must have been
 + *            previously attached using **BPF_PROG_ATTACH**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_TEST_RUN
 + *    Description
 + *            Run the eBPF program associated with the *prog_fd* a *repeat*
 + *            number of times against a provided program context *ctx_in* and
 + *            data *data_in*, and return the modified program context
 + *            *ctx_out*, *data_out* (for example, packet data), result of the
 + *            execution *retval*, and *duration* of the test run.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            **ENOSPC**
 + *                    Either *data_size_out* or *ctx_size_out* is too small.
 + *            **ENOTSUPP**
 + *                    This command is not supported by the program type of
 + *                    the program referred to by *prog_fd*.
 + *
 + * BPF_PROG_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF program currently loaded into the kernel.
 + *
 + *            Looks for the eBPF program with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF programs
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_MAP_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF map currently loaded into the kernel.
 + *
 + *            Looks for the eBPF map with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF maps
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_PROG_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF program corresponding to
 + *            *prog_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_MAP_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF map corresponding to
 + *            *map_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_OBJ_GET_INFO_BY_FD
 + *    Description
 + *            Obtain information about the eBPF object corresponding to
 + *            *bpf_fd*.
 + *
 + *            Populates up to *info_len* bytes of *info*, which will be in
 + *            one of the following formats depending on the eBPF object type
 + *            of *bpf_fd*:
 + *
 + *            * **struct bpf_prog_info**
 + *            * **struct bpf_map_info**
 + *            * **struct bpf_btf_info**
 + *            * **struct bpf_link_info**
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_QUERY
 + *    Description
 + *            Obtain information about eBPF programs associated with the
 + *            specified *attach_type* hook.
 + *
 + *            The *target_fd* must be a valid file descriptor for a kernel
 + *            object which depends on the attach type of *attach_bpf_fd*:
 + *
 + *            **BPF_PROG_TYPE_CGROUP_DEVICE**,
 + *            **BPF_PROG_TYPE_CGROUP_SKB**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
 + *            **BPF_PROG_TYPE_CGROUP_SYSCTL**,
 + *            **BPF_PROG_TYPE_SOCK_OPS**
 + *
 + *                    Control Group v2 hierarchy with the eBPF controller
 + *                    enabled. Requires the kernel to be compiled with
 + *                    **CONFIG_CGROUP_BPF**.
 + *
 + *            **BPF_PROG_TYPE_FLOW_DISSECTOR**
 + *
 + *                    Network namespace (eg /proc/self/ns/net).
 + *
 + *            **BPF_PROG_TYPE_LIRC_MODE2**
 + *
 + *                    LIRC device path (eg /dev/lircN). Requires the kernel
 + *                    to be compiled with **CONFIG_BPF_LIRC_MODE2**.
 + *
 + *            **BPF_PROG_QUERY** always fetches the number of programs
 + *            attached and the *attach_flags* which were used to attach those
 + *            programs. Additionally, if *prog_ids* is nonzero and the number
 + *            of attached programs is less than *prog_cnt*, populates
 + *            *prog_ids* with the eBPF program ids of the programs attached
 + *            at *target_fd*.
 + *
 + *            The following flags may alter the result:
 + *
 + *            **BPF_F_QUERY_EFFECTIVE**
 + *                    Only return information regarding programs which are
 + *                    currently effective at the specified *target_fd*.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_RAW_TRACEPOINT_OPEN
 + *    Description
 + *            Attach an eBPF program to a tracepoint *name* to access kernel
 + *            internal arguments of the tracepoint in their raw form.
 + *
 + *            The *prog_fd* must be a valid file descriptor associated with
 + *            a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
 + *
 + *            No ABI guarantees are made about the content of tracepoint
 + *            arguments exposed to the corresponding eBPF program.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_RAW_TRACEPOINT_OPEN** will detach the eBPF program from the
 + *            tracepoint (but see NOTES).
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_BTF_LOAD
 + *    Description
 + *            Verify and load BPF Type Format (BTF) metadata into the kernel,
 + *            returning a new file descriptor associated with the metadata.
 + *            BTF is described in more detail at
 + *            https://www.kernel.org/doc/html/latest/bpf/btf.html.
 + *
 + *            The *btf* parameter must point to valid memory providing
 + *            *btf_size* bytes of BTF binary metadata.
 + *
 + *            The returned file descriptor can be passed to other **bpf**\ ()
 + *            subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
 + *            associate the BTF with those objects.
 + *
 + *            Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
 + *            parameters to specify a *btf_log_buf*, *btf_log_size* and
 + *            *btf_log_level* which allow the kernel to return freeform log
 + *            output regarding the BTF verification process.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_BTF_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the BPF Type Format (BTF)
 + *            corresponding to *btf_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_TASK_FD_QUERY
 + *    Description
 + *            Obtain information about eBPF programs associated with the
 + *            target process identified by *pid* and *fd*.
 + *
 + *            If the *pid* and *fd* are associated with a tracepoint, kprobe
 + *            or uprobe perf event, then the *prog_id* and *fd_type* will
 + *            be populated with the eBPF program id and file descriptor type
 + *            of type **bpf_task_fd_type**. If associated with a kprobe or
 + *            uprobe, the  *probe_offset* and *probe_addr* will also be
 + *            populated. Optionally, if *buf* is provided, then up to
 + *            *buf_len* bytes of *buf* will be populated with the name of
 + *            the tracepoint, kprobe or uprobe.
 + *
 + *            The resulting *prog_id* may be introspected in deeper detail
 + *            using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_LOOKUP_AND_DELETE_ELEM
 + *    Description
 + *            Look up an element with the given *key* in the map referred to
 + *            by the file descriptor *fd*, and if found, delete the element.
 + *
 + *            The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
 + *            implement this command as a "pop" operation, deleting the top
 + *            element rather than one corresponding to *key*.
 + *            The *key* and *key_len* parameters should be zeroed when
 + *            issuing this operation for these map types.
 + *
 + *            This command is only valid for the following map types:
 + *            * **BPF_MAP_TYPE_QUEUE**
 + *            * **BPF_MAP_TYPE_STACK**
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_FREEZE
 + *    Description
 + *            Freeze the permissions of the specified map.
 + *
 + *            Write permissions may be frozen by passing zero *flags*.
 + *            Upon success, no future syscall invocations may alter the
 + *            map state of *map_fd*. Write operations from eBPF programs
 + *            are still possible for a frozen map.
 + *
 + *            Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_BTF_GET_NEXT_ID
 + *    Description
 + *            Fetch the next BPF Type Format (BTF) object currently loaded
 + *            into the kernel.
 + *
 + *            Looks for the BTF object with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other BTF objects
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_MAP_LOOKUP_BATCH
 + *    Description
 + *            Iterate and fetch multiple elements in a map.
 + *
 + *            Two opaque values are used to manage batch operations,
 + *            *in_batch* and *out_batch*. Initially, *in_batch* must be set
 + *            to NULL to begin the batched operation. After each subsequent
 + *            **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
 + *            *out_batch* as the *in_batch* for the next operation to
 + *            continue iteration from the current point.
 + *
 + *            The *keys* and *values* are output parameters which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            and value size of the map *map_fd*. The *keys* buffer must be
 + *            of *key_size* * *count*. The *values* buffer must be of
 + *            *value_size* * *count*.
 + *
 + *            The *elem_flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *            On success, *count* elements from the map are copied into the
 + *            user buffer, with the keys copied into *keys* and the values
 + *            copied into the corresponding indices in *values*.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **ENOSPC** to indicate that *keys* or
 + *            *values* is too small to dump an entire bucket during
 + *            iteration of a hash-based map type.
 + *
 + * BPF_MAP_LOOKUP_AND_DELETE_BATCH
 + *    Description
 + *            Iterate and delete all elements in a map.
 + *
 + *            This operation has the same behavior as
 + *            **BPF_MAP_LOOKUP_BATCH** with two exceptions:
 + *
 + *            * Every element that is successfully returned is also deleted
 + *              from the map. This is at least *count* elements. Note that
 + *              *count* is both an input and an output parameter.
 + *            * Upon returning with *errno* set to **EFAULT**, up to
 + *              *count* elements may be deleted without returning the keys
 + *              and values of the deleted elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_UPDATE_BATCH
 + *    Description
 + *            Update multiple elements in a map by *key*.
 + *
 + *            The *keys* and *values* are input parameters which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            and value size of the map *map_fd*. The *keys* buffer must be
 + *            of *key_size* * *count*. The *values* buffer must be of
 + *            *value_size* * *count*.
 + *
 + *            Each element specified in *keys* is sequentially updated to the
 + *            value in the corresponding index in *values*. The *in_batch*
 + *            and *out_batch* parameters are ignored and should be zeroed.
 + *
 + *            The *elem_flags* argument should be specified as one of the
 + *            following:
 + *
 + *            **BPF_ANY**
 + *                    Create new elements or update existing elements.
 + *            **BPF_NOEXIST**
 + *                    Create new elements only if they do not exist.
 + *            **BPF_EXIST**
 + *                    Update existing elements.
 + *            **BPF_F_LOCK**
 + *                    Update spin_lock-ed map elements. This must be
 + *                    specified if the map value contains a spinlock.
 + *
 + *            On success, *count* elements from the map are updated.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
 + *            **E2BIG**. **E2BIG** indicates that the number of elements in
 + *            the map reached the *max_entries* limit specified at map
 + *            creation time.
 + *
 + *            May set *errno* to one of the following error codes under
 + *            specific circumstances:
 + *
 + *            **EEXIST**
 + *                    If *flags* specifies **BPF_NOEXIST** and the element
 + *                    with *key* already exists in the map.
 + *            **ENOENT**
 + *                    If *flags* specifies **BPF_EXIST** and the element with
 + *                    *key* does not exist in the map.
 + *
 + * BPF_MAP_DELETE_BATCH
 + *    Description
 + *            Delete multiple elements in a map by *key*.
 + *
 + *            The *keys* parameter is an input parameter which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            size of the map *map_fd*, that is, *key_size* * *count*.
 + *
 + *            Each element specified in *keys* is sequentially deleted. The
 + *            *in_batch*, *out_batch*, and *values* parameters are ignored
 + *            and should be zeroed.
 + *
 + *            The *elem_flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *            On success, *count* elements from the map are deleted.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements. If
 + *            *errno* is **EFAULT**, up to *count* elements may have been
 + *            deleted.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_LINK_CREATE
 + *    Description
 + *            Attach an eBPF program to a *target_fd* at the specified
 + *            *attach_type* hook and return a file descriptor handle for
 + *            managing the link.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_UPDATE
 + *    Description
 + *            Update the eBPF program in the specified *link_fd* to
 + *            *new_prog_fd*.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_LINK_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF Link corresponding to
 + *            *link_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF link currently loaded into the kernel.
 + *
 + *            Looks for the eBPF link with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF links
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_ENABLE_STATS
 + *    Description
 + *            Enable eBPF runtime statistics gathering.
 + *
 + *            Runtime statistics gathering for the eBPF runtime is disabled
 + *            by default to minimize the corresponding performance overhead.
 + *            This command enables statistics globally.
 + *
 + *            Multiple programs may independently enable statistics.
 + *            After gathering the desired statistics, eBPF runtime statistics
 + *            may be disabled again by calling **close**\ (2) for the file
 + *            descriptor returned by this function. Statistics will only be
 + *            disabled system-wide when all outstanding file descriptors
 + *            returned by prior calls for this subcommand are closed.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_ITER_CREATE
 + *    Description
 + *            Create an iterator on top of the specified *link_fd* (as
 + *            previously created using **BPF_LINK_CREATE**) and return a
 + *            file descriptor that can be used to trigger the iteration.
 + *
 + *            If the resulting file descriptor is pinned to the filesystem
 + *            using  **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
 + *            for that path will trigger the iterator to read kernel state
 + *            using the eBPF program attached to *link_fd*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_DETACH
 + *    Description
 + *            Forcefully detach the specified *link_fd* from its
 + *            corresponding attachment point.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_BIND_MAP
 + *    Description
 + *            Bind a map to the lifetime of an eBPF program.
 + *
 + *            The map identified by *map_fd* is bound to the program
 + *            identified by *prog_fd* and only released when *prog_fd* is
 + *            released. This may be used in cases where metadata should be
 + *            associated with a program which otherwise does not contain any
 + *            references to the map (for example, embedded in the eBPF
 + *            program instructions).
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * NOTES
 + *    eBPF objects (maps and programs) can be shared between processes.
 + *
 + *    * After **fork**\ (2), the child inherits file descriptors
 + *      referring to the same eBPF objects.
 + *    * File descriptors referring to eBPF objects can be transferred over
 + *      **unix**\ (7) domain sockets.
 + *    * File descriptors referring to eBPF objects can be duplicated in the
 + *      usual way, using **dup**\ (2) and similar calls.
 + *    * File descriptors referring to eBPF objects can be pinned to the
 + *      filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
 + *
 + *    An eBPF object is deallocated only after all file descriptors referring
 + *    to the object have been closed and no references remain pinned to the
 + *    filesystem or attached (for example, bound to a program or device).
 + */
  enum bpf_cmd {
        BPF_MAP_CREATE,
        BPF_MAP_LOOKUP_ELEM,
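
Building on the sys_bpf() sketch earlier, a hedged example of the first few commands in action, creating a small hash map and round-tripping one element (error handling trimmed to the minimum; all names are illustrative):

static int example_map_roundtrip(void)
{
	__u32 key = 1, val = 42, out = 0;
	union bpf_attr attr;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&out;
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); /* out == 42 */
}
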
@@@ -1103,15 -393,6 +1103,15 @@@ enum bpf_link_type 
   *                   is struct/union.
   */
  #define BPF_PSEUDO_BTF_ID     3
 +/* insn[0].src_reg:  BPF_PSEUDO_FUNC
 + * insn[0].imm:      insn offset to the func
 + * insn[1].imm:      0
 + * insn[0].off:      0
 + * insn[1].off:      0
 + * ldimm64 rewrite:  address of the function
 + * verifier type:    PTR_TO_FUNC.
 + */
 +#define BPF_PSEUDO_FUNC               4
  
  /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
   * offset to another bpf function
@@@ -1439,7 -720,7 +1439,7 @@@ union bpf_attr 
   * parsed and used to produce a manual page. The workflow is the following,
   * and requires the rst2man utility:
   *
 - *     $ ./scripts/bpf_helpers_doc.py \
 + *     $ ./scripts/bpf_doc.py \
   *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
   *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
   *     $ man /tmp/bpf-helpers.7
   *              Use with ENCAP_L3/L4 flags to further specify the tunnel
   *              type; *len* is the length of the inner MAC header.
   *
 + *            * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
 + *              Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
 + *              L2 type as Ethernet.
 + *
   *            A call to this helper is susceptible to change the underlying
   *            packet buffer. Therefore, at load time, all checks on pointers
   *            previously done by the verifier are invalidated and must be
   *
   * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
   *    Description
-  *            Check ctx packet size against exceeding MTU of net device (based
+  *            Check packet size against exceeding MTU of net device (based
   *            on *ifindex*).  This helper will likely be used in combination
   *            with helpers that adjust/change the packet size.
   *
   *            against the current net device.  This is practical if this isn't
   *            used prior to redirect.
   *
+  *            On input *mtu_len* must be a valid pointer, else the verifier will
+  *            reject the BPF program.  If the value *mtu_len* is initialized to
+  *            zero then the ctx packet size is used.  When the value *mtu_len* is
+  *            provided as input, it specifies the L3 length that the MTU check
+  *            is done against. Remember XDP and TC length operate at L2, but
+  *            this value is L3, as it correlates to the MTU and IP-header tot_len
+  *            values, which are L3 (similar behavior to bpf_fib_lookup).
+  *
   *            The Linux kernel route table can configure MTUs on a more
   *            specific per route level, which is not provided by this helper.
   *            For route level MTU checks use the **bpf_fib_lookup**\ ()
   *
   *            On return *mtu_len* pointer contains the MTU value of the net
   *            device.  Remember the net device configured MTU is the L3 size,
-  *            which is returned here and XDP and TX length operate at L2.
+  *            which is returned here and XDP and TC length operate at L2.
   *            The helper takes this into account for you, but remember when using
-  *            MTU value in your BPF-code.  On input *mtu_len* must be a valid
-  *            pointer and be initialized (to zero), else verifier will reject
-  *            BPF program.
+  *            the MTU value in your BPF code.
   *
   *    Return
   *            * 0 on success, and populate MTU value in *mtu_len* pointer.
   *            * **BPF_MTU_CHK_RET_FRAG_NEEDED**
   *            * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
   *
 + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
 + *    Description
 + *            For each element in **map**, call **callback_fn** function with
 + *            **map**, **callback_ctx** and other map-specific parameters.
 + *            The **callback_fn** should be a static function and
 + *            the **callback_ctx** should be a pointer to the stack.
 + *            The **flags** is used to control certain aspects of the helper.
 + *            Currently, the **flags** must be 0.
 + *
 + *            The following are a list of supported map types and their
 + *            respective expected callback signatures:
 + *
 + *            BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
 + *            BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
 + *            BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
 + *
 + *            long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
 + *
 + *            For per_cpu maps, the map_value is the value on the cpu where the
 + *            bpf_prog is running.
 + *
 + *            If **callback_fn** returns 0, the helper will continue to the next
 + *            element. If the return value is 1, the helper will skip the rest of
 + *            the elements and return. Other return values are not used now.
 + *
 + *    Return
 + *            The number of traversed map elements for success, **-EINVAL** for
 + *            invalid **flags**.
   */
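
The bpf_for_each_map_elem() description above, together with BPF_PSEUDO_FUNC and the PTR_TO_FUNC/ARG_PTR_TO_FUNC verifier changes elsewhere in this merge, is what allows a BPF program to pass a static callback and a stack context into the kernel. A hedged BPF-side sketch, assuming a libbpf-style build with bpf_helpers.h (map, section, and function names are invented):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u64);
} example_counts SEC(".maps");

struct cb_ctx {
	__u64 total;
};

/* Must be a static function; the verifier matches it against the
 * callback signature documented above (ctx is a pointer to the stack).
 */
static long sum_one(struct bpf_map *map, const void *key, void *value, void *ctx)
{
	struct cb_ctx *c = ctx;

	c->total += *(__u64 *)value;
	return 0;	/* 0 = keep iterating, 1 = stop */
}

SEC("classifier")
int example_sum(struct __sk_buff *skb)
{
	struct cb_ctx c = { .total = 0 };

	bpf_for_each_map_elem(&example_counts, sum_one, &c, 0);
	bpf_printk("total %llu", c.total);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
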
  #define __BPF_FUNC_MAPPER(FN)         \
        FN(unspec),                     \
        FN(ima_inode_hash),             \
        FN(sock_from_file),             \
        FN(check_mtu),                  \
 +      FN(for_each_map_elem),          \
        /* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@@ -4920,7 -4174,6 +4926,7 @@@ enum 
        BPF_F_ADJ_ROOM_ENCAP_L4_GRE     = (1ULL << 3),
        BPF_F_ADJ_ROOM_ENCAP_L4_UDP     = (1ULL << 4),
        BPF_F_ADJ_ROOM_NO_CSUM_RESET    = (1ULL << 5),
 +      BPF_F_ADJ_ROOM_ENCAP_L2_ETH     = (1ULL << 6),
  };
  
  enum {
@@@ -5958,10 -5211,7 +5964,10 @@@ struct bpf_pidns_info 
  
  /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
  struct bpf_sk_lookup {
 -      __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
 +      union {
 +              __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
 +              __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
 +      };
  
        __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
@@@ -3,7 -3,6 +3,6 @@@
  #define __UAPI_PSAMPLE_H
  
  enum {
-       /* sampled packet metadata */
        PSAMPLE_ATTR_IIFINDEX,
        PSAMPLE_ATTR_OIFINDEX,
        PSAMPLE_ATTR_ORIGSIZE,
        PSAMPLE_ATTR_GROUP_SEQ,
        PSAMPLE_ATTR_SAMPLE_RATE,
        PSAMPLE_ATTR_DATA,
-       PSAMPLE_ATTR_TUNNEL,
-       /* commands attributes */
        PSAMPLE_ATTR_GROUP_REFCOUNT,
+       PSAMPLE_ATTR_TUNNEL,
  
 +      PSAMPLE_ATTR_PAD,
 +      PSAMPLE_ATTR_OUT_TC,            /* u16 */
 +      PSAMPLE_ATTR_OUT_TC_OCC,        /* u64, bytes */
 +      PSAMPLE_ATTR_LATENCY,           /* u64, nanoseconds */
 +      PSAMPLE_ATTR_TIMESTAMP,         /* u64, nanoseconds */
 +      PSAMPLE_ATTR_PROTO,             /* u16 */
 +
        __PSAMPLE_ATTR_MAX
  };
  
diff --combined init/Kconfig
@@@ -20,10 -20,10 +20,10 @@@ config CC_VERSION_TEX
            When the compiler is updated, Kconfig will be invoked.
  
          - Ensure full rebuild when the compiler is updated
-           include/linux/kconfig.h contains this option in the comment line so
-           fixdep adds include/config/cc/version/text.h into the auto-generated
-           dependency. When the compiler is updated, syncconfig will touch it
-           and then every file will be rebuilt.
+           include/linux/compiler-version.h contains this option in the comment
+           line so fixdep adds include/config/cc/version/text.h into the
+           auto-generated dependency. When the compiler is updated, syncconfig
+           will touch it and then every file will be rebuilt.
  
  config CC_IS_GCC
        def_bool $(success,test "$(cc-name)" = GCC)
@@@ -119,8 -119,7 +119,7 @@@ config INIT_ENV_ARG_LIMI
  
  config COMPILE_TEST
        bool "Compile also drivers which will not load"
-       depends on !UML && !S390
-       default n
+       depends on HAS_IOMEM
        help
          Some drivers can be compiled on a different platform than they are
          intended to be run on. Even though they cannot be loaded there (or even
@@@ -1709,7 -1708,6 +1708,7 @@@ config BPF_SYSCAL
        select BPF
        select IRQ_WORK
        select TASKS_TRACE_RCU
 +      select NET_SOCK_MSG if INET
        default n
        help
          Enable the bpf() system call that allows to manipulate eBPF
@@@ -109,7 -109,7 +109,7 @@@ static void *bpf_fd_inode_storage_looku
        fd = *(int *)key;
        f = fget_raw(fd);
        if (!f)
-               return NULL;
+               return ERR_PTR(-EBADF);
  
        sdata = inode_storage_lookup(f->f_inode, map, true);
        fput(f);
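
With this change a stale or closed descriptor used as the key is reported as EBADF instead of looking like a missing element. A hedged userspace sketch using libbpf's bpf_map_lookup_elem() wrapper (map_fd is assumed to refer to a BPF_MAP_TYPE_INODE_STORAGE map; names are illustrative):

#include <errno.h>
#include <bpf/bpf.h>

static int example_inode_storage_lookup(int map_fd, int target_fd, __u64 *value)
{
	if (!bpf_map_lookup_elem(map_fd, &target_fd, value))
		return 0;
	if (errno == EBADF)	/* target_fd is not an open file descriptor */
		return -EBADF;
	if (errno == ENOENT)	/* no storage attached to this inode yet */
		return -ENOENT;
	return -errno;
}
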
@@@ -237,7 -237,7 +237,7 @@@ static void inode_storage_map_free(stru
  
        smap = (struct bpf_local_storage_map *)map;
        bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
 -      bpf_local_storage_map_free(smap);
 +      bpf_local_storage_map_free(smap, NULL);
  }
  
  static int inode_storage_map_btf_id;
diff --combined kernel/bpf/verifier.c
@@@ -234,12 -234,6 +234,12 @@@ static bool bpf_pseudo_call(const struc
               insn->src_reg == BPF_PSEUDO_CALL;
  }
  
 +static bool bpf_pseudo_func(const struct bpf_insn *insn)
 +{
 +      return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
 +             insn->src_reg == BPF_PSEUDO_FUNC;
 +}
 +
  struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
        u32 btf_id;
        struct btf *ret_btf;
        u32 ret_btf_id;
 +      u32 subprogno;
  };
  
  struct btf *btf_vmlinux;
@@@ -397,24 -390,6 +397,24 @@@ __printf(3, 4) static void verbose_linf
        env->prev_linfo = linfo;
  }
  
 +static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 +                                 struct bpf_reg_state *reg,
 +                                 struct tnum *range, const char *ctx,
 +                                 const char *reg_name)
 +{
 +      char tn_buf[48];
 +
 +      verbose(env, "At %s the register %s ", ctx, reg_name);
 +      if (!tnum_is_unknown(reg->var_off)) {
 +              tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 +              verbose(env, "has value %s", tn_buf);
 +      } else {
 +              verbose(env, "has unknown scalar value");
 +      }
 +      tnum_strn(tn_buf, sizeof(tn_buf), *range);
 +      verbose(env, " should have been in %s\n", tn_buf);
 +}
 +
  static bool type_is_pkt_pointer(enum bpf_reg_type type)
  {
        return type == PTR_TO_PACKET ||
@@@ -434,7 -409,6 +434,7 @@@ static bool reg_type_not_null(enum bpf_
        return type == PTR_TO_SOCKET ||
                type == PTR_TO_TCP_SOCK ||
                type == PTR_TO_MAP_VALUE ||
 +              type == PTR_TO_MAP_KEY ||
                type == PTR_TO_SOCK_COMMON;
  }
  
@@@ -477,8 -451,7 +477,8 @@@ static bool arg_type_may_be_null(enum b
               type == ARG_PTR_TO_MEM_OR_NULL ||
               type == ARG_PTR_TO_CTX_OR_NULL ||
               type == ARG_PTR_TO_SOCKET_OR_NULL ||
 -             type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
 +             type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
 +             type == ARG_PTR_TO_STACK_OR_NULL;
  }
  
  /* Determine whether the function releases some resources allocated by another
@@@ -568,8 -541,6 +568,8 @@@ static const char * const reg_type_str[
        [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
        [PTR_TO_RDWR_BUF]       = "rdwr_buf",
        [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
 +      [PTR_TO_FUNC]           = "func",
 +      [PTR_TO_MAP_KEY]        = "map_key",
  };
  
  static char slot_type_char[] = {
@@@ -641,7 -612,6 +641,7 @@@ static void print_verifier_state(struc
                        if (type_is_pkt_pointer(t))
                                verbose(env, ",r=%d", reg->range);
                        else if (t == CONST_PTR_TO_MAP ||
 +                               t == PTR_TO_MAP_KEY ||
                                 t == PTR_TO_MAP_VALUE ||
                                 t == PTR_TO_MAP_VALUE_OR_NULL)
                                verbose(env, ",ks=%d,vs=%d",
@@@ -1549,7 -1519,7 +1549,7 @@@ static int add_subprog(struct bpf_verif
        }
        ret = find_subprog(env, off);
        if (ret >= 0)
 -              return 0;
 +              return ret;
        if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
                verbose(env, "too many subprograms\n");
                return -E2BIG;
        env->subprog_info[env->subprog_cnt++].start = off;
        sort(env->subprog_info, env->subprog_cnt,
             sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 -      return 0;
 +      return env->subprog_cnt - 1;
  }
  
  static int check_subprogs(struct bpf_verifier_env *env)
  
        /* determine subprog starts. The end is one before the next starts */
        for (i = 0; i < insn_cnt; i++) {
 +              if (bpf_pseudo_func(insn + i)) {
 +                      if (!env->bpf_capable) {
 +                              verbose(env,
 +                                      "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
 +                              return -EPERM;
 +                      }
 +                      ret = add_subprog(env, i + insn[i].imm + 1);
 +                      if (ret < 0)
 +                              return ret;
 +                      /* remember subprog */
 +                      insn[i + 1].imm = ret;
 +                      continue;
 +              }
                if (!bpf_pseudo_call(insn + i))
                        continue;
                if (!env->bpf_capable) {
@@@ -2338,8 -2295,6 +2338,8 @@@ static bool is_spillable_regtype(enum b
        case PTR_TO_PERCPU_BTF_ID:
        case PTR_TO_MEM:
        case PTR_TO_MEM_OR_NULL:
 +      case PTR_TO_FUNC:
 +      case PTR_TO_MAP_KEY:
                return true;
        default:
                return false;
@@@ -2944,10 -2899,6 +2944,10 @@@ static int __check_mem_access(struct bp
  
        reg = &cur_regs(env)[regno];
        switch (reg->type) {
 +      case PTR_TO_MAP_KEY:
 +              verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
 +                      mem_size, off, size);
 +              break;
        case PTR_TO_MAP_VALUE:
                verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        mem_size, off, size);
@@@ -3353,9 -3304,6 +3353,9 @@@ static int check_ptr_alignment(struct b
        case PTR_TO_FLOW_KEYS:
                pointer_desc = "flow keys ";
                break;
 +      case PTR_TO_MAP_KEY:
 +              pointer_desc = "key ";
 +              break;
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
@@@ -3457,7 -3405,7 +3457,7 @@@ process_func
  continue_func:
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
 -              if (!bpf_pseudo_call(insn + i))
 +              if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
@@@ -3894,19 -3842,7 +3894,19 @@@ static int check_mem_access(struct bpf_
        /* for access checks, reg->off is just part of off */
        off += reg->off;
  
 -      if (reg->type == PTR_TO_MAP_VALUE) {
 +      if (reg->type == PTR_TO_MAP_KEY) {
 +              if (t == BPF_WRITE) {
 +                      verbose(env, "write to change key R%d not allowed\n", regno);
 +                      return -EACCES;
 +              }
 +
 +              err = check_mem_region_access(env, regno, off, size,
 +                                            reg->map_ptr->key_size, false);
 +              if (err)
 +                      return err;
 +              if (value_regno >= 0)
 +                      mark_reg_unknown(env, regs, value_regno);
 +      } else if (reg->type == PTR_TO_MAP_VALUE) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into map\n", value_regno);
@@@ -4322,9 -4258,6 +4322,9 @@@ static int check_helper_mem_access(stru
        case PTR_TO_PACKET_META:
                return check_packet_access(env, regno, reg->off, access_size,
                                           zero_size_allowed);
 +      case PTR_TO_MAP_KEY:
 +              return check_mem_region_access(env, regno, reg->off, access_size,
 +                                             reg->map_ptr->key_size, false);
        case PTR_TO_MAP_VALUE:
                if (check_map_access_type(env, regno, reg->off, access_size,
                                          meta && meta->raw_mode ? BPF_WRITE :
@@@ -4541,7 -4474,6 +4541,7 @@@ static const struct bpf_reg_types map_k
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
  };
@@@ -4573,7 -4505,6 +4573,7 @@@ static const struct bpf_reg_types mem_t
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
                PTR_TO_MEM,
                PTR_TO_RDONLY_BUF,
@@@ -4586,7 -4517,6 +4586,7 @@@ static const struct bpf_reg_types int_p
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
  };
@@@ -4599,8 -4529,6 +4599,8 @@@ static const struct bpf_reg_types const
  static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
  static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
  static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
 +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
  
  static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_MAP_KEY]            = &map_key_value_types,
        [ARG_PTR_TO_INT]                = &int_ptr_types,
        [ARG_PTR_TO_LONG]               = &int_ptr_types,
        [ARG_PTR_TO_PERCPU_BTF_ID]      = &percpu_btf_ptr_types,
 +      [ARG_PTR_TO_FUNC]               = &func_ptr_types,
 +      [ARG_PTR_TO_STACK_OR_NULL]      = &stack_ptr_types,
  };
  
  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@@ -4812,8 -4738,6 +4812,8 @@@ skip_type_check
                        verbose(env, "verifier internal error\n");
                        return -EFAULT;
                }
 +      } else if (arg_type == ARG_PTR_TO_FUNC) {
 +              meta->subprogno = reg->subprogno;
        } else if (arg_type_is_mem_ptr(arg_type)) {
                /* The access to this pointer is only checked when we hit the
                 * next is_mem_size argument below.
@@@ -5334,19 -5258,13 +5334,19 @@@ static void clear_caller_saved_regs(str
        }
  }
  
 -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 -                         int *insn_idx)
 +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee,
 +                                 int insn_idx);
 +
 +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                           int *insn_idx, int subprog,
 +                           set_callee_state_fn set_callee_state_cb)
  {
        struct bpf_verifier_state *state = env->cur_state;
        struct bpf_func_info_aux *func_info_aux;
        struct bpf_func_state *caller, *callee;
 -      int i, err, subprog, target_insn;
 +      int err;
        bool is_global = false;
  
        if (state->curframe + 1 >= MAX_CALL_FRAMES) {
                return -E2BIG;
        }
  
 -      target_insn = *insn_idx + insn->imm;
 -      subprog = find_subprog(env, target_insn + 1);
 -      if (subprog < 0) {
 -              verbose(env, "verifier bug. No program starts at insn %d\n",
 -                      target_insn + 1);
 -              return -EFAULT;
 -      }
 -
        caller = state->frame[state->curframe];
        if (state->frame[state->curframe + 1]) {
                verbose(env, "verifier bug. Frame %d already allocated\n",
        if (err)
                return err;
  
 -      /* copy r1 - r5 args that callee can access.  The copy includes parent
 -       * pointers, which connects us up to the liveness chain
 -       */
 -      for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 -              callee->regs[i] = caller->regs[i];
 +      err = set_callee_state_cb(env, caller, callee, *insn_idx);
 +      if (err)
 +              return err;
  
        clear_caller_saved_regs(env, caller->regs);
  
        state->curframe++;
  
        /* and go analyze first insn of the callee */
 -      *insn_idx = target_insn;
 +      *insn_idx = env->subprog_info[subprog].start - 1;
  
        if (env->log.level & BPF_LOG_LEVEL) {
                verbose(env, "caller:\n");
        return 0;
  }
  
 +int map_set_for_each_callback_args(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee)
 +{
 +      /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
 +       *      void *callback_ctx, u64 flags);
 +       * callback_fn(struct bpf_map *map, void *key, void *value,
 +       *      void *callback_ctx);
 +       */
 +      callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
 +
 +      callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
 +      __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
 +      callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
 +
 +      callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
 +      __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
 +      callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
 +
 +      /* pointer to stack or null */
 +      callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
 +
 +      /* unused */
 +      __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 +      return 0;
 +}
 +
 +static int set_callee_state(struct bpf_verifier_env *env,
 +                          struct bpf_func_state *caller,
 +                          struct bpf_func_state *callee, int insn_idx)
 +{
 +      int i;
 +
 +      /* copy r1 - r5 args that callee can access.  The copy includes parent
 +       * pointers, which connects us up to the liveness chain
 +       */
 +      for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 +              callee->regs[i] = caller->regs[i];
 +      return 0;
 +}
 +
 +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                         int *insn_idx)
 +{
 +      int subprog, target_insn;
 +
 +      target_insn = *insn_idx + insn->imm + 1;
 +      subprog = find_subprog(env, target_insn);
 +      if (subprog < 0) {
 +              verbose(env, "verifier bug. No program starts at insn %d\n",
 +                      target_insn);
 +              return -EFAULT;
 +      }
 +
 +      return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
 +}
 +
 +static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 +                                     struct bpf_func_state *caller,
 +                                     struct bpf_func_state *callee,
 +                                     int insn_idx)
 +{
 +      struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
 +      struct bpf_map *map;
 +      int err;
 +
 +      if (bpf_map_ptr_poisoned(insn_aux)) {
 +              verbose(env, "tail_call abusing map_ptr\n");
 +              return -EINVAL;
 +      }
 +
 +      map = BPF_MAP_PTR(insn_aux->map_ptr_state);
 +      if (!map->ops->map_set_for_each_callback_args ||
 +          !map->ops->map_for_each_callback) {
 +              verbose(env, "callback function not allowed for map\n");
 +              return -ENOTSUPP;
 +      }
 +
 +      err = map->ops->map_set_for_each_callback_args(env, caller, callee);
 +      if (err)
 +              return err;
 +
 +      callee->in_callback_fn = true;
 +      return 0;
 +}
 +
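[Editor's note] For orientation, a minimal BPF-C sketch (illustrative only, not part of this patch; the map, context struct, attach point and function names are made up) of the callback shape that set_map_elem_callback_state() and map_set_for_each_callback_args() above prepare: the callback must be a static function (see the BPF_PSEUDO_FUNC handling in check_ld_imm() further down), R2 arrives as PTR_TO_MAP_KEY and is read-only, R3 as PTR_TO_MAP_VALUE, R4 as the caller's callback_ctx, and the return value is constrained to [0, 1] by prepare_func_exit().

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 16);
		__type(key, __u32);
		__type(value, __u64);
	} my_map SEC(".maps");

	struct callback_ctx {
		__u64 sum;
	};

	/* R1 = map, R2 = PTR_TO_MAP_KEY (read-only), R3 = PTR_TO_MAP_VALUE,
	 * R4 = callback_ctx from the caller's stack; must be static.
	 */
	static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
			      struct callback_ctx *data)
	{
		data->sum += *val;
		return 0;	/* 0 = continue, 1 = stop; range enforced in prepare_func_exit() */
	}

	SEC("classifier")	/* attach point is illustrative */
	int sum_map(struct __sk_buff *skb)
	{
		struct callback_ctx data = {};

		bpf_for_each_map_elem(&my_map, sum_elem, &data, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";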
  static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
  {
        struct bpf_verifier_state *state = env->cur_state;
  
        state->curframe--;
        caller = state->frame[state->curframe];
 -      /* return to the caller whatever r0 had in the callee */
 -      caller->regs[BPF_REG_0] = *r0;
 +      if (callee->in_callback_fn) {
 +              /* enforce R0 return value range [0, 1]. */
 +              struct tnum range = tnum_range(0, 1);
 +
 +              if (r0->type != SCALAR_VALUE) {
 +                      verbose(env, "R0 not a scalar value\n");
 +                      return -EACCES;
 +              }
 +              if (!tnum_in(range, r0->var_off)) {
 +                      verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 +                      return -EINVAL;
 +              }
 +      } else {
 +              /* return to the caller whatever r0 had in the callee */
 +              caller->regs[BPF_REG_0] = *r0;
 +      }
  
        /* Transfer references to the caller */
        err = transfer_reference_state(caller, callee);
@@@ -5608,9 -5436,7 +5608,9 @@@ record_func_map(struct bpf_verifier_en
            func_id != BPF_FUNC_map_delete_elem &&
            func_id != BPF_FUNC_map_push_elem &&
            func_id != BPF_FUNC_map_pop_elem &&
 -          func_id != BPF_FUNC_map_peek_elem)
 +          func_id != BPF_FUNC_map_peek_elem &&
 +          func_id != BPF_FUNC_for_each_map_elem &&
 +          func_id != BPF_FUNC_redirect_map)
                return 0;
  
        if (map == NULL) {
@@@ -5691,18 -5517,15 +5691,18 @@@ static int check_reference_leak(struct 
        return state->acquired_refs ? -EINVAL : 0;
  }
  
 -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                           int *insn_idx_p)
  {
        const struct bpf_func_proto *fn = NULL;
        struct bpf_reg_state *regs;
        struct bpf_call_arg_meta meta;
 +      int insn_idx = *insn_idx_p;
        bool changes_data;
 -      int i, err;
 +      int i, err, func_id;
  
        /* find function prototype */
 +      func_id = insn->imm;
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
                verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
                        func_id);
  
        meta.func_id = func_id;
        /* check args */
 -      for (i = 0; i < 5; i++) {
 +      for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
                err = check_func_arg(env, i, &meta, fn);
                if (err)
                        return err;
                return -EINVAL;
        }
  
 +      if (func_id == BPF_FUNC_for_each_map_elem) {
 +              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 +                                      set_map_elem_callback_state);
 +              if (err < 0)
 +                      return -EINVAL;
 +      }
 +
        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
                mark_reg_not_init(env, regs, caller_saved[i]);
@@@ -6045,10 -5861,14 +6045,14 @@@ static int retrieve_ptr_limit(const str
  {
        bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
                            (opcode == BPF_SUB && !off_is_neg);
-       u32 off;
+       u32 off, max;
  
        switch (ptr_reg->type) {
        case PTR_TO_STACK:
+               /* Offset 0 is out-of-bounds, but acceptable start for the
+                * left direction, see BPF_REG_FP.
+                */
+               max = MAX_BPF_STACK + mask_to_left;
                /* Indirect variable offset stack access is prohibited in
                 * unprivileged mode so it's not handled here.
                 */
                if (mask_to_left)
                        *ptr_limit = MAX_BPF_STACK + off;
                else
-                       *ptr_limit = -off;
-               return 0;
+                       *ptr_limit = -off - 1;
+               return *ptr_limit >= max ? -ERANGE : 0;
 +      case PTR_TO_MAP_KEY:
 +              /* Currently, this code is not exercised as the only use
 +               * is bpf_for_each_map_elem() helper which requires
 +               * bpf_capable. The code has been tested manually for
 +               * future use.
 +               */
 +              if (mask_to_left) {
 +                      *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
 +              } else {
 +                      off = ptr_reg->smin_value + ptr_reg->off;
 +                      *ptr_limit = ptr_reg->map_ptr->key_size - off;
 +              }
 +              return 0;
        case PTR_TO_MAP_VALUE:
+               max = ptr_reg->map_ptr->value_size;
                if (mask_to_left) {
                        *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
                } else {
                        off = ptr_reg->smin_value + ptr_reg->off;
-                       *ptr_limit = ptr_reg->map_ptr->value_size - off;
+                       *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
                }
-               return 0;
+               return *ptr_limit >= max ? -ERANGE : 0;
        default:
                return -EINVAL;
        }
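[Editor's note] A worked example of the tightened limit above (the numbers are illustrative, not from the patch): for a PTR_TO_MAP_VALUE register with value_size = 16, ptr_reg->off = 0 and smin_value = 0 on a BPF_ADD with a non-negative offset, the old code set *ptr_limit = 16 and do_misc_fixups() later loaded alu_limit - 1 = 15 into BPF_REG_AX, while the new code sets *ptr_limit = 15 and loads it unchanged, so the emitted masking sequence is identical. The behavioural change is at the boundary: if the fixed offset already equals value_size, the new computation yields (u32)-1, which is >= max, so retrieve_ptr_limit() returns -ERANGE and sanitize_ptr_alu() now propagates the error instead of emitting a wrapped mask constant.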
@@@ -6101,7 -5909,7 +6106,7 @@@ static int update_alu_sanitation_state(
             aux->alu_limit != alu_limit))
                return -EACCES;
  
 -      /* Corresponding fixup done in fixup_bpf_calls(). */
 +      /* Corresponding fixup done in do_misc_fixups(). */
        aux->alu_state = alu_state;
        aux->alu_limit = alu_limit;
        return 0;
@@@ -6131,6 -5939,7 +6136,7 @@@ static int sanitize_ptr_alu(struct bpf_
        u32 alu_state, alu_limit;
        struct bpf_reg_state tmp;
        bool ret;
+       int err;
  
        if (can_skip_alu_sanitation(env, insn))
                return 0;
        alu_state |= ptr_is_dst_reg ?
                     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
  
-       if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
-               return 0;
-       if (update_alu_sanitation_state(aux, alu_state, alu_limit))
-               return -EACCES;
+       err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+       if (err < 0)
+               return err;
+       err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+       if (err < 0)
+               return err;
  do_sim:
        /* Simulate and find potential out-of-bounds access under
         * speculative execution from truncation as a result of
@@@ -6272,7 -6084,6 +6281,7 @@@ static int adjust_ptr_min_max_vals(stru
                verbose(env, "R%d pointer arithmetic on %s prohibited\n",
                        dst, reg_type_str[ptr_reg->type]);
                return -EACCES;
 +      case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
                        verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
        case BPF_ADD:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to add from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                /* We can take a fixed offset as long as it doesn't overflow
        case BPF_SUB:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                if (dst_reg == off_reg) {
@@@ -8452,24 -8263,6 +8461,24 @@@ static int check_ld_imm(struct bpf_veri
                return 0;
        }
  
 +      if (insn->src_reg == BPF_PSEUDO_FUNC) {
 +              struct bpf_prog_aux *aux = env->prog->aux;
 +              u32 subprogno = insn[1].imm;
 +
 +              if (!aux->func_info) {
 +                      verbose(env, "missing btf func_info\n");
 +                      return -EINVAL;
 +              }
 +              if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
 +                      verbose(env, "callback function not static\n");
 +                      return -EINVAL;
 +              }
 +
 +              dst_reg->type = PTR_TO_FUNC;
 +              dst_reg->subprogno = subprogno;
 +              return 0;
 +      }
 +
        map = env->used_maps[aux->map_index];
        mark_reg_known_zero(env, regs, insn->dst_reg);
        dst_reg->map_ptr = map;
@@@ -8698,7 -8491,17 +8707,7 @@@ static int check_return_code(struct bpf
        }
  
        if (!tnum_in(range, reg->var_off)) {
 -              char tn_buf[48];
 -
 -              verbose(env, "At program exit the register R0 ");
 -              if (!tnum_is_unknown(reg->var_off)) {
 -                      tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 -                      verbose(env, "has value %s", tn_buf);
 -              } else {
 -                      verbose(env, "has unknown scalar value");
 -              }
 -              tnum_strn(tn_buf, sizeof(tn_buf), range);
 -              verbose(env, " should have been in %s\n", tn_buf);
 +              verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
                return -EINVAL;
        }
  
@@@ -8825,27 -8628,6 +8834,27 @@@ static int push_insn(int t, int w, int 
        return DONE_EXPLORING;
  }
  
 +static int visit_func_call_insn(int t, int insn_cnt,
 +                              struct bpf_insn *insns,
 +                              struct bpf_verifier_env *env,
 +                              bool visit_callee)
 +{
 +      int ret;
 +
 +      ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 +      if (ret)
 +              return ret;
 +
 +      if (t + 1 < insn_cnt)
 +              init_explored_state(env, t + 1);
 +      if (visit_callee) {
 +              init_explored_state(env, t);
 +              ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
 +                              env, false);
 +      }
 +      return ret;
 +}
 +
  /* Visits the instruction at index t and returns one of the following:
   *  < 0 - an error occurred
   *  DONE_EXPLORING - the instruction was fully explored
@@@ -8856,9 -8638,6 +8865,9 @@@ static int visit_insn(int t, int insn_c
        struct bpf_insn *insns = env->prog->insnsi;
        int ret;
  
 +      if (bpf_pseudo_func(insns + t))
 +              return visit_func_call_insn(t, insn_cnt, insns, env, true);
 +
        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insns[t].code) != BPF_JMP &&
            BPF_CLASS(insns[t].code) != BPF_JMP32)
                return DONE_EXPLORING;
  
        case BPF_CALL:
 -              ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 -              if (ret)
 -                      return ret;
 -
 -              if (t + 1 < insn_cnt)
 -                      init_explored_state(env, t + 1);
 -              if (insns[t].src_reg == BPF_PSEUDO_CALL) {
 -                      init_explored_state(env, t);
 -                      ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
 -                                      env, false);
 -              }
 -              return ret;
 +              return visit_func_call_insn(t, insn_cnt, insns, env,
 +                                          insns[t].src_reg == BPF_PSEUDO_CALL);
  
        case BPF_JA:
                if (BPF_SRC(insns[t].code) != BPF_K)
@@@ -9276,6 -9065,10 +9285,10 @@@ static int check_btf_info(struct bpf_ve
        btf = btf_get_by_fd(attr->prog_btf_fd);
        if (IS_ERR(btf))
                return PTR_ERR(btf);
+       if (btf_is_kernel(btf)) {
+               btf_put(btf);
+               return -EACCES;
+       }
        env->prog->aux->btf = btf;
  
        err = check_btf_func(env, attr, uattr);
@@@ -9479,7 -9272,6 +9492,7 @@@ static bool regsafe(struct bpf_reg_stat
                         */
                        return false;
                }
 +      case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                /* If the new min/max/var_off satisfy the old ones and
                 * everything else matches, we are OK.
@@@ -10326,9 -10118,10 +10339,9 @@@ static int do_check(struct bpf_verifier
                                if (insn->src_reg == BPF_PSEUDO_CALL)
                                        err = check_func_call(env, insn, &env->insn_idx);
                                else
 -                                      err = check_helper_call(env, insn->imm, env->insn_idx);
 +                                      err = check_helper_call(env, insn, &env->insn_idx);
                                if (err)
                                        return err;
 -
                        } else if (opcode == BPF_JA) {
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    insn->imm != 0 ||
@@@ -10757,12 -10550,6 +10770,12 @@@ static int resolve_pseudo_ldimm64(struc
                                goto next_insn;
                        }
  
 +                      if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
 +                              aux = &env->insn_aux_data[i];
 +                              aux->ptr_type = PTR_TO_FUNC;
 +                              goto next_insn;
 +                      }
 +
                        /* In final convert_pseudo_ld_imm64() step, this is
                         * converted into regular 64-bit imm load insn.
                         */
@@@ -10895,13 -10682,9 +10908,13 @@@ static void convert_pseudo_ld_imm64(str
        int insn_cnt = env->prog->len;
        int i;
  
 -      for (i = 0; i < insn_cnt; i++, insn++)
 -              if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
 -                      insn->src_reg = 0;
 +      for (i = 0; i < insn_cnt; i++, insn++) {
 +              if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
 +                      continue;
 +              if (insn->src_reg == BPF_PSEUDO_FUNC)
 +                      continue;
 +              insn->src_reg = 0;
 +      }
  }
  
  /* single env->prog->insni[off] instruction was replaced with the range
@@@ -11540,12 -11323,6 +11553,12 @@@ static int jit_subprogs(struct bpf_veri
                return 0;
  
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      env->insn_aux_data[i].call_imm = insn->imm;
 +                      /* subprog is encoded in insn[1].imm */
 +                      continue;
 +              }
 +
                if (!bpf_pseudo_call(insn))
                        continue;
                /* Upon error here we cannot fall back to interpreter but
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
 +                      if (bpf_pseudo_func(insn)) {
 +                              subprog = insn[1].imm;
 +                              insn[0].imm = (u32)(long)func[subprog]->bpf_func;
 +                              insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
 +                              continue;
 +                      }
                        if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      insn[0].imm = env->insn_aux_data[i].call_imm;
 +                      insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
 +                      continue;
 +              }
                if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
@@@ -11795,14 -11561,6 +11808,14 @@@ static int fixup_call_args(struct bpf_v
                return -EINVAL;
        }
        for (i = 0; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      /* When JIT fails, the progs with callback calls
 +                       * have to be rejected, since the interpreter doesn't support them yet.
 +                       */
 +                      verbose(env, "callbacks are not allowed in non-JITed programs\n");
 +                      return -EINVAL;
 +              }
 +
                if (!bpf_pseudo_call(insn))
                        continue;
                depth = get_callee_stack_depth(env, insn, i);
        return err;
  }
  
 -/* fixup insn->imm field of bpf_call instructions
 - * and inline eligible helpers as explicit sequence of BPF instructions
 - *
 - * this function is called after eBPF program passed verification
 +/* Do various post-verification rewrites in a single program pass.
 + * These rewrites simplify JIT and interpreter implementations.
   */
 -static int fixup_bpf_calls(struct bpf_verifier_env *env)
 +static int do_misc_fixups(struct bpf_verifier_env *env)
  {
        struct bpf_prog *prog = env->prog;
        bool expect_blinding = bpf_jit_blinding_enabled(prog);
        int i, ret, cnt, delta = 0;
  
        for (i = 0; i < insn_cnt; i++, insn++) {
 +              /* Make divide-by-zero exceptions impossible. */
                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                        continue;
                }
  
 +              /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
                if (BPF_CLASS(insn->code) == BPF_LD &&
                    (BPF_MODE(insn->code) == BPF_ABS ||
                     BPF_MODE(insn->code) == BPF_IND)) {
                        continue;
                }
  
 +              /* Rewrite pointer arithmetic to mitigate speculation attacks. */
                if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
                        if (isneg)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
                     insn->imm == BPF_FUNC_map_delete_elem ||
                     insn->imm == BPF_FUNC_map_push_elem   ||
                     insn->imm == BPF_FUNC_map_pop_elem    ||
 -                   insn->imm == BPF_FUNC_map_peek_elem)) {
 +                   insn->imm == BPF_FUNC_map_peek_elem   ||
 +                   insn->imm == BPF_FUNC_redirect_map)) {
                        aux = &env->insn_aux_data[i + delta];
                        if (bpf_map_ptr_poisoned(aux))
                                goto patch_call_imm;
                                     (int (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
                                     (int (*)(struct bpf_map *map, void *value))NULL));
 +                      BUILD_BUG_ON(!__same_type(ops->map_redirect,
 +                                   (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
 +
  patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
                                insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
                                            __bpf_call_base;
                                continue;
 +                      case BPF_FUNC_redirect_map:
 +                              insn->imm = BPF_CAST_CALL(ops->map_redirect) -
 +                                          __bpf_call_base;
 +                              continue;
                        }
  
                        goto patch_call_imm;
                }
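[Editor's note] For orientation, the kind of call this map-ops inlining serves — a hedged XDP sketch (map layout, names and the fallback-action flag usage are illustrative, not taken from this patch) whose bpf_redirect_map() invocation has its imm rewritten to the map's ->map_redirect op by the block above:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_DEVMAP);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u32);	/* target ifindex */
	} tx_port SEC(".maps");

	SEC("xdp")
	int xdp_redirect_slot0(struct xdp_md *ctx)
	{
		/* redirect everything to the device stored at slot 0,
		 * falling back to XDP_PASS if the slot is empty
		 */
		return bpf_redirect_map(&tx_port, 0, XDP_PASS);
	}

	char LICENSE[] SEC("license") = "GPL";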
  
 +              /* Implement bpf_jiffies64 inline. */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_jiffies64) {
                        struct bpf_insn ld_jiffies_addr[2] = {
@@@ -12935,7 -12683,7 +12948,7 @@@ skip_full_check
                ret = convert_ctx_accesses(env);
  
        if (ret == 0)
 -              ret = fixup_bpf_calls(env);
 +              ret = do_misc_fixups(env);
  
        /* do 32-bit optimization after insn patching has done so those patched
         * insns could be handled correctly.
diff --combined kernel/fork.c
@@@ -96,7 -96,6 +96,7 @@@
  #include <linux/kasan.h>
  #include <linux/scs.h>
  #include <linux/io_uring.h>
 +#include <linux/bpf.h>
  
  #include <asm/pgalloc.h>
  #include <linux/uaccess.h>
@@@ -735,7 -734,6 +735,7 @@@ void __put_task_struct(struct task_stru
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
 +      bpf_task_storage_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
@@@ -996,6 -994,13 +996,13 @@@ static void mm_init_owner(struct mm_str
  #endif
  }
  
+ static void mm_init_pasid(struct mm_struct *mm)
+ {
+ #ifdef CONFIG_IOMMU_SUPPORT
+       mm->pasid = INIT_PASID;
+ #endif
+ }
  static void mm_init_uprobes_state(struct mm_struct *mm)
  {
  #ifdef CONFIG_UPROBES
@@@ -1026,6 -1031,7 +1033,7 @@@ static struct mm_struct *mm_init(struc
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
+       mm_init_pasid(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
@@@ -2066,9 -2072,6 +2074,9 @@@ static __latent_entropy struct task_str
        p->sequential_io        = 0;
        p->sequential_io_avg    = 0;
  #endif
 +#ifdef CONFIG_BPF_SYSCALL
 +      RCU_INIT_POINTER(p->bpf_storage, NULL);
 +#endif
  
        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
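[Editor's note] The kernel/fork.c hunks above hook task-local BPF storage into task lifetime: the pointer starts out NULL in the child and is freed in __put_task_struct(). A hedged BPF-C sketch of the consumer side — the map name, value layout and attach point are illustrative, while BPF_MAP_TYPE_TASK_STORAGE and bpf_task_storage_get() are the existing task-storage pieces this series enables for tracing programs:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} exec_count SEC(".maps");

	SEC("tp_btf/sched_process_exec")
	int BPF_PROG(count_exec, struct task_struct *p, pid_t old_pid,
		     struct linux_binprm *bprm)
	{
		__u64 *cnt;

		/* per-task counter, created on first use */
		cnt = bpf_task_storage_get(&exec_count, p, 0,
					   BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (cnt)
			(*cnt)++;
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";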
diff --combined net/core/dev.c
@@@ -848,52 -848,6 +848,52 @@@ int dev_fill_metadata_dst(struct net_de
  }
  EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  
 +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
 +{
 +      int k = stack->num_paths++;
 +
 +      if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
 +              return NULL;
 +
 +      return &stack->path[k];
 +}
 +
 +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 +                        struct net_device_path_stack *stack)
 +{
 +      const struct net_device *last_dev;
 +      struct net_device_path_ctx ctx = {
 +              .dev    = dev,
 +              .daddr  = daddr,
 +      };
 +      struct net_device_path *path;
 +      int ret = 0;
 +
 +      stack->num_paths = 0;
 +      while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
 +              last_dev = ctx.dev;
 +              path = dev_fwd_path(stack);
 +              if (!path)
 +                      return -1;
 +
 +              memset(path, 0, sizeof(struct net_device_path));
 +              ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
 +              if (ret < 0)
 +                      return -1;
 +
 +              if (WARN_ON_ONCE(last_dev == ctx.dev))
 +                      return -1;
 +      }
 +      path = dev_fwd_path(stack);
 +      if (!path)
 +              return -1;
 +      path->type = DEV_PATH_ETHERNET;
 +      path->dev = ctx.dev;
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(dev_fill_forward_path);
 +
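[Editor's note] A hedged sketch of how a caller of the new dev_fill_forward_path() above might walk the resolved path stack; the surrounding function is hypothetical, the struct fields (num_paths, path[].type, path[].dev) are the ones introduced by this series:

	/* Illustrative only; not part of this patch. */
	static void dump_forward_path(const struct net_device *dev, const u8 *daddr)
	{
		struct net_device_path_stack stack;
		int i;

		if (dev_fill_forward_path(dev, daddr, &stack) < 0)
			return;	/* a device in the chain has no ndo_fill_forward_path */

		for (i = 0; i < stack.num_paths; i++)
			pr_debug("hop %d: type %d via %s\n", i,
				 stack.path[i].type,
				 stack.path[i].dev ? stack.path[i].dev->name : "?");
	}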
  /**
   *    __dev_get_by_name       - find a device by its name
   *    @net: the applicable net namespace
@@@ -1230,6 -1184,18 +1230,18 @@@ static int __dev_alloc_name(struct net 
                        return -ENOMEM;
  
                for_each_netdev(net, d) {
+                       struct netdev_name_node *name_node;
+                       list_for_each_entry(name_node, &d->name_node->list, list) {
+                               if (!sscanf(name_node->name, name, &i))
+                                       continue;
+                               if (i < 0 || i >= max_netdevices)
+                                       continue;
+                               /*  avoid cases where sscanf is not exact inverse of printf */
+                               snprintf(buf, IFNAMSIZ, name, i);
+                               if (!strncmp(buf, name_node->name, IFNAMSIZ))
+                                       set_bit(i, inuse);
+                       }
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
@@@ -2497,14 -2463,16 +2509,14 @@@ int netdev_txq_to_tc(struct net_device 
  EXPORT_SYMBOL(netdev_txq_to_tc);
  
  #ifdef CONFIG_XPS
 -struct static_key xps_needed __read_mostly;
 -EXPORT_SYMBOL(xps_needed);
 -struct static_key xps_rxqs_needed __read_mostly;
 -EXPORT_SYMBOL(xps_rxqs_needed);
 +static struct static_key xps_needed __read_mostly;
 +static struct static_key xps_rxqs_needed __read_mostly;
  static DEFINE_MUTEX(xps_map_mutex);
  #define xmap_dereference(P)           \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
  
  static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 -                           int tci, u16 index)
 +                           struct xps_dev_maps *old_maps, int tci, u16 index)
  {
        struct xps_map *map = NULL;
        int pos;
                        break;
                }
  
 +              if (old_maps)
 +                      RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                kfree_rcu(map, rcu);
                return false;
@@@ -2537,7 -2503,7 +2549,7 @@@ static bool remove_xps_queue_cpu(struc
                                 struct xps_dev_maps *dev_maps,
                                 int cpu, u16 offset, u16 count)
  {
 -      int num_tc = dev->num_tc ? : 1;
 +      int num_tc = dev_maps->num_tc;
        bool active = false;
        int tci;
  
                int i, j;
  
                for (i = count, j = offset; i--; j++) {
 -                      if (!remove_xps_queue(dev_maps, tci, j))
 +                      if (!remove_xps_queue(dev_maps, NULL, tci, j))
                                break;
                }
  
  
  static void reset_xps_maps(struct net_device *dev,
                           struct xps_dev_maps *dev_maps,
 -                         bool is_rxqs_map)
 +                         enum xps_map_type type)
  {
 -      if (is_rxqs_map) {
 -              static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 -              RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 -      } else {
 -              RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 -      }
        static_key_slow_dec_cpuslocked(&xps_needed);
 +      if (type == XPS_RXQS)
 +              static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 +
 +      RCU_INIT_POINTER(dev->xps_maps[type], NULL);
 +
        kfree_rcu(dev_maps, rcu);
  }
  
 -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 -                         struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 -                         u16 offset, u16 count, bool is_rxqs_map)
 +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
 +                         u16 offset, u16 count)
  {
 +      struct xps_dev_maps *dev_maps;
        bool active = false;
        int i, j;
  
 -      for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 -           j < nr_ids;)
 -              active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 -                                             count);
 +      dev_maps = xmap_dereference(dev->xps_maps[type]);
 +      if (!dev_maps)
 +              return;
 +
 +      for (j = 0; j < dev_maps->nr_ids; j++)
 +              active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
        if (!active)
 -              reset_xps_maps(dev, dev_maps, is_rxqs_map);
 +              reset_xps_maps(dev, dev_maps, type);
  
 -      if (!is_rxqs_map) {
 -              for (i = offset + (count - 1); count--; i--) {
 +      if (type == XPS_CPUS) {
 +              for (i = offset + (count - 1); count--; i--)
                        netdev_queue_numa_node_write(
 -                              netdev_get_tx_queue(dev, i),
 -                              NUMA_NO_NODE);
 -              }
 +                              netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
        }
  }
  
  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
  {
 -      const unsigned long *possible_mask = NULL;
 -      struct xps_dev_maps *dev_maps;
 -      unsigned int nr_ids;
 -
        if (!static_key_false(&xps_needed))
                return;
  
        cpus_read_lock();
        mutex_lock(&xps_map_mutex);
  
 -      if (static_key_false(&xps_rxqs_needed)) {
 -              dev_maps = xmap_dereference(dev->xps_rxqs_map);
 -              if (dev_maps) {
 -                      nr_ids = dev->num_rx_queues;
 -                      clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 -                                     offset, count, true);
 -              }
 -      }
 -
 -      dev_maps = xmap_dereference(dev->xps_cpus_map);
 -      if (!dev_maps)
 -              goto out_no_maps;
 +      if (static_key_false(&xps_rxqs_needed))
 +              clean_xps_maps(dev, XPS_RXQS, offset, count);
  
 -      if (num_possible_cpus() > 1)
 -              possible_mask = cpumask_bits(cpu_possible_mask);
 -      nr_ids = nr_cpu_ids;
 -      clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 -                     false);
 +      clean_xps_maps(dev, XPS_CPUS, offset, count);
  
 -out_no_maps:
        mutex_unlock(&xps_map_mutex);
        cpus_read_unlock();
  }
@@@ -2654,35 -2640,16 +2666,35 @@@ static struct xps_map *expand_xps_map(s
        return new_map;
  }
  
 +/* Copy xps maps at a given index */
 +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 +                            struct xps_dev_maps *new_dev_maps, int index,
 +                            int tc, bool skip_tc)
 +{
 +      int i, tci = index * dev_maps->num_tc;
 +      struct xps_map *map;
 +
 +      /* copy maps belonging to foreign traffic classes */
 +      for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 +              if (i == tc && skip_tc)
 +                      continue;
 +
 +              /* fill in the new device map from the old device map */
 +              map = xmap_dereference(dev_maps->attr_map[tci]);
 +              RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 +      }
 +}
 +
  /* Must be called under cpus_read_lock */
  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 -                        u16 index, bool is_rxqs_map)
 +                        u16 index, enum xps_map_type type)
  {
 -      const unsigned long *online_mask = NULL, *possible_mask = NULL;
 -      struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 +      struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
 +      const unsigned long *online_mask = NULL;
 +      bool active = false, copy = false;
        int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
 -      bool active = false;
        unsigned int nr_ids;
  
        if (dev->num_tc) {
        }
  
        mutex_lock(&xps_map_mutex);
 -      if (is_rxqs_map) {
 +
 +      dev_maps = xmap_dereference(dev->xps_maps[type]);
 +      if (type == XPS_RXQS) {
                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 -              dev_maps = xmap_dereference(dev->xps_rxqs_map);
                nr_ids = dev->num_rx_queues;
        } else {
                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 -              if (num_possible_cpus() > 1) {
 +              if (num_possible_cpus() > 1)
                        online_mask = cpumask_bits(cpu_online_mask);
 -                      possible_mask = cpumask_bits(cpu_possible_mask);
 -              }
 -              dev_maps = xmap_dereference(dev->xps_cpus_map);
                nr_ids = nr_cpu_ids;
        }
  
        if (maps_sz < L1_CACHE_BYTES)
                maps_sz = L1_CACHE_BYTES;
  
 +      /* The old dev_maps could be larger or smaller than the one we're
 +       * setting up now, as dev->num_tc or nr_ids could have been updated in
 +       * between. We could try to be smart, but let's be safe instead and only
 +       * copy foreign traffic classes if the two map sizes match.
 +       */
 +      if (dev_maps &&
 +          dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
 +              copy = true;
 +
        /* allocate memory for queue storage */
        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
             j < nr_ids;) {
 -              if (!new_dev_maps)
 -                      new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
 -                      mutex_unlock(&xps_map_mutex);
 -                      return -ENOMEM;
 +                      new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 +                      if (!new_dev_maps) {
 +                              mutex_unlock(&xps_map_mutex);
 +                              return -ENOMEM;
 +                      }
 +
 +                      new_dev_maps->nr_ids = nr_ids;
 +                      new_dev_maps->num_tc = num_tc;
                }
  
                tci = j * num_tc + tc;
 -              map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 -                               NULL;
 +              map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
  
 -              map = expand_xps_map(map, j, index, is_rxqs_map);
 +              map = expand_xps_map(map, j, index, type == XPS_RXQS);
                if (!map)
                        goto error;
  
        if (!dev_maps) {
                /* Increment static keys at most once per type */
                static_key_slow_inc_cpuslocked(&xps_needed);
 -              if (is_rxqs_map)
 +              if (type == XPS_RXQS)
                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
        }
  
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              /* copy maps belonging to foreign traffic classes */
 -              for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 -              }
 +      for (j = 0; j < nr_ids; j++) {
 +              bool skip_tc = false;
  
 -              /* We need to explicitly update tci as prevous loop
 -               * could break out early if dev_maps is NULL.
 -               */
                tci = j * num_tc + tc;
 -
                if (netif_attr_test_mask(j, mask, nr_ids) &&
                    netif_attr_test_online(j, online_mask, nr_ids)) {
                        /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;
  
 +                      skip_tc = true;
 +
                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;
                        if (pos == map->len)
                                map->queues[map->len++] = index;
  #ifdef CONFIG_NUMA
 -                      if (!is_rxqs_map) {
 +                      if (type == XPS_CPUS) {
                                if (numa_node_id == -2)
                                        numa_node_id = cpu_to_node(j);
                                else if (numa_node_id != cpu_to_node(j))
                                        numa_node_id = -1;
                        }
  #endif
 -              } else if (dev_maps) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
                }
  
 -              /* copy maps belonging to foreign traffic classes */
 -              for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 -              }
 +              if (copy)
 +                      xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
 +                                        skip_tc);
        }
  
 -      if (is_rxqs_map)
 -              rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 -      else
 -              rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 +      rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
  
        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;
  
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              for (i = num_tc, tci = j * num_tc; i--; tci++) {
 -                      new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 +      for (j = 0; j < dev_maps->nr_ids; j++) {
 +              for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
                        map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      if (map && map != new_map)
 -                              kfree_rcu(map, rcu);
 +                      if (!map)
 +                              continue;
 +
 +                      if (copy) {
 +                              new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 +                              if (map == new_map)
 +                                      continue;
 +                      }
 +
 +                      RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 +                      kfree_rcu(map, rcu);
                }
        }
  
 -      kfree_rcu(dev_maps, rcu);
 +      old_dev_maps = dev_maps;
  
  out_no_old_maps:
        dev_maps = new_dev_maps;
        active = true;
  
  out_no_new_maps:
 -      if (!is_rxqs_map) {
 +      if (type == XPS_CPUS)
                /* update Tx queue numa node */
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                             (numa_node_id >= 0) ?
                                             numa_node_id : NUMA_NO_NODE);
 -      }
  
        if (!dev_maps)
                goto out_no_maps;
  
        /* removes tx-queue from unused CPUs/rx-queues */
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              for (i = tc, tci = j * num_tc; i--; tci++)
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 -              if (!netif_attr_test_mask(j, mask, nr_ids) ||
 -                  !netif_attr_test_online(j, online_mask, nr_ids))
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 -              for (i = num_tc - tc, tci++; --i; tci++)
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 +      for (j = 0; j < dev_maps->nr_ids; j++) {
 +              tci = j * dev_maps->num_tc;
 +
 +              for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 +                      if (i == tc &&
 +                          netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
 +                          netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
 +                              continue;
 +
 +                      active |= remove_xps_queue(dev_maps,
 +                                                 copy ? old_dev_maps : NULL,
 +                                                 tci, index);
 +              }
        }
  
 +      if (old_dev_maps)
 +              kfree_rcu(old_dev_maps, rcu);
 +
        /* free map if not active */
        if (!active)
 -              reset_xps_maps(dev, dev_maps, is_rxqs_map);
 +              reset_xps_maps(dev, dev_maps, type);
  
  out_no_maps:
        mutex_unlock(&xps_map_mutex);
        return 0;
  error:
        /* remove any maps that we added */
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 +      for (j = 0; j < nr_ids; j++) {
                for (i = num_tc, tci = j * num_tc; i--; tci++) {
                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 -                      map = dev_maps ?
 +                      map = copy ?
                              xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        if (new_map && new_map != map)
@@@ -2882,7 -2845,7 +2894,7 @@@ int netif_set_xps_queue(struct net_devi
        int ret;
  
        cpus_read_lock();
 -      ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 +      ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
        cpus_read_unlock();
  
        return ret;
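[Editor's note] For reference, the driver-facing entry point is unchanged by the XPS rework; a hedged sketch (queue count and CPU mapping are made up) of a driver pinning each TX queue to one CPU through the wrapper above, which now lands in dev->xps_maps[XPS_CPUS]:

	/* Illustrative only; not part of this patch. */
	static void example_setup_xps(struct net_device *dev)
	{
		int qid;

		for (qid = 0; qid < dev->real_num_tx_queues; qid++)
			netif_set_xps_queue(dev, cpumask_of(qid % num_online_cpus()),
					    qid);
	}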
@@@ -3993,15 -3956,13 +4005,15 @@@ sch_handle_egress(struct sk_buff *skb, 
  static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
                               struct xps_dev_maps *dev_maps, unsigned int tci)
  {
 +      int tc = netdev_get_prio_tc_map(dev, skb->priority);
        struct xps_map *map;
        int queue_index = -1;
  
 -      if (dev->num_tc) {
 -              tci *= dev->num_tc;
 -              tci += netdev_get_prio_tc_map(dev, skb->priority);
 -      }
 +      if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 +              return queue_index;
 +
 +      tci *= dev_maps->num_tc;
 +      tci += tc;
  
        map = rcu_dereference(dev_maps->attr_map[tci]);
        if (map) {
@@@ -4032,18 -3993,18 +4044,18 @@@ static int get_xps_queue(struct net_dev
        if (!static_key_false(&xps_rxqs_needed))
                goto get_cpus_map;
  
 -      dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 +      dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
        if (dev_maps) {
                int tci = sk_rx_queue_get(sk);
  
 -              if (tci >= 0 && tci < dev->num_rx_queues)
 +              if (tci >= 0)
                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
                                                          tci);
        }
  
  get_cpus_map:
        if (queue_index < 0) {
 -              dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 +              dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
                if (dev_maps) {
                        unsigned int tci = skb->sender_cpu - 1;
  
@@@ -4345,6 -4306,13 +4357,13 @@@ static inline void ____napi_schedule(st
                 */
                thread = READ_ONCE(napi->thread);
                if (thread) {
+                       /* Avoid doing set_bit() if the thread is in
+                        * INTERRUPTIBLE state, because napi_thread_wait()
+                        * makes sure to proceed with napi polling
+                        * if the thread is explicitly woken from here.
+                        */
+                       if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+                               set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(thread);
                        return;
                }
@@@ -5316,7 -5284,6 +5335,7 @@@ skip_classify
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
 +                      break;
                case RX_HANDLER_PASS:
                        break;
                default:
@@@ -5909,13 -5876,15 +5928,13 @@@ void napi_gro_flush(struct napi_struct 
  }
  EXPORT_SYMBOL(napi_gro_flush);
  
 -static struct list_head *gro_list_prepare(struct napi_struct *napi,
 -                                        struct sk_buff *skb)
 +static void gro_list_prepare(const struct list_head *head,
 +                           const struct sk_buff *skb)
  {
        unsigned int maclen = skb->dev->hard_header_len;
        u32 hash = skb_get_hash_raw(skb);
 -      struct list_head *head;
        struct sk_buff *p;
  
 -      head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
        list_for_each_entry(p, head, list) {
                unsigned long diffs;
  
                                       maclen);
                NAPI_GRO_CB(p)->same_flow = !diffs;
        }
 -
 -      return head;
  }
  
  static void skb_gro_reset_offset(struct sk_buff *skb)
@@@ -6003,11 -5974,11 +6022,11 @@@ static void gro_flush_oldest(struct nap
  
  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  {
 -      u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 +      u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 +      struct gro_list *gro_list = &napi->gro_hash[bucket];
        struct list_head *head = &offload_base;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
 -      struct list_head *gro_head;
        struct sk_buff *pp = NULL;
        enum gro_result ret;
        int same_flow;
        if (netif_elide_gro(skb->dev))
                goto normal;
  
 -      gro_head = gro_list_prepare(napi, skb);
 +      gro_list_prepare(&gro_list->list, skb);
  
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
  
                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
                                        ipv6_gro_receive, inet_gro_receive,
 -                                      gro_head, skb);
 +                                      &gro_list->list, skb);
                break;
        }
        rcu_read_unlock();
        if (pp) {
                skb_list_del_init(pp);
                napi_gro_complete(napi, pp);
 -              napi->gro_hash[hash].count--;
 +              gro_list->count--;
        }
  
        if (same_flow)
        if (NAPI_GRO_CB(skb)->flush)
                goto normal;
  
 -      if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 -              gro_flush_oldest(napi, gro_head);
 -      } else {
 -              napi->gro_hash[hash].count++;
 -      }
 +      if (unlikely(gro_list->count >= MAX_GRO_SKBS))
 +              gro_flush_oldest(napi, &gro_list->list);
 +      else
 +              gro_list->count++;
 +
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        NAPI_GRO_CB(skb)->last = skb;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 -      list_add(&skb->list, gro_head);
 +      list_add(&skb->list, &gro_list->list);
        ret = GRO_HELD;
  
  pull:
        if (grow > 0)
                gro_pull_from_frag0(skb, grow);
  ok:
 -      if (napi->gro_hash[hash].count) {
 -              if (!test_bit(hash, &napi->gro_bitmask))
 -                      __set_bit(hash, &napi->gro_bitmask);
 -      } else if (test_bit(hash, &napi->gro_bitmask)) {
 -              __clear_bit(hash, &napi->gro_bitmask);
 +      if (gro_list->count) {
 +              if (!test_bit(bucket, &napi->gro_bitmask))
 +                      __set_bit(bucket, &napi->gro_bitmask);
 +      } else if (test_bit(bucket, &napi->gro_bitmask)) {
 +              __clear_bit(bucket, &napi->gro_bitmask);
        }
  
        return ret;
@@@ -6534,6 -6505,7 +6553,7 @@@ bool napi_complete_done(struct napi_str
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
  
                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+                             NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);
  
                /* If STATE_MISSED was set, leave STATE_SCHED set,
@@@ -6817,7 -6789,6 +6837,7 @@@ int dev_set_threaded(struct net_device 
  
        return err;
  }
 +EXPORT_SYMBOL(dev_set_threaded);
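[Editor's note] With dev_set_threaded() now exported, a driver can opt its NAPI instances into kthread polling; a hedged sketch of a driver doing so at open time (the surrounding ndo_open-style function is hypothetical):

	/* Illustrative only; not part of this patch. */
	static int example_open(struct net_device *dev)
	{
		int err;

		/* run this device's NAPI polling in kthreads instead of softirq */
		err = dev_set_threaded(dev, true);
		if (err)
			netdev_warn(dev, "could not enable threaded NAPI: %d\n", err);

		return 0;
	}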
  
  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
@@@ -7017,16 -6988,25 +7037,25 @@@ static int napi_poll(struct napi_struc
  
  static int napi_thread_wait(struct napi_struct *napi)
  {
+       bool woken = false;
        set_current_state(TASK_INTERRUPTIBLE);
  
        while (!kthread_should_stop() && !napi_disable_pending(napi)) {
-               if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+               /* Testing SCHED_THREADED bit here to make sure the current
+                * kthread owns this napi and could poll on this napi.
+                * Testing SCHED bit is not enough because SCHED bit might be
+                * set by some other busy poll thread or by napi_disable().
+                */
+               if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }
  
                schedule();
+               /* woken being true indicates this thread owns this napi. */
+               woken = true;
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
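
The loop above only starts polling once this kthread actually owns the NAPI instance: either NAPI_STATE_SCHED_THREADED is set, or the thread has already been woken once for it. The test restated as a helper (hypothetical name, a sketch only):

        static bool napi_kthread_may_poll(const struct napi_struct *napi, bool woken)
        {
                /* A bare SCHED bit may belong to a busy-poll thread or to
                 * napi_disable(); SCHED_THREADED, or a prior wakeup of this
                 * thread, is what transfers ownership here.
                 */
                return woken || test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
        }
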
@@@ -10356,20 -10336,14 +10385,20 @@@ EXPORT_SYMBOL(register_netdev)
  
  int netdev_refcnt_read(const struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        int i, refcnt = 0;
  
        for_each_possible_cpu(i)
                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
        return refcnt;
 +#else
 +      return refcount_read(&dev->dev_refcnt);
 +#endif
  }
  EXPORT_SYMBOL(netdev_refcnt_read);
  
 +int netdev_unregister_timeout_secs __read_mostly = 10;
 +
  #define WAIT_REFS_MIN_MSECS 1
  #define WAIT_REFS_MAX_MSECS 250
  /**
@@@ -10394,7 -10368,7 +10423,7 @@@ static void netdev_wait_allrefs(struct 
        rebroadcast_time = warning_time = jiffies;
        refcnt = netdev_refcnt_read(dev);
  
 -      while (refcnt != 0) {
 +      while (refcnt != 1) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();
  
  
                refcnt = netdev_refcnt_read(dev);
  
 -              if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
 +              if (refcnt &&
 +                  time_after(jiffies, warning_time +
 +                             netdev_unregister_timeout_secs * HZ)) {
                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
                                 dev->name, refcnt);
                        warning_time = jiffies;
@@@ -10509,7 -10481,7 +10538,7 @@@ void netdev_run_todo(void
                netdev_wait_allrefs(dev);
  
                /* paranoia */
 -              BUG_ON(netdev_refcnt_read(dev));
 +              BUG_ON(netdev_refcnt_read(dev) != 1);
                BUG_ON(!list_empty(&dev->ptype_all));
                BUG_ON(!list_empty(&dev->ptype_specific));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
@@@ -10726,14 -10698,9 +10755,14 @@@ struct net_device *alloc_netdev_mqs(in
        dev = PTR_ALIGN(p, NETDEV_ALIGN);
        dev->padded = (char *)dev - (char *)p;
  
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
 +      dev_hold(dev);
 +#else
 +      refcount_set(&dev->dev_refcnt, 1);
 +#endif
  
        if (dev_addr_init(dev))
                goto free_pcpu;
@@@ -10797,10 -10764,8 +10826,10 @@@ free_all
        return NULL;
  
  free_pcpu:
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
  free_dev:
 +#endif
        netdev_freemem(dev);
        return NULL;
  }
@@@ -10842,10 -10807,8 +10871,10 @@@ void free_netdev(struct net_device *dev
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);
  
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
 +#endif
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;
  
@@@ -11412,7 -11375,7 +11441,7 @@@ static void __net_exit default_device_e
                        continue;
  
                /* Leave virtual devices for the generic cleanup */
-               if (dev->rtnl_link_ops)
+               if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
                        continue;
  
                /* Push remaining network devices to init_net */
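
With CONFIG_PCPU_DEV_REFCNT the device keeps its per-cpu reference counter and now takes one reference on itself at allocation time; without it a plain refcount_t starts at 1. Either way, 1 rather than 0 is the "no external users" baseline that netdev_wait_allrefs() and netdev_run_todo() check above. A consolidated sketch of the two read paths (mirrors the hunks above, names illustrative):

        static int netdev_refcnt_read_sketch(const struct net_device *dev)
        {
        #ifdef CONFIG_PCPU_DEV_REFCNT
                int cpu, refcnt = 0;

                for_each_possible_cpu(cpu)
                        refcnt += *per_cpu_ptr(dev->pcpu_refcnt, cpu);
                return refcnt;          /* includes the device's self-hold */
        #else
                return refcount_read(&dev->dev_refcnt);
        #endif
        }
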
diff --combined net/core/drop_monitor.c
@@@ -1053,6 -1053,20 +1053,20 @@@ static int net_dm_hw_monitor_start(stru
        return 0;
  
  err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+               struct sk_buff *skb;
+               del_timer_sync(&hw_data->send_timer);
+               cancel_work_sync(&hw_data->dm_alert_work);
+               while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+                       struct devlink_trap_metadata *hw_metadata;
+                       hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+                       net_dm_hw_metadata_free(hw_metadata);
+                       consume_skb(skb);
+               }
+       }
        module_put(THIS_MODULE);
        return rc;
  }
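
Both error paths in this file now unwind the per-CPU state they had already armed before dropping the module reference. The common teardown step as a hedged sketch (hypothetical helper; the hardware path additionally frees the devlink trap metadata attached to each queued skb):

        static void net_dm_cpu_teardown_sketch(struct per_cpu_dm_data *data)
        {
                struct sk_buff *skb;

                del_timer_sync(&data->send_timer);
                cancel_work_sync(&data->dm_alert_work);
                while ((skb = __skb_dequeue(&data->drop_queue)))
                        consume_skb(skb);
        }
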
@@@ -1134,6 -1148,15 +1148,15 @@@ static int net_dm_trace_on_set(struct n
  err_unregister_trace:
        unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
  err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+               struct sk_buff *skb;
+               del_timer_sync(&data->send_timer);
+               cancel_work_sync(&data->dm_alert_work);
+               while ((skb = __skb_dequeue(&data->drop_queue)))
+                       consume_skb(skb);
+       }
        module_put(THIS_MODULE);
        return rc;
  }
@@@ -1731,7 -1754,7 +1754,7 @@@ static void exit_net_drop_monitor(void
  
        /*
         * Because of the module_get/put we do in the trace state change path
 -       * we are guarnateed not to have any current users when we get here
 +       * we are guaranteed not to have any current users when we get here
         */
  
        for_each_possible_cpu(cpu) {
diff --combined net/core/filter.c
@@@ -1863,7 -1863,10 +1863,7 @@@ static const struct bpf_func_proto bpf_
  static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
  {
 -      int err = __bpf_try_make_writable(skb, write_len);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return err;
 +      return __bpf_try_make_writable(skb, write_len);
  }
  
  BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@@ -3409,7 -3412,6 +3409,7 @@@ static u32 bpf_skb_net_base_len(const s
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
 +                                       BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))
  
@@@ -3446,10 -3448,6 +3446,10 @@@ static int bpf_skb_net_grow(struct sk_b
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;
  
 +              if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
 +                  inner_mac_len < ETH_HLEN)
 +                      return -EINVAL;
 +
                if (skb->encapsulation)
                        return -EALREADY;
  
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;
 -              skb_set_inner_protocol(skb, skb->protocol);
 +
 +              if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
 +                      skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 +              else
 +                      skb_set_inner_protocol(skb, skb->protocol);
  
                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);
@@@ -3583,6 -3577,7 +3583,6 @@@ BPF_CALL_4(sk_skb_adjust_room, struct s
                        return -ENOMEM;
                __skb_pull(skb, len_diff_abs);
        }
 -      bpf_compute_data_end_sk_skb(skb);
        if (tls_sw_has_ctx_rx(skb->sk)) {
                struct strp_msg *rxm = strp_msg(skb);
  
@@@ -3747,7 -3742,10 +3747,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
  {
 -      int ret = __bpf_skb_change_tail(skb, new_len, flags);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return ret;
 +      return __bpf_skb_change_tail(skb, new_len, flags);
  }
  
  static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@@ -3810,7 -3808,10 +3810,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
  {
 -      int ret = __bpf_skb_change_head(skb, head_room, flags);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return ret;
 +      return __bpf_skb_change_head(skb, head_room, flags);
  }
  
  static const struct bpf_func_proto sk_skb_change_head_proto = {
@@@ -3918,6 -3919,23 +3918,6 @@@ static const struct bpf_func_proto bpf_
        .arg2_type      = ARG_ANYTHING,
  };
  
 -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 -                          struct bpf_map *map, struct xdp_buff *xdp)
 -{
 -      switch (map->map_type) {
 -      case BPF_MAP_TYPE_DEVMAP:
 -      case BPF_MAP_TYPE_DEVMAP_HASH:
 -              return dev_map_enqueue(fwd, xdp, dev_rx);
 -      case BPF_MAP_TYPE_CPUMAP:
 -              return cpu_map_enqueue(fwd, xdp, dev_rx);
 -      case BPF_MAP_TYPE_XSKMAP:
 -              return __xsk_map_redirect(fwd, xdp);
 -      default:
 -              return -EBADRQC;
 -      }
 -      return 0;
 -}
 -
  void xdp_do_flush(void)
  {
        __dev_flush();
  }
  EXPORT_SYMBOL_GPL(xdp_do_flush);
  
 -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 -{
 -      switch (map->map_type) {
 -      case BPF_MAP_TYPE_DEVMAP:
 -              return __dev_map_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_DEVMAP_HASH:
 -              return __dev_map_hash_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_CPUMAP:
 -              return __cpu_map_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_XSKMAP:
 -              return __xsk_map_lookup_elem(map, index);
 -      default:
 -              return NULL;
 -      }
 -}
 -
 -void bpf_clear_redirect_map(struct bpf_map *map)
 -{
 -      struct bpf_redirect_info *ri;
 -      int cpu;
 -
 -      for_each_possible_cpu(cpu) {
 -              ri = per_cpu_ptr(&bpf_redirect_info, cpu);
 -              /* Avoid polluting remote cacheline due to writes if
 -               * not needed. Once we pass this test, we need the
 -               * cmpxchg() to make sure it hasn't been changed in
 -               * the meantime by remote CPU.
 -               */
 -              if (unlikely(READ_ONCE(ri->map) == map))
 -                      cmpxchg(&ri->map, map, NULL);
 -      }
 -}
 -
  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      struct bpf_map *map = READ_ONCE(ri->map);
 -      u32 index = ri->tgt_index;
 +      enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
 +      u32 map_id = ri->map_id;
        int err;
  
 -      ri->tgt_index = 0;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 +      ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
 -      if (unlikely(!map)) {
 -              fwd = dev_get_by_index_rcu(dev_net(dev), index);
 -              if (unlikely(!fwd)) {
 -                      err = -EINVAL;
 -                      goto err;
 +      switch (map_type) {
 +      case BPF_MAP_TYPE_DEVMAP:
 +              fallthrough;
 +      case BPF_MAP_TYPE_DEVMAP_HASH:
 +              err = dev_map_enqueue(fwd, xdp, dev);
 +              break;
 +      case BPF_MAP_TYPE_CPUMAP:
 +              err = cpu_map_enqueue(fwd, xdp, dev);
 +              break;
 +      case BPF_MAP_TYPE_XSKMAP:
 +              err = __xsk_map_redirect(fwd, xdp);
 +              break;
 +      case BPF_MAP_TYPE_UNSPEC:
 +              if (map_id == INT_MAX) {
 +                      fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
 +                      if (unlikely(!fwd)) {
 +                              err = -EINVAL;
 +                              break;
 +                      }
 +                      err = dev_xdp_enqueue(fwd, xdp, dev);
 +                      break;
                }
 -
 -              err = dev_xdp_enqueue(fwd, xdp, dev);
 -      } else {
 -              err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
 +              fallthrough;
 +      default:
 +              err = -EBADRQC;
        }
  
        if (unlikely(err))
                goto err;
  
 -      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
 +      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
  err:
 -      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
 +      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
  }
  EXPORT_SYMBOL_GPL(xdp_do_redirect);
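
After this rework the per-CPU bpf_redirect_info carries a map type and map id instead of a map pointer, and an ifindex-only redirect is encoded as BPF_MAP_TYPE_UNSPEC with map_id == INT_MAX. Nothing changes on the BPF side; a minimal usage sketch, assuming the usual libbpf headers and a placeholder egress ifindex:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        #define IFINDEX_OUT 2   /* placeholder egress ifindex */

        SEC("xdp")
        int xdp_redirect_ifindex(struct xdp_md *ctx)
        {
                /* Taken through the BPF_MAP_TYPE_UNSPEC/INT_MAX path above. */
                return bpf_redirect(IFINDEX_OUT, 0);
        }

        char LICENSE[] SEC("license") = "GPL";
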
@@@ -3980,36 -4017,41 +3980,36 @@@ static int xdp_do_generic_redirect_map(
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       struct bpf_prog *xdp_prog,
 -                                     struct bpf_map *map)
 +                                     void *fwd,
 +                                     enum bpf_map_type map_type, u32 map_id)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      u32 index = ri->tgt_index;
 -      void *fwd = ri->tgt_value;
 -      int err = 0;
 -
 -      ri->tgt_index = 0;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 -
 -      if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
 -          map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 -              struct bpf_dtab_netdev *dst = fwd;
 +      int err;
  
 -              err = dev_map_generic_redirect(dst, skb, xdp_prog);
 +      switch (map_type) {
 +      case BPF_MAP_TYPE_DEVMAP:
 +              fallthrough;
 +      case BPF_MAP_TYPE_DEVMAP_HASH:
 +              err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                if (unlikely(err))
                        goto err;
 -      } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 -              struct xdp_sock *xs = fwd;
 -
 -              err = xsk_generic_rcv(xs, xdp);
 +              break;
 +      case BPF_MAP_TYPE_XSKMAP:
 +              err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                consume_skb(skb);
 -      } else {
 +              break;
 +      default:
                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
                err = -EBADRQC;
                goto err;
        }
  
 -      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
 +      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
  err:
 -      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
 +      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
  }
  
@@@ -4017,34 -4059,31 +4017,34 @@@ int xdp_do_generic_redirect(struct net_
                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      struct bpf_map *map = READ_ONCE(ri->map);
 -      u32 index = ri->tgt_index;
 -      struct net_device *fwd;
 -      int err = 0;
 -
 -      if (map)
 -              return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
 -                                                 map);
 -      ri->tgt_index = 0;
 -      fwd = dev_get_by_index_rcu(dev_net(dev), index);
 -      if (unlikely(!fwd)) {
 -              err = -EINVAL;
 -              goto err;
 -      }
 +      enum bpf_map_type map_type = ri->map_type;
 +      void *fwd = ri->tgt_value;
 +      u32 map_id = ri->map_id;
 +      int err;
  
 -      err = xdp_ok_fwd_dev(fwd, skb->len);
 -      if (unlikely(err))
 -              goto err;
 +      ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
 -      skb->dev = fwd;
 -      _trace_xdp_redirect(dev, xdp_prog, index);
 -      generic_xdp_tx(skb, xdp_prog);
 -      return 0;
 +      if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
 +              fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
 +              if (unlikely(!fwd)) {
 +                      err = -EINVAL;
 +                      goto err;
 +              }
 +
 +              err = xdp_ok_fwd_dev(fwd, skb->len);
 +              if (unlikely(err))
 +                      goto err;
 +
 +              skb->dev = fwd;
 +              _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
 +              generic_xdp_tx(skb, xdp_prog);
 +              return 0;
 +      }
 +
 +      return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
  err:
 -      _trace_xdp_redirect_err(dev, xdp_prog, index, err);
 +      _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
  }
  
@@@ -4055,12 -4094,10 +4055,12 @@@ BPF_CALL_2(bpf_xdp_redirect, u32, ifind
        if (unlikely(flags))
                return XDP_ABORTED;
  
 -      ri->flags = flags;
 +      /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
 +       * by map_idr) is used for ifindex based XDP redirect.
 +       */
        ri->tgt_index = ifindex;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 +      ri->map_id = INT_MAX;
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
        return XDP_REDIRECT;
  }
@@@ -4076,7 -4113,28 +4076,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
           u64, flags)
  {
 -      struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -
 -      /* Lower bits of the flags are used as return code on lookup failure */
 -      if (unlikely(flags > XDP_TX))
 -              return XDP_ABORTED;
 -
 -      ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
 -      if (unlikely(!ri->tgt_value)) {
 -              /* If the lookup fails we want to clear out the state in the
 -               * redirect_info struct completely, so that if an eBPF program
 -               * performs multiple lookups, the last one always takes
 -               * precedence.
 -               */
 -              WRITE_ONCE(ri->map, NULL);
 -              return flags;
 -      }
 -
 -      ri->flags = flags;
 -      ri->tgt_index = ifindex;
 -      WRITE_ONCE(ri->map, map);
 -
 -      return XDP_REDIRECT;
 +      return map->ops->map_redirect(map, ifindex, flags);
  }
  
  static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@@ -5600,7 -5658,7 +5600,7 @@@ BPF_CALL_5(bpf_skb_check_mtu, struct sk
        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;
  
-       if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+       if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
                return -EINVAL;
  
        dev = __dev_via_ifindex(dev, ifindex);
        mtu = READ_ONCE(dev->mtu);
  
        dev_len = mtu + dev->hard_header_len;
-       skb_len = skb->len + len_diff; /* minus result pass check */
+       /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+       skb_len += len_diff; /* minus result pass check */
        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
                goto out;
@@@ -5655,6 -5717,10 +5659,10 @@@ BPF_CALL_5(bpf_xdp_check_mtu, struct xd
        /* Add L2-header as dev MTU is L3 size */
        dev_len = mtu + dev->hard_header_len;
  
+       /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       if (*mtu_len)
+               xdp_len = *mtu_len + dev->hard_header_len;
        xdp_len += len_diff; /* minus result pass check */
        if (xdp_len > dev_len)
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
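
With these two hunks a non-zero *mtu_len is taken as the L3 length to validate, analogous to iph->tot_len in bpf_fib_lookup, and (for the skb flavour above) may no longer be combined with BPF_MTU_CHK_SEGS. A usage sketch under those assumptions, with a hypothetical candidate length:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        SEC("xdp")
        int check_candidate_len(struct xdp_md *ctx)
        {
                __u32 mtu_len = 1500;   /* hypothetical candidate L3 length */

                /* ifindex 0 means "the device this packet arrived on". */
                if (bpf_check_mtu(ctx, 0, &mtu_len, 0, 0) != 0)
                        return XDP_DROP;        /* candidate length exceeds the MTU */
                return XDP_PASS;
        }

        char LICENSE[] SEC("license") = "GPL";
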
@@@ -9597,40 -9663,22 +9605,40 @@@ static u32 sock_ops_convert_ctx_access(
        return insn - insn_buf;
  }
  
 +/* data_end = skb->data + skb_headlen() */
 +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
 +                                                  struct bpf_insn *insn)
 +{
 +      /* si->dst_reg = skb->data */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
 +                            si->dst_reg, si->src_reg,
 +                            offsetof(struct sk_buff, data));
 +      /* AX = skb->len */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
 +                            BPF_REG_AX, si->src_reg,
 +                            offsetof(struct sk_buff, len));
 +      /* si->dst_reg = skb->data + skb->len */
 +      *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
 +      /* AX = skb->data_len */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
 +                            BPF_REG_AX, si->src_reg,
 +                            offsetof(struct sk_buff, data_len));
 +      /* si->dst_reg = skb->data + skb->len - skb->data_len */
 +      *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
 +
 +      return insn;
 +}
 +
  static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
  {
        struct bpf_insn *insn = insn_buf;
 -      int off;
  
        switch (si->off) {
        case offsetof(struct __sk_buff, data_end):
 -              off  = si->off;
 -              off -= offsetof(struct __sk_buff, data_end);
 -              off += offsetof(struct sk_buff, cb);
 -              off += offsetof(struct tcp_skb_cb, bpf.data_end);
 -              *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
 -                                    si->src_reg, off);
 +              insn = bpf_convert_data_end_access(si, insn);
                break;
        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
@@@ -10409,7 -10457,6 +10417,7 @@@ static u32 sk_lookup_convert_ctx_access
  }
  
  const struct bpf_prog_ops sk_lookup_prog_ops = {
 +      .test_run = bpf_prog_test_run_sk_lookup,
  };
  
  const struct bpf_verifier_ops sk_lookup_verifier_ops = {
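
For sk_skb programs, data_end now points at the end of the skb's linear area instead of a value cached in the cb[] block, which is why the bpf_compute_data_end_sk_skb() calls disappear throughout this file. The instruction sequence built by bpf_convert_data_end_access() is equivalent to this C (a sketch, not emitted code):

        static void *sk_skb_data_end_sketch(const struct sk_buff *skb)
        {
                /* skb->len - skb->data_len == skb_headlen(skb) */
                return skb->data + skb->len - skb->data_len;
        }
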
diff --combined net/core/flow_dissector.c
@@@ -114,7 -114,7 +114,7 @@@ int flow_dissector_bpf_prog_attach_chec
   * is the protocol port offset returned from proto_ports_offset
   */
  __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 -                          void *data, int hlen)
 +                          const void *data, int hlen)
  {
        int poff = proto_ports_offset(ip_proto);
  
@@@ -161,7 -161,7 +161,7 @@@ static bool icmp_has_id(u8 type
   */
  void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
 -                         void *data, int thoff, int hlen)
 +                         const void *data, int thoff, int hlen)
  {
        struct icmphdr *ih, _ih;
  
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
-               key_icmp->id = ih->un.echo.id ? : 1;
+               key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
  }
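
ih->un.echo.id is big-endian on the wire, so the fixed line converts it before storing it into the host-order dissector key; a genuine id of zero is reported as 1 so it is not confused with types that carry no id at all. Written out, the fragment is equivalent to:

        u16 id = ntohs(ih->un.echo.id);         /* wire -> host order */

        key_icmp->id = id ? id : 1;             /* keep 0 reserved for "no id field" */
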
@@@ -187,8 -187,8 +187,8 @@@ EXPORT_SYMBOL(skb_flow_get_icmp_tci)
   */
  static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
 -                                  void *target_container,
 -                                  void *data, int thoff, int hlen)
 +                                  void *target_container, const void *data,
 +                                  int thoff, int hlen)
  {
        struct flow_dissector_key_icmp *key_icmp;
  
@@@ -409,8 -409,8 +409,8 @@@ EXPORT_SYMBOL(skb_flow_dissect_hash)
  static enum flow_dissect_ret
  __skb_flow_dissect_mpls(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, int nhoff, int hlen,
 -                      int lse_index, bool *entropy_label)
 +                      void *target_container, const void *data, int nhoff,
 +                      int hlen, int lse_index, bool *entropy_label)
  {
        struct mpls_label *hdr, _hdr;
        u32 entry, label, bos;
  static enum flow_dissect_ret
  __skb_flow_dissect_arp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data, int nhoff, int hlen)
 +                     void *target_container, const void *data,
 +                     int nhoff, int hlen)
  {
        struct flow_dissector_key_arp *key_arp;
        struct {
@@@ -524,7 -523,7 +524,7 @@@ static enum flow_dissect_re
  __skb_flow_dissect_gre(const struct sk_buff *skb,
                       struct flow_dissector_key_control *key_control,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data,
 +                     void *target_container, const void *data,
                       __be16 *p_proto, int *p_nhoff, int *p_hlen,
                       unsigned int flags)
  {
  static enum flow_dissect_ret
  __skb_flow_dissect_batadv(const struct sk_buff *skb,
                          struct flow_dissector_key_control *key_control,
 -                        void *data, __be16 *p_proto, int *p_nhoff, int hlen,
 -                        unsigned int flags)
 +                        const void *data, __be16 *p_proto, int *p_nhoff,
 +                        int hlen, unsigned int flags)
  {
        struct {
                struct batadv_unicast_packet batadv_unicast;
  static void
  __skb_flow_dissect_tcp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data, int thoff, int hlen)
 +                     void *target_container, const void *data,
 +                     int thoff, int hlen)
  {
        struct flow_dissector_key_tcp *key_tcp;
        struct tcphdr *th, _th;
  static void
  __skb_flow_dissect_ports(const struct sk_buff *skb,
                         struct flow_dissector *flow_dissector,
 -                       void *target_container, void *data, int nhoff,
 -                       u8 ip_proto, int hlen)
 +                       void *target_container, const void *data,
 +                       int nhoff, u8 ip_proto, int hlen)
  {
        enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
        struct flow_dissector_key_ports *key_ports;
  static void
  __skb_flow_dissect_ipv4(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, const struct iphdr *iph)
 +                      void *target_container, const void *data,
 +                      const struct iphdr *iph)
  {
        struct flow_dissector_key_ip *key_ip;
  
  static void
  __skb_flow_dissect_ipv6(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, const struct ipv6hdr *iph)
 +                      void *target_container, const void *data,
 +                      const struct ipv6hdr *iph)
  {
        struct flow_dissector_key_ip *key_ip;
  
@@@ -912,8 -908,9 +912,8 @@@ bool bpf_flow_dissect(struct bpf_prog *
  bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container,
 -                      void *data, __be16 proto, int nhoff, int hlen,
 -                      unsigned int flags)
 +                      void *target_container, const void *data,
 +                      __be16 proto, int nhoff, int hlen, unsigned int flags)
  {
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
@@@ -1645,7 -1642,7 +1645,7 @@@ __u32 skb_get_hash_perturb(const struc
  }
  EXPORT_SYMBOL(skb_get_hash_perturb);
  
 -u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 +u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen)
  {
        u32 poff = keys->control.thoff;
diff --combined net/ipv4/route.c
@@@ -21,7 -21,7 +21,7 @@@
   *            Alan Cox        :       Added BSD route gw semantics
   *            Alan Cox        :       Super /proc >4K
   *            Alan Cox        :       MTU in route table
 - *            Alan Cox        :       MSS actually. Also added the window
 + *            Alan Cox        :       MSS actually. Also added the window
   *                                    clamper.
   *            Sam Lantinga    :       Fixed route matching in rt_del()
   *            Alan Cox        :       Routing cache support.
@@@ -41,7 -41,7 +41,7 @@@
   *            Olaf Erb        :       irtt wasn't being copied right.
   *            Bjorn Ekwall    :       Kerneld route support.
   *            Alan Cox        :       Multicast fixed (I hope)
 - *            Pavel Krauz     :       Limited broadcast fixed
 + *            Pavel Krauz     :       Limited broadcast fixed
   *            Mike McLagan    :       Routing by source
   *    Alexey Kuznetsov        :       End of old history. Split to fib.c and
   *                                    route.c and rewritten from scratch.
@@@ -54,8 -54,8 +54,8 @@@
   *    Robert Olsson           :       Added rt_cache statistics
   *    Arnaldo C. Melo         :       Convert proc stuff to seq_file
   *    Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 - *    Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 - *    Ilia Sotnikov           :       Removed TOS from hash calculations
 + *    Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 + *    Ilia Sotnikov           :       Removed TOS from hash calculations
   */
  
  #define pr_fmt(fmt) "IPv4: " fmt
@@@ -66,7 -66,6 +66,7 @@@
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
 +#include <linux/memblock.h>
  #include <linux/string.h>
  #include <linux/socket.h>
  #include <linux/sockios.h>
@@@ -235,6 -234,19 +235,6 @@@ static const struct seq_operations rt_c
        .show   = rt_cache_seq_show,
  };
  
 -static int rt_cache_seq_open(struct inode *inode, struct file *file)
 -{
 -      return seq_open(file, &rt_cache_seq_ops);
 -}
 -
 -static const struct proc_ops rt_cache_proc_ops = {
 -      .proc_open      = rt_cache_seq_open,
 -      .proc_read      = seq_read,
 -      .proc_lseek     = seq_lseek,
 -      .proc_release   = seq_release,
 -};
 -
 -
  static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
  {
        int cpu;
@@@ -312,6 -324,19 +312,6 @@@ static const struct seq_operations rt_c
        .show   = rt_cpu_seq_show,
  };
  
 -
 -static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 -{
 -      return seq_open(file, &rt_cpu_seq_ops);
 -}
 -
 -static const struct proc_ops rt_cpu_proc_ops = {
 -      .proc_open      = rt_cpu_seq_open,
 -      .proc_read      = seq_read,
 -      .proc_lseek     = seq_lseek,
 -      .proc_release   = seq_release,
 -};
 -
  #ifdef CONFIG_IP_ROUTE_CLASSID
  static int rt_acct_proc_show(struct seq_file *m, void *v)
  {
@@@ -342,13 -367,13 +342,13 @@@ static int __net_init ip_rt_do_proc_ini
  {
        struct proc_dir_entry *pde;
  
 -      pde = proc_create("rt_cache", 0444, net->proc_net,
 -                        &rt_cache_proc_ops);
 +      pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 +                            &rt_cache_seq_ops);
        if (!pde)
                goto err1;
  
 -      pde = proc_create("rt_cache", 0444,
 -                        net->proc_net_stat, &rt_cpu_proc_ops);
 +      pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 +                            &rt_cpu_seq_ops);
        if (!pde)
                goto err2;
  
@@@ -453,10 -478,8 +453,10 @@@ static void ipv4_confirm_neigh(const st
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
  }
  
 -#define IP_IDENTS_SZ 2048u
 -
 +/* Hash tables of size 2048..262144 depending on RAM size.
 + * Each bucket uses 8 bytes.
 + */
 +static u32 ip_idents_mask __read_mostly;
  static atomic_t *ip_idents __read_mostly;
  static u32 *ip_tstamps __read_mostly;
  
   */
  u32 ip_idents_reserve(u32 hash, int segs)
  {
 -      u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 -      atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 -      u32 old = READ_ONCE(*p_tstamp);
 -      u32 now = (u32)jiffies;
 +      u32 bucket, old, now = (u32)jiffies;
 +      atomic_t *p_id;
 +      u32 *p_tstamp;
        u32 delta = 0;
  
 +      bucket = hash & ip_idents_mask;
 +      p_tstamp = ip_tstamps + bucket;
 +      p_id = ip_idents + bucket;
 +      old = READ_ONCE(*p_tstamp);
 +
        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);
  
@@@ -703,7 -722,6 +703,7 @@@ static void update_or_create_fnhe(struc
  
                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
 +
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
@@@ -1240,12 -1258,12 +1240,12 @@@ static int ip_rt_bug(struct net *net, s
  }
  
  /*
 -   We do not cache source address of outgoing interface,
 -   because it is used only by IP RR, TS and SRR options,
 -   so that it out of fast path.
 -
 -   BTW remember: "addr" is allowed to be not aligned
 -   in IP options!
 + * We do not cache source address of outgoing interface,
 + * because it is used only by IP RR, TS and SRR options,
 + * so that it out of fast path.
 + *
 + * BTW remember: "addr" is allowed to be not aligned
 + * in IP options!
   */
  
  void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
@@@ -2090,7 -2108,7 +2090,7 @@@ static int ip_route_input_slow(struct s
                goto out;
  
        /* Check for the most weird martians, which can be not detected
 -         by fib_lookup.
 +       * by fib_lookup.
         */
  
        tun_info = skb_tunnel_info(skb);
@@@ -2228,7 -2246,7 +2228,7 @@@ local_input
        if (res->type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
 -              rth->rt_flags   &= ~RTCF_LOCAL;
 +              rth->rt_flags   &= ~RTCF_LOCAL;
        }
  
        if (do_cache) {
@@@ -2299,15 -2317,15 +2299,15 @@@ int ip_route_input_rcu(struct sk_buff *
                       u8 tos, struct net_device *dev, struct fib_result *res)
  {
        /* Multicast recognition logic is moved from route cache to here.
 -         The problem was that too many Ethernet cards have broken/missing
 -         hardware multicast filters :-( As result the host on multicasting
 -         network acquires a lot of useless route cache entries, sort of
 -         SDR messages from all the world. Now we try to get rid of them.
 -         Really, provided software IP multicast filter is organized
 -         reasonably (at least, hashed), it does not result in a slowdown
 -         comparing with route cache reject entries.
 -         Note, that multicast routers are not affected, because
 -         route cache entry is created eventually.
 +       * The problem was that too many Ethernet cards have broken/missing
 +       * hardware multicast filters :-( As result the host on multicasting
 +       * network acquires a lot of useless route cache entries, sort of
 +       * SDR messages from all the world. Now we try to get rid of them.
 +       * Really, provided software IP multicast filter is organized
 +       * reasonably (at least, hashed), it does not result in a slowdown
 +       * comparing with route cache reject entries.
 +       * Note, that multicast routers are not affected, because
 +       * route cache entry is created eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);
@@@ -2519,11 -2537,11 +2519,11 @@@ struct rtable *ip_route_output_key_hash
                rth = ERR_PTR(-ENETUNREACH);
  
                /* I removed check for oif == dev_out->oif here.
 -                 It was wrong for two reasons:
 -                 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
 -                    is assigned to multiple interfaces.
 -                 2. Moreover, we are allowed to send packets with saddr
 -                    of another iface. --ANK
 +               * It was wrong for two reasons:
 +               * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
 +               *    is assigned to multiple interfaces.
 +               * 2. Moreover, we are allowed to send packets with saddr
 +               *    of another iface. --ANK
                 */
  
                if (fl4->flowi4_oif == 0 &&
                                goto out;
  
                        /* Special hack: user can direct multicasts
 -                         and limited broadcast via necessary interface
 -                         without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 -                         This hack is not just for fun, it allows
 -                         vic,vat and friends to work.
 -                         They bind socket to loopback, set ttl to zero
 -                         and expect that it will work.
 -                         From the viewpoint of routing cache they are broken,
 -                         because we are not allowed to build multicast path
 -                         with loopback source addr (look, routing cache
 -                         cannot know, that ttl is zero, so that packet
 -                         will not leave this host and route is valid).
 -                         Luckily, this hack is good workaround.
 +                       * and limited broadcast via necessary interface
 +                       * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 +                       * This hack is not just for fun, it allows
 +                       * vic,vat and friends to work.
 +                       * They bind socket to loopback, set ttl to zero
 +                       * and expect that it will work.
 +                       * From the viewpoint of routing cache they are broken,
 +                       * because we are not allowed to build multicast path
 +                       * with loopback source addr (look, routing cache
 +                       * cannot know, that ttl is zero, so that packet
 +                       * will not leave this host and route is valid).
 +                       * Luckily, this hack is good workaround.
                         */
  
                        fl4->flowi4_oif = dev_out->ifindex;
                    (ipv4_is_multicast(fl4->daddr) ||
                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
                        /* Apparently, routing tables are wrong. Assume,
 -                         that the destination is on link.
 -
 -                         WHY? DW.
 -                         Because we are allowed to send to iface
 -                         even if it has NO routes and NO assigned
 -                         addresses. When oif is specified, routing
 -                         tables are looked up with only one purpose:
 -                         to catch if destination is gatewayed, rather than
 -                         direct. Moreover, if MSG_DONTROUTE is set,
 -                         we send packet, ignoring both routing tables
 -                         and ifaddr state. --ANK
 -
 -
 -                         We could make it even if oif is unknown,
 -                         likely IPv6, but we do not.
 +                       * that the destination is on link.
 +                       *
 +                       * WHY? DW.
 +                       * Because we are allowed to send to iface
 +                       * even if it has NO routes and NO assigned
 +                       * addresses. When oif is specified, routing
 +                       * tables are looked up with only one purpose:
 +                       * to catch if destination is gatewayed, rather than
 +                       * direct. Moreover, if MSG_DONTROUTE is set,
 +                       * we send packet, ignoring both routing tables
 +                       * and ifaddr state. --ANK
 +                       *
 +                       *
 +                       * We could make it even if oif is unknown,
 +                       * likely IPv6, but we do not.
                         */
  
                        if (fl4->saddr == 0)
        return rth;
  }
  
- static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
- {
-       return NULL;
- }
- static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
- {
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-       return mtu ? : dst->dev->mtu;
- }
- static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                         struct sk_buff *skb, u32 mtu,
-                                         bool confirm_neigh)
- {
- }
- static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                      struct sk_buff *skb)
- {
- }
- static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
-                                         unsigned long old)
- {
-       return NULL;
- }
  static struct dst_ops ipv4_dst_blackhole_ops = {
-       .family                 =       AF_INET,
-       .check                  =       ipv4_blackhole_dst_check,
-       .mtu                    =       ipv4_blackhole_mtu,
-       .default_advmss         =       ipv4_default_advmss,
-       .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
-       .redirect               =       ipv4_rt_blackhole_redirect,
-       .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
-       .neigh_lookup           =       ipv4_neigh_lookup,
+       .family                 = AF_INET,
+       .default_advmss         = ipv4_default_advmss,
+       .neigh_lookup           = ipv4_neigh_lookup,
+       .check                  = dst_blackhole_check,
+       .cow_metrics            = dst_blackhole_cow_metrics,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
  };
  
  struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@@ -3564,25 -3553,18 +3535,25 @@@ struct ip_rt_acct __percpu *ip_rt_acct 
  
  int __init ip_rt_init(void)
  {
 +      void *idents_hash;
        int cpu;
  
 -      ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
 -                                GFP_KERNEL);
 -      if (!ip_idents)
 -              panic("IP: failed to allocate ip_idents\n");
 +      /* For modern hosts, this will use 2 MB of memory */
 +      idents_hash = alloc_large_system_hash("IP idents",
 +                                            sizeof(*ip_idents) + sizeof(*ip_tstamps),
 +                                            0,
 +                                            16, /* one bucket per 64 KB */
 +                                            HASH_ZERO,
 +                                            NULL,
 +                                            &ip_idents_mask,
 +                                            2048,
 +                                            256*1024);
 +
 +      ip_idents = idents_hash;
  
 -      prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
 +      prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
  
 -      ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
 -      if (!ip_tstamps)
 -              panic("IP: failed to allocate ip_tstamps\n");
 +      ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
  
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
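
ip_rt_init() above sizes the ident/timestamp table from available memory via alloc_large_system_hash() and keeps both arrays in a single allocation; ip_idents_mask (table size minus one, a power of two) replaces the old modulo by IP_IDENTS_SZ. A sketch of the resulting layout and bucket lookup (names illustrative):

        /* idents_hash:  [ atomic_t ip_idents[mask + 1] | u32 ip_tstamps[mask + 1] ] */
        static inline atomic_t *ip_ident_bucket_sketch(void *idents_hash, u32 hash, u32 mask)
        {
                atomic_t *ids = idents_hash;

                return ids + (hash & mask);
        }
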
diff --combined net/ipv6/route.c
@@@ -260,34 -260,16 +260,16 @@@ static struct dst_ops ip6_dst_ops_templ
        .confirm_neigh          =       ip6_confirm_neigh,
  };
  
- static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
- {
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-       return mtu ? : dst->dev->mtu;
- }
- static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                        struct sk_buff *skb, u32 mtu,
-                                        bool confirm_neigh)
- {
- }
- static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                     struct sk_buff *skb)
- {
- }
  static struct dst_ops ip6_dst_blackhole_ops = {
-       .family                 =       AF_INET6,
-       .destroy                =       ip6_dst_destroy,
-       .check                  =       ip6_dst_check,
-       .mtu                    =       ip6_blackhole_mtu,
-       .default_advmss         =       ip6_default_advmss,
-       .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
-       .redirect               =       ip6_rt_blackhole_redirect,
-       .cow_metrics            =       dst_cow_metrics_generic,
-       .neigh_lookup           =       ip6_dst_neigh_lookup,
+       .family                 = AF_INET6,
+       .default_advmss         = ip6_default_advmss,
+       .neigh_lookup           = ip6_dst_neigh_lookup,
+       .check                  = ip6_dst_check,
+       .destroy                = ip6_dst_destroy,
+       .cow_metrics            = dst_cow_metrics_generic,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
  };
  
  static const u32 ip6_template_metrics[RTAX_MAX] = {
@@@ -2378,7 -2360,7 +2360,7 @@@ u32 rt6_multipath_hash(const struct ne
  
                        memset(&hash_keys, 0, sizeof(hash_keys));
  
 -                        if (!flkeys) {
 +                      if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
@@@ -2518,20 -2500,20 +2500,20 @@@ struct dst_entry *ip6_route_output_flag
                                         struct flowi6 *fl6,
                                         int flags)
  {
 -        struct dst_entry *dst;
 -        struct rt6_info *rt6;
 +      struct dst_entry *dst;
 +      struct rt6_info *rt6;
  
 -        rcu_read_lock();
 -        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 -        rt6 = (struct rt6_info *)dst;
 -        /* For dst cached in uncached_list, refcnt is already taken. */
 -        if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
 -                dst = &net->ipv6.ip6_null_entry->dst;
 -                dst_hold(dst);
 -        }
 -        rcu_read_unlock();
 +      rcu_read_lock();
 +      dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 +      rt6 = (struct rt6_info *)dst;
 +      /* For dst cached in uncached_list, refcnt is already taken. */
 +      if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
 +              dst = &net->ipv6.ip6_null_entry->dst;
 +              dst_hold(dst);
 +      }
 +      rcu_read_unlock();
  
 -        return dst;
 +      return dst;
  }
  EXPORT_SYMBOL_GPL(ip6_route_output_flags);
  
diff --combined net/mptcp/options.c
@@@ -26,7 -26,6 +26,7 @@@ static void mptcp_parse_option(const st
        int expected_opsize;
        u8 version;
        u8 flags;
 +      u8 i;
  
        switch (subtype) {
        case MPTCPOPT_MP_CAPABLE:
                break;
  
        case MPTCPOPT_RM_ADDR:
 -              if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
 +              if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
 +                  opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
                        break;
  
                ptr++;
  
                mp_opt->rm_addr = 1;
 -              mp_opt->rm_id = *ptr++;
 -              pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
 +              mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
 +              for (i = 0; i < mp_opt->rm_list.nr; i++)
 +                      mp_opt->rm_list.ids[i] = *ptr++;
 +              pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
                break;
  
        case MPTCPOPT_MP_PRIO:
@@@ -571,15 -567,15 +571,15 @@@ static bool mptcp_established_options_d
  }
  
  static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                 struct in_addr *addr)
+                                 struct in_addr *addr, u16 port)
  {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[7];
  
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s_addr, 4);
-       msg[5] = 0;
-       msg[6] = 0;
+       msg[5] = port >> 8;
+       msg[6] = port & 0xFF;
  
        mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
  
  
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
  static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                  struct in6_addr *addr)
+                                  struct in6_addr *addr, u16 port)
  {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[19];
  
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s6_addr, 16);
-       msg[17] = 0;
-       msg[18] = 0;
+       msg[17] = port >> 8;
+       msg[18] = port & 0xFF;
  
        mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
  
@@@ -650,7 -646,8 +650,8 @@@ static bool mptcp_established_options_a
                        opts->ahmac = add_addr_generate_hmac(msk->local_key,
                                                             msk->remote_key,
                                                             opts->addr_id,
-                                                            &opts->addr);
+                                                            &opts->addr,
+                                                            opts->port);
                }
        }
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
                        opts->ahmac = add_addr6_generate_hmac(msk->local_key,
                                                              msk->remote_key,
                                                              opts->addr_id,
-                                                             &opts->addr6);
+                                                             &opts->addr6,
+                                                             opts->port);
                }
        }
  #endif
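
The ADD_ADDR HMAC now covers the advertised port as well: the two bytes appended after the address are the port in network byte order, matching msg[5]/msg[6] (IPv4) and msg[17]/msg[18] (IPv6) above. The packing as a standalone sketch:

        static void hmac_msg_put_port(u8 *msg, u16 port)
        {
                msg[0] = port >> 8;     /* high byte first, i.e. network order */
                msg[1] = port & 0xff;
        }
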
@@@ -678,25 -676,20 +680,25 @@@ static bool mptcp_established_options_r
  {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
 -      u8 rm_id;
 +      struct mptcp_rm_list rm_list;
 +      int i, len;
  
        if (!mptcp_pm_should_rm_signal(msk) ||
 -          !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
 +          !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
                return false;
  
 -      if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
 +      len = mptcp_rm_addr_len(&rm_list);
 +      if (len < 0)
 +              return false;
 +      if (remaining < len)
                return false;
  
 -      *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
 +      *size = len;
        opts->suboptions |= OPTION_MPTCP_RM_ADDR;
 -      opts->rm_id = rm_id;
 +      opts->rm_list = rm_list;
  
 -      pr_debug("rm_id=%d", opts->rm_id);
 +      for (i = 0; i < opts->rm_list.nr; i++)
 +              pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
  
        return true;
  }
@@@ -971,12 -964,14 +973,14 @@@ static bool add_addr_hmac_valid(struct 
        if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
                hmac = add_addr_generate_hmac(msk->remote_key,
                                              msk->local_key,
-                                             mp_opt->addr_id, &mp_opt->addr);
+                                             mp_opt->addr_id, &mp_opt->addr,
+                                             mp_opt->port);
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
        else
                hmac = add_addr6_generate_hmac(msk->remote_key,
                                               msk->local_key,
-                                              mp_opt->addr_id, &mp_opt->addr6);
+                                              mp_opt->addr_id, &mp_opt->addr6,
+                                              mp_opt->port);
  #endif
  
        pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
@@@ -1047,7 -1042,7 +1051,7 @@@ void mptcp_incoming_options(struct soc
        }
  
        if (mp_opt.rm_addr) {
 -              mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
 +              mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
                mp_opt.rm_addr = 0;
        }
  
@@@ -1226,23 -1221,9 +1230,23 @@@ mp_capable_done
        }
  
        if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
 +              u8 i = 1;
 +
                *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
 -                                    TCPOLEN_MPTCP_RM_ADDR_BASE,
 -                                    0, opts->rm_id);
 +                                    TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
 +                                    0, opts->rm_list.ids[0]);
 +
 +              while (i < opts->rm_list.nr) {
 +                      u8 id1, id2, id3, id4;
 +
 +                      id1 = opts->rm_list.ids[i];
 +                      id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
 +                      id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
 +                      id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
 +                      put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
 +                      ptr += 1;
 +                      i += 4;
 +              }
        }
  
        if (OPTION_MPTCP_PRIO & opts->suboptions) {
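
RM_ADDR can now remove a list of address ids in one option: the first id rides in the option's leading 32-bit word and the remaining ids are packed four per word, padded with TCPOPT_NOP. A worked wire layout for a hypothetical rm_list of three ids {5, 6, 7}, following the writer above:

        /* word 0:  kind | len = TCPOLEN_MPTCP_RM_ADDR_BASE + 3 | subtype/flags | id 5
         * word 1:  id 6 | id 7 | TCPOPT_NOP | TCPOPT_NOP
         *
         * The trailing NOPs fall outside the option length and are parsed as
         * ordinary one-byte TCP NOP options, keeping the header 32-bit aligned.
         */
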
diff --combined net/netfilter/nf_flow_table_core.c
@@@ -79,8 -79,11 +79,8 @@@ static int flow_offload_fill_route(stru
                                   enum flow_offload_tuple_dir dir)
  {
        struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
 -      struct dst_entry *other_dst = route->tuple[!dir].dst;
        struct dst_entry *dst = route->tuple[dir].dst;
 -
 -      if (!dst_hold_safe(route->tuple[dir].dst))
 -              return -1;
 +      int i, j = 0;
  
        switch (flow_tuple->l3proto) {
        case NFPROTO_IPV4:
                break;
        }
  
 -      flow_tuple->iifidx = other_dst->dev->ifindex;
 -      flow_tuple->dst_cache = dst;
 +      flow_tuple->iifidx = route->tuple[dir].in.ifindex;
 +      for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
 +              flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
 +              flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
 +              if (route->tuple[dir].in.ingress_vlans & BIT(i))
 +                      flow_tuple->in_vlan_ingress |= BIT(j);
 +              j++;
 +      }
 +      flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
 +
 +      switch (route->tuple[dir].xmit_type) {
 +      case FLOW_OFFLOAD_XMIT_DIRECT:
 +              memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
 +                     ETH_ALEN);
 +              memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
 +                     ETH_ALEN);
 +              flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
 +              flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
 +              break;
 +      case FLOW_OFFLOAD_XMIT_XFRM:
 +      case FLOW_OFFLOAD_XMIT_NEIGH:
 +              if (!dst_hold_safe(route->tuple[dir].dst))
 +                      return -1;
 +
 +              flow_tuple->dst_cache = dst;
 +              break;
 +      }
 +      flow_tuple->xmit_type = route->tuple[dir].xmit_type;
  
        return 0;
  }
  
 +static void nft_flow_dst_release(struct flow_offload *flow,
 +                               enum flow_offload_tuple_dir dir)
 +{
 +      if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
 +          flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
 +              dst_release(flow->tuplehash[dir].tuple.dst_cache);
 +}
 +
  int flow_offload_route_init(struct flow_offload *flow,
                            const struct nf_flow_route *route)
  {
        return 0;
  
  err_route_reply:
 -      dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
  
        return err;
  }
@@@ -200,8 -169,8 +200,8 @@@ static void flow_offload_fixup_ct(struc
  
  static void flow_offload_route_release(struct flow_offload *flow)
  {
 -      dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 -      dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
  }
  
  void flow_offload_free(struct flow_offload *flow)
@@@ -420,20 -389,29 +420,20 @@@ static void nf_flow_offload_work_gc(str
        queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
  }
  
 -
 -static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 -                              __be16 port, __be16 new_port)
 +static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 +                               __be16 port, __be16 new_port)
  {
        struct tcphdr *tcph;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
 -              return -1;
 -
        tcph = (void *)(skb_network_header(skb) + thoff);
        inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
 -
 -      return 0;
  }
  
 -static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
 -                              __be16 port, __be16 new_port)
 +static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
 +                               __be16 port, __be16 new_port)
  {
        struct udphdr *udph;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
 -              return -1;
 -
        udph = (void *)(skb_network_header(skb) + thoff);
        if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
                inet_proto_csum_replace2(&udph->check, skb, port,
                if (!udph->check)
                        udph->check = CSUM_MANGLED_0;
        }
 -
 -      return 0;
  }
  
 -static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
 -                          u8 protocol, __be16 port, __be16 new_port)
 +static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
 +                           u8 protocol, __be16 port, __be16 new_port)
  {
        switch (protocol) {
        case IPPROTO_TCP:
 -              if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
 -                      return NF_DROP;
 +              nf_flow_nat_port_tcp(skb, thoff, port, new_port);
                break;
        case IPPROTO_UDP:
 -              if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
 -                      return NF_DROP;
 +              nf_flow_nat_port_udp(skb, thoff, port, new_port);
                break;
        }
 -
 -      return 0;
  }
  
 -int nf_flow_snat_port(const struct flow_offload *flow,
 -                    struct sk_buff *skb, unsigned int thoff,
 -                    u8 protocol, enum flow_offload_tuple_dir dir)
 +void nf_flow_snat_port(const struct flow_offload *flow,
 +                     struct sk_buff *skb, unsigned int thoff,
 +                     u8 protocol, enum flow_offload_tuple_dir dir)
  {
        struct flow_ports *hdr;
        __be16 port, new_port;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
 -              return -1;
 -
        hdr = (void *)(skb_network_header(skb) + thoff);
  
        switch (dir) {
                new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
                hdr->dest = new_port;
                break;
 -      default:
 -              return -1;
        }
  
 -      return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
 +      nf_flow_nat_port(skb, thoff, protocol, port, new_port);
  }
  EXPORT_SYMBOL_GPL(nf_flow_snat_port);
  
 -int nf_flow_dnat_port(const struct flow_offload *flow,
 -                    struct sk_buff *skb, unsigned int thoff,
 -                    u8 protocol, enum flow_offload_tuple_dir dir)
 +void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
 +                     unsigned int thoff, u8 protocol,
 +                     enum flow_offload_tuple_dir dir)
  {
        struct flow_ports *hdr;
        __be16 port, new_port;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
 -              return -1;
 -
        hdr = (void *)(skb_network_header(skb) + thoff);
  
        switch (dir) {
                new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
                hdr->source = new_port;
                break;
 -      default:
 -              return -1;
        }
  
 -      return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
 +      nf_flow_nat_port(skb, thoff, protocol, port, new_port);
  }
  EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
  
@@@ -512,7 -506,7 +512,7 @@@ int nf_flow_table_init(struct nf_flowta
  {
        int err;
  
-       INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+       INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
        flow_block_init(&flowtable->flow_block);
        init_rwsem(&flowtable->flow_block_lock);
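
flow_offload_fill_route() above copies the input encap entries in reverse
order and remaps the per-index ingress-VLAN bitmask to the new positions.
A small stand-alone sketch of that reverse copy, assuming plain user-space
types rather than the kernel structs:

#include <stdint.h>
#include <stdio.h>

#define BIT(n)	(1U << (n))

struct encap { uint16_t id; uint16_t proto; };

/* Copy in[num-1..0] to out[0..num-1] and move each set bit of in_bits to the
 * position its entry ends up at, returning the remapped mask. */
static uint32_t reverse_encaps(const struct encap *in, int num,
			       uint32_t in_bits, struct encap *out)
{
	uint32_t out_bits = 0;
	int i, j = 0;

	for (i = num - 1; i >= 0; i--) {
		out[j] = in[i];
		if (in_bits & BIT(i))
			out_bits |= BIT(j);
		j++;
	}
	return out_bits;
}

int main(void)
{
	struct encap in[2] = { { 100, 0x8100 }, { 200, 0x88a8 } };
	struct encap out[2];
	uint32_t bits = reverse_encaps(in, 2, BIT(0), out);

	printf("out[0].id=%u bits=%#x\n", (unsigned)out[0].id, (unsigned)bits);
	/* out[0].id=200 bits=0x2: the entry that was first is now last */
	return 0;
}
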
  
diff --combined net/netfilter/nf_tables_api.c
@@@ -900,12 -900,6 +900,12 @@@ static void nf_tables_table_disable(str
        nft_table_disable(net, table, 0);
  }
  
 +enum {
 +      NFT_TABLE_STATE_UNCHANGED       = 0,
 +      NFT_TABLE_STATE_DORMANT,
 +      NFT_TABLE_STATE_WAKEUP
 +};
 +
  static int nf_tables_updtable(struct nft_ctx *ctx)
  {
        struct nft_trans *trans;
  
        if ((flags & NFT_TABLE_F_DORMANT) &&
            !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
 -              nft_trans_table_enable(trans) = false;
 +              nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
        } else if (!(flags & NFT_TABLE_F_DORMANT) &&
                   ctx->table->flags & NFT_TABLE_F_DORMANT) {
 -              ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
                ret = nf_tables_table_enable(ctx->net, ctx->table);
                if (ret >= 0)
 -                      nft_trans_table_enable(trans) = true;
 -              else
 -                      ctx->table->flags |= NFT_TABLE_F_DORMANT;
 +                      nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP;
        }
        if (ret < 0)
                goto err;
  
 +      nft_trans_table_flags(trans) = flags;
        nft_trans_table_update(trans) = true;
        list_add_tail(&trans->list, &ctx->net->nft.commit_list);
        return 0;
@@@ -6787,6 -6783,9 +6787,9 @@@ static int nft_register_flowtable_net_h
  
        list_for_each_entry(hook, hook_list, list) {
                list_for_each_entry(ft, &table->flowtables, list) {
+                       if (!nft_is_active_next(net, ft))
+                               continue;
                        list_for_each_entry(hook2, &ft->hook_list, list) {
                                if (hook->ops.dev == hook2->ops.dev &&
                                    hook->ops.pf == hook2->ops.pf) {
@@@ -6846,6 -6845,7 +6849,7 @@@ static int nft_flowtable_update(struct 
        struct nft_hook *hook, *next;
        struct nft_trans *trans;
        bool unregister = false;
+       u32 flags;
        int err;
  
        err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
                }
        }
  
+       if (nla[NFTA_FLOWTABLE_FLAGS]) {
+               flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+               if (flags & ~NFT_FLOWTABLE_MASK)
+                       return -EOPNOTSUPP;
+               if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+                   (flags & NFT_FLOWTABLE_HW_OFFLOAD))
+                       return -EOPNOTSUPP;
+       } else {
+               flags = flowtable->data.flags;
+       }
        err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
                                               &flowtable_hook.list, flowtable);
        if (err < 0)
                goto err_flowtable_update_hook;
        }
  
+       nft_trans_flowtable_flags(trans) = flags;
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
@@@ -6967,8 -6979,10 +6983,10 @@@ static int nf_tables_newflowtable(struc
        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flowtable->data.flags =
                        ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
-               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
+               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+                       err = -EOPNOTSUPP;
                        goto err3;
+               }
        }
  
        write_pnet(&flowtable->data.net, net);
@@@ -8072,10 -8086,11 +8090,10 @@@ static int nf_tables_commit(struct net 
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
 -                              if (!nft_trans_table_enable(trans)) {
 -                                      nf_tables_table_disable(net,
 -                                                              trans->ctx.table);
 -                                      trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 -                              }
 +                              if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
 +                                      nf_tables_table_disable(net, trans->ctx.table);
 +
 +                              trans->ctx.table->flags = nft_trans_table_flags(trans);
                        } else {
                                nft_clear(net, trans->ctx.table);
                        }
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
+                               nft_trans_flowtable(trans)->data.flags =
+                                       nft_trans_flowtable_flags(trans);
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
@@@ -8286,9 -8303,11 +8306,9 @@@ static int __nf_tables_abort(struct ne
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
 -                              if (nft_trans_table_enable(trans)) {
 -                                      nf_tables_table_disable(net,
 -                                                              trans->ctx.table);
 -                                      trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 -                              }
 +                              if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP)
 +                                      nf_tables_table_disable(net, trans->ctx.table);
 +
                                nft_trans_destroy(trans);
                        } else {
                                list_del_rcu(&trans->ctx.table->list);
@@@ -8558,7 -8577,6 +8578,7 @@@ static int nf_tables_check_loops(const 
                                                        data->verdict.chain);
                                if (err < 0)
                                        return err;
 +                              break;
                        default:
                                break;
                        }
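
The nf_tables hunks above record the intended dormant/wakeup state and the
new flags in the transaction, apply them only at commit time, and undo only
the early hook registration on abort.  A condensed sketch of that
prepare/commit/abort pattern, with illustrative names standing in for the
nft_trans_* accessors:

#include <stdbool.h>
#include <stdio.h>

enum { STATE_UNCHANGED, STATE_DORMANT, STATE_WAKEUP };

#define F_DORMANT	0x1

struct table { unsigned int flags; bool hooks_registered; };
struct trans { int state; unsigned int flags; };

/* prepare: decide the target state and do the one side effect (hook
 * registration) that must happen before commit */
static void prepare(struct table *t, unsigned int new_flags, struct trans *tr)
{
	tr->state = STATE_UNCHANGED;
	if ((new_flags & F_DORMANT) && !(t->flags & F_DORMANT)) {
		tr->state = STATE_DORMANT;
	} else if (!(new_flags & F_DORMANT) && (t->flags & F_DORMANT)) {
		t->hooks_registered = true;	/* enabled up front */
		tr->state = STATE_WAKEUP;
	}
	tr->flags = new_flags;
}

/* commit: apply the recorded state; flags only change here */
static void commit(struct table *t, const struct trans *tr)
{
	if (tr->state == STATE_DORMANT)
		t->hooks_registered = false;
	t->flags = tr->flags;
}

/* abort: roll back only what prepare() already did */
static void abort_tx(struct table *t, const struct trans *tr)
{
	if (tr->state == STATE_WAKEUP)
		t->hooks_registered = false;
}

int main(void)
{
	struct table t = { .flags = F_DORMANT };
	struct trans tr;

	prepare(&t, 0, &tr);		/* wake the table up */
	commit(&t, &tr);
	printf("flags=%#x hooks=%d\n", t.flags, t.hooks_registered);	/* flags=0 hooks=1 */

	prepare(&t, F_DORMANT, &tr);	/* ask to make it dormant again... */
	abort_tx(&t, &tr);		/* ...but abandon the transaction */
	printf("flags=%#x hooks=%d\n", t.flags, t.hooks_registered);	/* unchanged */
	return 0;
}
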
diff --combined net/sched/cls_api.c
@@@ -1629,6 -1629,7 +1629,7 @@@ int tcf_classify_ingress(struct sk_buf
                        return TC_ACT_SHOT;
                ext->chain = last_executed_chain;
                ext->mru = qdisc_skb_cb(skb)->mru;
+               ext->post_ct = qdisc_skb_cb(skb)->post_ct;
        }
  
        return ret;
@@@ -3661,9 -3662,6 +3662,9 @@@ int tc_setup_flow_action(struct flow_ac
                        entry->police.burst = tcf_police_burst(act);
                        entry->police.rate_bytes_ps =
                                tcf_police_rate_bytes_ps(act);
 +                      entry->police.burst_pkt = tcf_police_burst_pkt(act);
 +                      entry->police.rate_pkt_ps =
 +                              tcf_police_rate_pkt_ps(act);
                        entry->police.mtu = tcf_police_tcfp_mtu(act);
                        entry->police.index = act->tcfa_index;
                } else if (is_tcf_ct(act)) {
diff --combined net/sched/cls_flower.c
@@@ -209,16 -209,16 +209,16 @@@ static bool fl_range_port_dst_cmp(struc
                                  struct fl_flow_key *key,
                                  struct fl_flow_key *mkey)
  {
 -      __be16 min_mask, max_mask, min_val, max_val;
 +      u16 min_mask, max_mask, min_val, max_val;
  
 -      min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
 -      max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
 -      min_val = htons(filter->key.tp_range.tp_min.dst);
 -      max_val = htons(filter->key.tp_range.tp_max.dst);
 +      min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
 +      max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
 +      min_val = ntohs(filter->key.tp_range.tp_min.dst);
 +      max_val = ntohs(filter->key.tp_range.tp_max.dst);
  
        if (min_mask && max_mask) {
 -              if (htons(key->tp_range.tp.dst) < min_val ||
 -                  htons(key->tp_range.tp.dst) > max_val)
 +              if (ntohs(key->tp_range.tp.dst) < min_val ||
 +                  ntohs(key->tp_range.tp.dst) > max_val)
                        return false;
  
                /* skb does not have min and max values */
@@@ -232,16 -232,16 +232,16 @@@ static bool fl_range_port_src_cmp(struc
                                  struct fl_flow_key *key,
                                  struct fl_flow_key *mkey)
  {
 -      __be16 min_mask, max_mask, min_val, max_val;
 +      u16 min_mask, max_mask, min_val, max_val;
  
 -      min_mask = htons(filter->mask->key.tp_range.tp_min.src);
 -      max_mask = htons(filter->mask->key.tp_range.tp_max.src);
 -      min_val = htons(filter->key.tp_range.tp_min.src);
 -      max_val = htons(filter->key.tp_range.tp_max.src);
 +      min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
 +      max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
 +      min_val = ntohs(filter->key.tp_range.tp_min.src);
 +      max_val = ntohs(filter->key.tp_range.tp_max.src);
  
        if (min_mask && max_mask) {
 -              if (htons(key->tp_range.tp.src) < min_val ||
 -                  htons(key->tp_range.tp.src) > max_val)
 +              if (ntohs(key->tp_range.tp.src) < min_val ||
 +                  ntohs(key->tp_range.tp.src) > max_val)
                        return false;
  
                /* skb does not have min and max values */
@@@ -783,16 -783,16 +783,16 @@@ static int fl_set_key_port_range(struc
                       TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
  
        if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
 -          htons(key->tp_range.tp_max.dst) <=
 -          htons(key->tp_range.tp_min.dst)) {
 +          ntohs(key->tp_range.tp_max.dst) <=
 +          ntohs(key->tp_range.tp_min.dst)) {
                NL_SET_ERR_MSG_ATTR(extack,
                                    tb[TCA_FLOWER_KEY_PORT_DST_MIN],
                                    "Invalid destination port range (min must be strictly smaller than max)");
                return -EINVAL;
        }
        if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
 -          htons(key->tp_range.tp_max.src) <=
 -          htons(key->tp_range.tp_min.src)) {
 +          ntohs(key->tp_range.tp_max.src) <=
 +          ntohs(key->tp_range.tp_min.src)) {
                NL_SET_ERR_MSG_ATTR(extack,
                                    tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
                                    "Invalid source port range (min must be strictly smaller than max)");
@@@ -1044,8 -1044,8 +1044,8 @@@ static int fl_set_key_flags(struct nlat
                return -EINVAL;
        }
  
 -      key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
 -      mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
 +      key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS]));
 +      mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
  
        *flags_key  = 0;
        *flags_mask = 0;
@@@ -1451,7 -1451,7 +1451,7 @@@ static int fl_set_key_ct(struct nlattr 
                               &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
                               sizeof(key->ct_state));
  
-               err = fl_validate_ct_state(mask->ct_state,
+               err = fl_validate_ct_state(key->ct_state & mask->ct_state,
                                           tb[TCA_FLOWER_KEY_CT_STATE_MASK],
                                           extack);
                if (err)
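
The port-range hunks in cls_flower above now convert the network-order
__be16 keys with ntohs() before doing numeric comparisons.  A user-space
sketch of the same inclusive range check, with hypothetical helper names:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* port, lo and hi are in network byte order, as the flower keys are */
static bool port_in_range(uint16_t port, uint16_t lo, uint16_t hi)
{
	uint16_t p = ntohs(port);

	return p >= ntohs(lo) && p <= ntohs(hi);
}

int main(void)
{
	uint16_t lo = htons(1000), hi = htons(2000);

	printf("%d\n", port_in_range(htons(1500), lo, hi));	/* 1 */
	printf("%d\n", port_in_range(htons(80), lo, hi));	/* 0 */
	return 0;
}
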
diff --combined net/tipc/node.c
@@@ -372,49 -372,42 +372,49 @@@ static struct tipc_node *tipc_node_find
  }
  
  static void tipc_node_read_lock(struct tipc_node *n)
 +      __acquires(n->lock)
  {
        read_lock_bh(&n->lock);
  }
  
  static void tipc_node_read_unlock(struct tipc_node *n)
 +      __releases(n->lock)
  {
        read_unlock_bh(&n->lock);
  }
  
  static void tipc_node_write_lock(struct tipc_node *n)
 +      __acquires(n->lock)
  {
        write_lock_bh(&n->lock);
  }
  
  static void tipc_node_write_unlock_fast(struct tipc_node *n)
 +      __releases(n->lock)
  {
        write_unlock_bh(&n->lock);
  }
  
  static void tipc_node_write_unlock(struct tipc_node *n)
 +      __releases(n->lock)
  {
 +      struct tipc_socket_addr sk;
        struct net *net = n->net;
 -      u32 addr = 0;
        u32 flags = n->action_flags;
 -      u32 link_id = 0;
 -      u32 bearer_id;
        struct list_head *publ_list;
 +      struct tipc_uaddr ua;
 +      u32 bearer_id;
  
        if (likely(!flags)) {
                write_unlock_bh(&n->lock);
                return;
        }
  
 -      addr = n->addr;
 -      link_id = n->link_id;
 -      bearer_id = link_id & 0xffff;
 +      tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
 +                 TIPC_LINK_STATE, n->addr, n->addr);
 +      sk.ref = n->link_id;
 +      sk.node = n->addr;
 +      bearer_id = n->link_id & 0xffff;
        publ_list = &n->publ_list;
  
        n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
        write_unlock_bh(&n->lock);
  
        if (flags & TIPC_NOTIFY_NODE_DOWN)
 -              tipc_publ_notify(net, publ_list, addr, n->capabilities);
 +              tipc_publ_notify(net, publ_list, n->addr, n->capabilities);
  
        if (flags & TIPC_NOTIFY_NODE_UP)
 -              tipc_named_node_up(net, addr, n->capabilities);
 +              tipc_named_node_up(net, n->addr, n->capabilities);
  
        if (flags & TIPC_NOTIFY_LINK_UP) {
 -              tipc_mon_peer_up(net, addr, bearer_id);
 -              tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
 -                                   TIPC_NODE_SCOPE, link_id, link_id);
 +              tipc_mon_peer_up(net, n->addr, bearer_id);
 +              tipc_nametbl_publish(net, &ua, &sk, n->link_id);
        }
        if (flags & TIPC_NOTIFY_LINK_DOWN) {
 -              tipc_mon_peer_down(net, addr, bearer_id);
 -              tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
 -                                    addr, link_id);
 +              tipc_mon_peer_down(net, n->addr, bearer_id);
 +              tipc_nametbl_withdraw(net, &ua, &sk, n->link_id);
        }
  }
  
@@@ -2900,17 -2895,22 +2900,22 @@@ int tipc_nl_node_dump_monitor_peer(stru
  
  #ifdef CONFIG_TIPC_CRYPTO
  static int tipc_nl_retrieve_key(struct nlattr **attrs,
-                               struct tipc_aead_key **key)
+                               struct tipc_aead_key **pkey)
  {
        struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+       struct tipc_aead_key *key;
  
        if (!attr)
                return -ENODATA;
  
-       *key = (struct tipc_aead_key *)nla_data(attr);
-       if (nla_len(attr) < tipc_aead_key_size(*key))
+       if (nla_len(attr) < sizeof(*key))
+               return -EINVAL;
+       key = (struct tipc_aead_key *)nla_data(attr);
+       if (key->keylen > TIPC_AEAD_KEYLEN_MAX ||
+           nla_len(attr) < tipc_aead_key_size(key))
                return -EINVAL;
  
+       *pkey = key;
        return 0;
  }
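
tipc_nl_retrieve_key() above first checks that the fixed part of the
attribute fits, then bounds keylen, and only then trusts the full
tipc_aead_key_size().  A generic sketch of that validate-before-dereference
pattern, using hypothetical stand-in types rather than the TIPC ones:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_KEYLEN	64	/* stand-in for TIPC_AEAD_KEYLEN_MAX */

struct aead_key {		/* hypothetical stand-in for struct tipc_aead_key */
	uint32_t keylen;
	uint8_t key[];
};

/* Accept the blob only if the fixed header fits, the inner length is sane,
 * and the variable part fits inside the attribute as well. */
static const struct aead_key *parse_key(const void *data, size_t len)
{
	const struct aead_key *k = data;

	if (len < sizeof(*k))
		return NULL;
	if (k->keylen > MAX_KEYLEN || len < sizeof(*k) + k->keylen)
		return NULL;
	return k;
}

int main(void)
{
	size_t len = sizeof(struct aead_key) + 16;
	struct aead_key *k = calloc(1, len);

	if (!k)
		return 1;
	k->keylen = 16;
	printf("%s\n", parse_key(k, len) ? "ok" : "rejected");	/* ok */
	k->keylen = 1024;	/* claims more key bytes than the buffer holds */
	printf("%s\n", parse_key(k, len) ? "ok" : "rejected");	/* rejected */
	free(k);
	return 0;
}
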
  
diff --combined tools/lib/bpf/Makefile
@@@ -158,7 -158,7 +158,7 @@@ $(BPF_IN_STATIC): force $(BPF_HELPER_DE
        $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR)
  
  $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
 -      $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \
 +      $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
                --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS)
  
  $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
@@@ -215,7 -215,7 +215,7 @@@ define do_instal
        if [ ! -d '$(DESTDIR_SQ)$2' ]; then             \
                $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
        fi;                                             \
-       $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2'
+       $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
  endef
  
  install_lib: all_cmd
diff --combined tools/lib/bpf/btf_dump.c
@@@ -279,7 -279,6 +279,7 @@@ static int btf_dump_mark_referenced(str
                case BTF_KIND_INT:
                case BTF_KIND_ENUM:
                case BTF_KIND_FWD:
 +              case BTF_KIND_FLOAT:
                        break;
  
                case BTF_KIND_VOLATILE:
@@@ -454,7 -453,6 +454,7 @@@ static int btf_dump_order_type(struct b
  
        switch (btf_kind(t)) {
        case BTF_KIND_INT:
 +      case BTF_KIND_FLOAT:
                tstate->order_state = ORDERED;
                return 0;
  
                return err;
  
        case BTF_KIND_ARRAY:
-               return btf_dump_order_type(d, btf_array(t)->type, through_ptr);
+               return btf_dump_order_type(d, btf_array(t)->type, false);
  
        case BTF_KIND_STRUCT:
        case BTF_KIND_UNION: {
@@@ -1135,7 -1133,6 +1135,7 @@@ skip_mod
                case BTF_KIND_STRUCT:
                case BTF_KIND_UNION:
                case BTF_KIND_TYPEDEF:
 +              case BTF_KIND_FLOAT:
                        goto done;
                default:
                        pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n",
@@@ -1250,7 -1247,6 +1250,7 @@@ static void btf_dump_emit_type_chain(st
  
                switch (kind) {
                case BTF_KIND_INT:
 +              case BTF_KIND_FLOAT:
                        btf_dump_emit_mods(d, decls);
                        name = btf_name_of(d, t->name_off);
                        btf_dump_printf(d, "%s", name);
diff --combined tools/lib/bpf/libbpf.c
@@@ -178,8 -178,6 +178,8 @@@ enum kern_feature_id 
        FEAT_PROG_BIND_MAP,
        /* Kernel support for module BTFs */
        FEAT_MODULE_BTF,
 +      /* BTF_KIND_FLOAT support */
 +      FEAT_BTF_FLOAT,
        __FEAT_CNT,
  };
  
@@@ -190,7 -188,6 +190,7 @@@ enum reloc_type 
        RELO_CALL,
        RELO_DATA,
        RELO_EXTERN,
 +      RELO_SUBPROG_ADDR,
  };
  
  struct reloc_desc {
@@@ -577,16 -574,6 +577,16 @@@ static bool insn_is_subprog_call(const 
               insn->off == 0;
  }
  
 +static bool is_ldimm64(struct bpf_insn *insn)
 +{
 +      return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 +}
 +
 +static bool insn_is_pseudo_func(struct bpf_insn *insn)
 +{
 +      return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 +}
 +
  static int
  bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog,
                      const char *name, size_t sec_idx, const char *sec_name,
@@@ -1194,7 -1181,8 +1194,8 @@@ static int bpf_object__elf_init(struct 
        if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) {
                pr_warn("elf: failed to get section names strings from %s: %s\n",
                        obj->path, elf_errmsg(-1));
-               return -LIBBPF_ERRNO__FORMAT;
+               err = -LIBBPF_ERRNO__FORMAT;
+               goto errout;
        }
  
        /* Old LLVM set e_machine to EM_NONE */
@@@ -1948,7 -1936,6 +1949,7 @@@ static const char *btf_kind_str(const s
        case BTF_KIND_FUNC_PROTO: return "func_proto";
        case BTF_KIND_VAR: return "var";
        case BTF_KIND_DATASEC: return "datasec";
 +      case BTF_KIND_FLOAT: return "float";
        default: return "unknown";
        }
  }
@@@ -2398,17 -2385,15 +2399,17 @@@ static bool btf_needs_sanitization(stru
  {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
 +      bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
  
 -      return !has_func || !has_datasec || !has_func_global;
 +      return !has_func || !has_datasec || !has_func_global || !has_float;
  }
  
  static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
  {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
 +      bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
        struct btf_type *t;
        int i, j, vlen;
                } else if (!has_func_global && btf_is_func(t)) {
                        /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */
                        t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0);
 +              } else if (!has_float && btf_is_float(t)) {
 +                      /* replace FLOAT with an equally-sized empty STRUCT;
 +                       * since C compilers do not accept e.g. "float" as a
 +                       * valid struct name, make it anonymous
 +                       */
 +                      t->name_off = 0;
 +                      t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0);
                }
        }
  }
@@@ -2997,23 -2975,6 +2998,23 @@@ static bool sym_is_extern(const GElf_Sy
               GELF_ST_TYPE(sym->st_info) == STT_NOTYPE;
  }
  
 +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
 +{
 +      int bind = GELF_ST_BIND(sym->st_info);
 +      int type = GELF_ST_TYPE(sym->st_info);
 +
 +      /* in .text section */
 +      if (sym->st_shndx != text_shndx)
 +              return false;
 +
 +      /* local function */
 +      if (bind == STB_LOCAL && type == STT_SECTION)
 +              return true;
 +
 +      /* global function */
 +      return bind == STB_GLOBAL && type == STT_FUNC;
 +}
 +
  static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
  {
        const struct btf_type *t;
@@@ -3435,7 -3396,7 +3436,7 @@@ static int bpf_program__record_reloc(st
                return 0;
        }
  
 -      if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) {
 +      if (!is_ldimm64(insn)) {
                pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
                        prog->name, sym_name, insn_idx, insn->code);
                return -LIBBPF_ERRNO__RELOC;
                return -LIBBPF_ERRNO__RELOC;
        }
  
 +      /* loading subprog addresses */
 +      if (sym_is_subprog(sym, obj->efile.text_shndx)) {
 +              /* global_func: sym->st_value = offset in the section, insn->imm = 0.
 +               * local_func: sym->st_value = 0, insn->imm = offset in the section.
 +               */
 +              if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) {
 +                      pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n",
 +                              prog->name, sym_name, (size_t)sym->st_value, insn->imm);
 +                      return -LIBBPF_ERRNO__RELOC;
 +              }
 +
 +              reloc_desc->type = RELO_SUBPROG_ADDR;
 +              reloc_desc->insn_idx = insn_idx;
 +              reloc_desc->sym_off = sym->st_value;
 +              return 0;
 +      }
 +
        type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
        sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
  
@@@ -3939,18 -3883,6 +3940,18 @@@ static int probe_kern_btf_datasec(void
                                             strs, sizeof(strs)));
  }
  
 +static int probe_kern_btf_float(void)
 +{
 +      static const char strs[] = "\0float";
 +      __u32 types[] = {
 +              /* float */
 +              BTF_TYPE_FLOAT_ENC(1, 4),
 +      };
 +
 +      return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
 +                                           strs, sizeof(strs)));
 +}
 +
  static int probe_kern_array_mmap(void)
  {
        struct bpf_create_map_attr attr = {
@@@ -4130,9 -4062,6 +4131,9 @@@ static struct kern_feature_desc 
        [FEAT_MODULE_BTF] = {
                "module BTF support", probe_module_btf,
        },
 +      [FEAT_BTF_FLOAT] = {
 +              "BTF_KIND_FLOAT support", probe_kern_btf_float,
 +      },
  };
  
  static bool kernel_supports(enum kern_feature_id feat_id)
@@@ -5638,6 -5567,11 +5639,6 @@@ static void bpf_core_poison_insn(struc
        insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */
  }
  
 -static bool is_ldimm64(struct bpf_insn *insn)
 -{
 -      return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 -}
 -
  static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
  {
        switch (BPF_SIZE(insn->code)) {
@@@ -6239,10 -6173,6 +6240,10 @@@ bpf_object__relocate_data(struct bpf_ob
                        }
                        relo->processed = true;
                        break;
 +              case RELO_SUBPROG_ADDR:
 +                      insn[0].src_reg = BPF_PSEUDO_FUNC;
 +                      /* will be handled as a follow up pass */
 +                      break;
                case RELO_CALL:
                        /* will be handled as a follow up pass */
                        break;
@@@ -6429,11 -6359,11 +6430,11 @@@ bpf_object__reloc_code(struct bpf_objec
  
        for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) {
                insn = &main_prog->insns[prog->sub_insn_off + insn_idx];
 -              if (!insn_is_subprog_call(insn))
 +              if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn))
                        continue;
  
                relo = find_prog_insn_relo(prog, insn_idx);
 -              if (relo && relo->type != RELO_CALL) {
 +              if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) {
                        pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n",
                                prog->name, insn_idx, relo->type);
                        return -LIBBPF_ERRNO__RELOC;
                         * call always has imm = -1, but for static functions
                         * relocation is against STT_SECTION and insn->imm
                         * points to a start of a static function
 +                       *
 +                       * for subprog addr relocation, the relo->sym_off + insn->imm is
 +                       * the byte offset in the corresponding section.
                         */
 -                      sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
 +                      if (relo->type == RELO_CALL)
 +                              sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
 +                      else
 +                              sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ;
 +              } else if (insn_is_pseudo_func(insn)) {
 +                      /*
 +                       * RELO_SUBPROG_ADDR relo is always emitted even if both
 +                       * functions are in the same section, so it shouldn't reach here.
 +                       */
 +                      pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n",
 +                              prog->name, insn_idx);
 +                      return -LIBBPF_ERRNO__RELOC;
                } else {
                        /* if subprogram call is to a static function within
                         * the same ELF section, there won't be any relocation
diff --combined tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@@ -174,6 -174,12 +174,12 @@@ struct struct_in_struct 
        };
  };
  
+ struct struct_in_array {};
+ struct struct_in_array_typed {};
+ typedef struct struct_in_array_typed struct_in_array_t[2];
  struct struct_with_embedded_stuff {
        int a;
        struct {
        } r[5];
        struct struct_in_struct s[10];
        int t[11];
+       struct struct_in_array (*u)[2];
+       struct_in_array_t *v;
  };
  
 +struct float_struct {
 +      float f;
 +      const double *d;
 +      volatile long double *ld;
 +};
 +
  struct root_struct {
        enum e1 _1;
        enum e2 _2;
        union_fwd_t *_12;
        union_fwd_ptr_t _13;
        struct struct_with_embedded_stuff _14;
 +      struct float_struct _15;
  };
  
  /* ------ END-EXPECTED-OUTPUT ------ */
diff --combined tools/testing/selftests/net/mptcp/mptcp_join.sh
@@@ -11,6 -11,7 +11,7 @@@ ksft_skip=
  timeout=30
  mptcp_connect=""
  capture=0
+ do_all_tests=1
  
  TEST_COUNT=0
  
@@@ -121,12 -122,6 +122,6 @@@ reset_with_add_addr_timeout(
                -j DROP
  }
  
- for arg in "$@"; do
-       if [ "$arg" = "-c" ]; then
-               capture=1
-       fi
- done
  ip -Version > /dev/null 2>&1
  if [ $? -ne 0 ];then
        echo "SKIP: Could not run test without ip tool"
@@@ -284,19 -279,14 +279,19 @@@ do_transfer(
                let rm_nr_ns1=-addr_nr_ns1
                if [ $rm_nr_ns1 -lt 8 ]; then
                        counter=1
 -                      sleep 1
 -
 -                      while [ $counter -le $rm_nr_ns1 ]
 -                      do
 -                              ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
 +                      dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
 +                      if [ ${#dump[@]} -gt 0 ]; then
 +                              id=${dump[1]}
                                sleep 1
 -                              let counter+=1
 -                      done
 +
 +                              while [ $counter -le $rm_nr_ns1 ]
 +                              do
 +                                      ip netns exec ${listener_ns} ./pm_nl_ctl del $id
 +                                      sleep 1
 +                                      let counter+=1
 +                                      let id+=1
 +                              done
 +                      fi
                else
                        sleep 1
                        ip netns exec ${listener_ns} ./pm_nl_ctl flush
                let rm_nr_ns2=-addr_nr_ns2
                if [ $rm_nr_ns2 -lt 8 ]; then
                        counter=1
 -                      sleep 1
 -
 -                      while [ $counter -le $rm_nr_ns2 ]
 -                      do
 -                              ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
 +                      dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
 +                      if [ ${#dump[@]} -gt 0 ]; then
 +                              id=${dump[1]}
                                sleep 1
 -                              let counter+=1
 -                      done
 +
 +                              while [ $counter -le $rm_nr_ns2 ]
 +                              do
 +                                      ip netns exec ${connector_ns} ./pm_nl_ctl del $id
 +                                      sleep 1
 +                                      let counter+=1
 +                                      let id+=1
 +                              done
 +                      fi
                else
                        sleep 1
                        ip netns exec ${connector_ns} ./pm_nl_ctl flush
@@@ -620,22 -605,11 +615,22 @@@ chk_rm_nr(
  {
        local rm_addr_nr=$1
        local rm_subflow_nr=$2
 +      local invert=${3:-""}
        local count
        local dump_stats
 +      local addr_ns
 +      local subflow_ns
 +
 +      if [ -z $invert ]; then
 +              addr_ns=$ns1
 +              subflow_ns=$ns2
 +      elif [ $invert = "invert" ]; then
 +              addr_ns=$ns2
 +              subflow_ns=$ns1
 +      fi
  
        printf "%-39s %s" " " "rm "
 -      count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
 +      count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
        [ -z "$count" ] && count=0
        if [ "$count" != "$rm_addr_nr" ]; then
                echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr"
        fi
  
        echo -n " - sf    "
 -      count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
 +      count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
        [ -z "$count" ] && count=0
        if [ "$count" != "$rm_subflow_nr" ]; then
                echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr"
@@@ -854,7 -828,7 +849,7 @@@ remove_tests(
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address" 1 1 1
        chk_add_nr 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal, remove
        reset
        chk_join_nr "flush subflows and signal" 3 3 3
        chk_add_nr 1 1
        chk_rm_nr 2 2
 +
 +      # subflows flush
 +      reset
 +      ip netns exec $ns1 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns2 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
 +      run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
 +      chk_join_nr "flush subflows" 3 3 3
 +      chk_rm_nr 3 3
 +
 +      # addresses flush
 +      reset
 +      ip netns exec $ns1 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
 +      ip netns exec $ns2 ./pm_nl_ctl limits 3 3
 +      run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
 +      chk_join_nr "flush addresses" 3 3 3
 +      chk_add_nr 3 3
 +      chk_rm_nr 3 3 invert
  }
  
  add_tests()
@@@ -989,7 -940,7 +984,7 @@@ ipv6_tests(
        run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow
        chk_join_nr "remove single address IPv6" 1 1 1
        chk_add_nr 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal IPv6, remove
        reset
@@@ -1132,7 -1083,7 +1127,7 @@@ add_addr_ports_tests(
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address with port" 1 1 1
        chk_add_nr 1 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal with port, remove
        reset
@@@ -1265,7 -1216,8 +1260,8 @@@ usage(
        echo "  -4 v4mapped_tests"
        echo "  -b backup_tests"
        echo "  -p add_addr_ports_tests"
-       echo "  -c syncookies_tests"
+       echo "  -k syncookies_tests"
+       echo "  -c capture pcap files"
        echo "  -h help"
  }
  
@@@ -1279,12 -1231,24 +1275,24 @@@ make_file "$cin" "client" 
  make_file "$sin" "server" 1
  trap cleanup EXIT
  
- if [ -z $1 ]; then
+ for arg in "$@"; do
+       # check for "capture" arg before launching tests
+       if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then
+               capture=1
+       fi
+       # exception for the capture option, the rest means: a part of the tests
+       if [ "${arg}" != "-c" ]; then
+               do_all_tests=0
+       fi
+ done
+ if [ $do_all_tests -eq 1 ]; then
        all_tests
        exit $ret
  fi
  
- while getopts 'fsltra64bpch' opt; do
+ while getopts 'fsltra64bpkch' opt; do
        case $opt in
                f)
                        subflows_tests
                p)
                        add_addr_ports_tests
                        ;;
-               c)
+               k)
                        syncookies_tests
                        ;;
+               c)
+                       ;;
                h | *)
                        usage
                        ;;