Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
author David S. Miller <davem@davemloft.net>
Thu, 25 Mar 2021 22:31:22 +0000 (15:31 -0700)
committer David S. Miller <davem@davemloft.net>
Thu, 25 Mar 2021 22:31:22 +0000 (15:31 -0700)
Signed-off-by: David S. Miller <davem@davemloft.net>
63 files changed:
MAINTAINERS
drivers/atm/fore200e.c
drivers/net/dsa/b53/b53_common.c
drivers/net/dsa/bcm_sf2.c
drivers/net/dsa/mt7530.c
drivers/net/ethernet/intel/e1000e/netdev.c
drivers/net/ethernet/intel/i40e/i40e_main.c
drivers/net/ethernet/intel/i40e/i40e_txrx.c
drivers/net/ethernet/intel/ice/ice_txrx.c
drivers/net/ethernet/intel/ice/ice_xsk.c
drivers/net/ethernet/intel/igb/igb_main.c
drivers/net/ethernet/intel/igb/igb_ptp.c
drivers/net/ethernet/intel/igc/igc_main.c
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
drivers/net/ethernet/marvell/octeontx2/af/rvu.h
drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c
drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_flows.c
drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c
drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun_encap.c
drivers/net/ethernet/mellanox/mlx5/core/en_main.c
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/ipoib/ipoib.c
drivers/net/ethernet/mellanox/mlx5/core/sf/hw_table.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_ste_v1.c
drivers/net/ethernet/pensando/ionic/ionic_txrx.c
drivers/net/ethernet/realtek/r8169_main.c
drivers/net/ipa/ipa_cmd.c
drivers/net/ipa/ipa_qmi.c
drivers/net/phy/phylink.c
include/linux/bpf.h
include/linux/netdevice.h
include/linux/skbuff.h
include/net/netfilter/nf_tables.h
include/net/nexthop.h
include/uapi/linux/bpf.h
include/uapi/linux/psample.h
init/Kconfig
kernel/bpf/bpf_inode_storage.c
kernel/bpf/verifier.c
kernel/fork.c
net/core/dev.c
net/core/drop_monitor.c
net/core/filter.c
net/core/flow_dissector.c
net/ipv4/route.c
net/ipv6/route.c
net/mptcp/options.c
net/netfilter/nf_flow_table_core.c
net/netfilter/nf_tables_api.c
net/sched/cls_api.c
net/sched/cls_flower.c
net/tipc/node.c
tools/lib/bpf/Makefile
tools/lib/bpf/btf_dump.c
tools/lib/bpf/libbpf.c
tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
tools/testing/selftests/net/mptcp/mptcp_join.sh

diff --combined MAINTAINERS
@@@ -261,8 -261,8 +261,8 @@@ ABI/AP
  L:    linux-api@vger.kernel.org
  F:    include/linux/syscalls.h
  F:    kernel/sys_ni.c
F:    include/uapi/
F:    arch/*/include/uapi/
X:    include/uapi/
X:    arch/*/include/uapi/
  
  ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
  M:    Hans de Goede <hdegoede@redhat.com>
@@@ -1181,7 -1181,7 +1181,7 @@@ M:      Joel Fernandes <joel@joelfernandes.o
  M:    Christian Brauner <christian@brauner.io>
  M:    Hridya Valsaraju <hridya@google.com>
  M:    Suren Baghdasaryan <surenb@google.com>
- L:    devel@driverdev.osuosl.org
+ L:    linux-kernel@vger.kernel.org
  S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
  F:    drivers/android/
@@@ -1530,7 -1530,6 +1530,7 @@@ F:      Documentation/devicetree/bindings/dm
  F:    Documentation/devicetree/bindings/i2c/i2c-owl.yaml
  F:    Documentation/devicetree/bindings/interrupt-controller/actions,owl-sirq.yaml
  F:    Documentation/devicetree/bindings/mmc/owl-mmc.yaml
 +F:    Documentation/devicetree/bindings/net/actions,owl-emac.yaml
  F:    Documentation/devicetree/bindings/pinctrl/actions,*
  F:    Documentation/devicetree/bindings/power/actions,owl-sps.txt
  F:    Documentation/devicetree/bindings/timer/actions,owl-timer.txt
@@@ -1543,7 -1542,6 +1543,7 @@@ F:      drivers/dma/owl-dma.
  F:    drivers/i2c/busses/i2c-owl.c
  F:    drivers/irqchip/irq-owl-sirq.c
  F:    drivers/mmc/host/owl-mmc.c
 +F:    drivers/net/ethernet/actions/
  F:    drivers/pinctrl/actions/*
  F:    drivers/soc/actions/
  F:    include/dt-bindings/power/owl-*
@@@ -3235,7 -3233,6 +3235,7 @@@ T:      git git://git.kernel.org/pub/scm/lin
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next.git
  F:    Documentation/bpf/
  F:    Documentation/networking/filter.rst
 +F:    Documentation/userspace-api/ebpf/
  F:    arch/*/net/*
  F:    include/linux/bpf*
  F:    include/linux/filter.h
@@@ -3250,7 -3247,6 +3250,7 @@@ F:      net/core/filter.
  F:    net/sched/act_bpf.c
  F:    net/sched/cls_bpf.c
  F:    samples/bpf/
 +F:    scripts/bpf_doc.py
  F:    tools/bpf/
  F:    tools/lib/bpf/
  F:    tools/testing/selftests/bpf/
@@@ -5475,11 -5471,11 +5475,11 @@@ F:   drivers/net/ethernet/freescale/dpaa2
  F:    drivers/net/ethernet/freescale/dpaa2/dpni*
  
  DPAA2 ETHERNET SWITCH DRIVER
 -M:    Ioana Radulescu <ruxandra.radulescu@nxp.com>
  M:    Ioana Ciornei <ioana.ciornei@nxp.com>
 -L:    linux-kernel@vger.kernel.org
 +L:    netdev@vger.kernel.org
  S:    Maintained
 -F:    drivers/staging/fsl-dpaa2/ethsw
 +F:    drivers/net/ethernet/freescale/dpaa2/dpaa2-switch*
 +F:    drivers/net/ethernet/freescale/dpaa2/dpsw*
  
  DPT_I2O SCSI RAID DRIVER
  M:    Adaptec OEM Raid Solutions <aacraid@microsemi.com>
@@@ -5839,7 -5835,7 +5839,7 @@@ M:      David Airlie <airlied@linux.ie
  M:    Daniel Vetter <daniel@ffwll.ch>
  L:    dri-devel@lists.freedesktop.org
  S:    Maintained
- B:    https://bugs.freedesktop.org/
+ B:    https://gitlab.freedesktop.org/drm
  C:    irc://chat.freenode.net/dri-devel
  T:    git git://anongit.freedesktop.org/drm/drm
  F:    Documentation/devicetree/bindings/display/
@@@ -8120,7 -8116,6 +8120,6 @@@ F:      drivers/crypto/hisilicon/sec2/sec_ma
  
  HISILICON STAGING DRIVERS FOR HIKEY 960/970
  M:    Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
- L:    devel@driverdev.osuosl.org
  S:    Maintained
  F:    drivers/staging/hikey9xx/
  
@@@ -8525,6 -8520,7 +8524,7 @@@ IBM Power SRIOV Virtual NIC Device Driv
  M:    Dany Madden <drt@linux.ibm.com>
  M:    Lijun Pan <ljp@linux.ibm.com>
  M:    Sukadev Bhattiprolu <sukadev@linux.ibm.com>
+ R:    Thomas Falcon <tlfalcon@linux.ibm.com>
  L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/ethernet/ibm/ibmvnic.*
@@@ -12542,7 -12538,7 +12542,7 @@@ NETWORKING [MPTCP
  M:    Mat Martineau <mathew.j.martineau@linux.intel.com>
  M:    Matthieu Baerts <matthieu.baerts@tessares.net>
  L:    netdev@vger.kernel.org
- L:    mptcp@lists.01.org
+ L:    mptcp@lists.linux.dev
  S:    Maintained
  W:    https://github.com/multipath-tcp/mptcp_net-next/wiki
  B:    https://github.com/multipath-tcp/mptcp_net-next/issues
@@@ -14713,15 -14709,11 +14713,11 @@@ F:        drivers/net/ethernet/qlogic/qlcnic
  QLOGIC QLGE 10Gb ETHERNET DRIVER
  M:    Manish Chopra <manishc@marvell.com>
  M:    GR-Linux-NIC-Dev@marvell.com
- L:    netdev@vger.kernel.org
- S:    Supported
- F:    drivers/staging/qlge/
- QLOGIC QLGE 10Gb ETHERNET DRIVER
  M:    Coiby Xu <coiby.xu@gmail.com>
  L:    netdev@vger.kernel.org
- S:    Maintained
+ S:    Supported
  F:    Documentation/networking/device_drivers/qlogic/qlge.rst
+ F:    drivers/staging/qlge/
  
  QM1D1B0004 MEDIA DRIVER
  M:    Akihiro Tsukada <tskd08@gmail.com>
@@@ -16891,8 -16883,10 +16887,10 @@@ F: tools/spi
  
  SPIDERNET NETWORK DRIVER for CELL
  M:    Ishizaki Kou <kou.ishizaki@toshiba.co.jp>
+ M:    Geoff Levand <geoff@infradead.org>
  L:    netdev@vger.kernel.org
- S:    Supported
+ L:    linuxppc-dev@lists.ozlabs.org
+ S:    Maintained
  F:    Documentation/networking/device_drivers/ethernet/toshiba/spider_net.rst
  F:    drivers/net/ethernet/toshiba/spider_net*
  
@@@ -17044,7 -17038,7 +17042,7 @@@ F:   drivers/staging/vt665?
  
  STAGING SUBSYSTEM
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L:    devel@driverdev.osuosl.org
+ L:    linux-staging@lists.linux.dev
  S:    Supported
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/staging.git
  F:    drivers/staging/
@@@ -19139,7 -19133,7 +19137,7 @@@ VME SUBSYSTE
  M:    Martyn Welch <martyn@welchs.me.uk>
  M:    Manohar Vanga <manohar.vanga@gmail.com>
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
- L:    devel@driverdev.osuosl.org
+ L:    linux-kernel@vger.kernel.org
  S:    Maintained
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/char-misc.git
  F:    Documentation/driver-api/vme.rst
@@@ -19170,7 -19164,7 +19168,7 @@@ S:   Maintaine
  F:    drivers/infiniband/hw/vmw_pvrdma/
  
  VMware PVSCSI driver
- M:    Jim Gill <jgill@vmware.com>
+ M:    Vishal Bhakta <vbhakta@vmware.com>
  M:    VMware PV-Drivers <pv-drivers@vmware.com>
  L:    linux-scsi@vger.kernel.org
  S:    Maintained
diff --combined drivers/atm/fore200e.c
@@@ -21,6 -21,7 +21,6 @@@
  #include <linux/module.h>
  #include <linux/atmdev.h>
  #include <linux/sonet.h>
 -#include <linux/atm_suni.h>
  #include <linux/dma-mapping.h>
  #include <linux/delay.h>
  #include <linux/firmware.h>
@@@ -99,8 -100,6 +99,6 @@@ static LIST_HEAD(fore200e_boards)
  
  MODULE_AUTHOR("Christophe Lizzi - credits to Uwe Dannowski and Heikki Vatiainen");
  MODULE_DESCRIPTION("FORE Systems 200E-series ATM driver - version " FORE200E_VERSION);
- MODULE_SUPPORTED_DEVICE("PCA-200E, SBA-200E");
  
  static const int fore200e_rx_buf_nbr[ BUFFER_SCHEME_NBR ][ BUFFER_MAGN_NBR ] = {
      { BUFFER_S1_NBR, BUFFER_L1_NBR },
diff --combined drivers/net/dsa/b53/b53_common.c
@@@ -349,7 -349,7 +349,7 @@@ static void b53_set_forwarding(struct b
        b53_write8(dev, B53_CTRL_PAGE, B53_IP_MULTICAST_CTRL, mgmt);
  }
  
 -static void b53_enable_vlan(struct b53_device *dev, bool enable,
 +static void b53_enable_vlan(struct b53_device *dev, int port, bool enable,
                            bool enable_filtering)
  {
        u8 mgmt, vc0, vc1, vc4 = 0, vc5;
        b53_write8(dev, B53_CTRL_PAGE, B53_SWITCH_MODE, mgmt);
  
        dev->vlan_enabled = enable;
 +
 +      dev_dbg(dev->dev, "Port %d VLAN enabled: %d, filtering: %d\n",
 +              port, enable, enable_filtering);
  }
  
  static int b53_set_jumbo(struct b53_device *dev, bool enable, bool allow_10_100)
@@@ -746,7 -743,7 +746,7 @@@ int b53_configure_vlan(struct dsa_switc
                b53_do_vlan_op(dev, VTA_CMD_CLEAR);
        }
  
 -      b53_enable_vlan(dev, dev->vlan_enabled, ds->vlan_filtering);
 +      b53_enable_vlan(dev, -1, dev->vlan_enabled, ds->vlan_filtering);
  
        b53_for_each_port(dev, i)
                b53_write16(dev, B53_VLAN_PAGE,
@@@ -1108,13 -1105,6 +1108,6 @@@ static int b53_setup(struct dsa_switch 
                        b53_disable_port(ds, port);
        }
  
-       /* Let DSA handle the case were multiple bridges span the same switch
-        * device and different VLAN awareness settings are requested, which
-        * would be breaking filtering semantics for any of the other bridge
-        * devices. (not hardware supported)
-        */
-       ds->vlan_filtering_is_global = true;
        return b53_setup_devlink_resources(ds);
  }
  
@@@ -1432,7 -1422,7 +1425,7 @@@ int b53_vlan_filtering(struct dsa_switc
  {
        struct b53_device *dev = ds->priv;
  
 -      b53_enable_vlan(dev, dev->vlan_enabled, vlan_filtering);
 +      b53_enable_vlan(dev, port, dev->vlan_enabled, vlan_filtering);
  
        return 0;
  }
@@@ -1457,7 -1447,7 +1450,7 @@@ static int b53_vlan_prepare(struct dsa_
        if (vlan->vid >= dev->num_vlans)
                return -ERANGE;
  
 -      b53_enable_vlan(dev, true, ds->vlan_filtering);
 +      b53_enable_vlan(dev, port, true, ds->vlan_filtering);
  
        return 0;
  }
@@@ -2055,17 -2045,15 +2048,17 @@@ enum dsa_tag_protocol b53_get_tag_proto
  {
        struct b53_device *dev = ds->priv;
  
 -      /* Older models (5325, 5365) support a different tag format that we do
 -       * not support in net/dsa/tag_brcm.c yet.
 -       */
 -      if (is5325(dev) || is5365(dev) ||
 -          !b53_can_enable_brcm_tags(ds, port, mprot)) {
 +      if (!b53_can_enable_brcm_tags(ds, port, mprot)) {
                dev->tag_protocol = DSA_TAG_PROTO_NONE;
                goto out;
        }
  
 +      /* Older models require a different 6 byte tag */
 +      if (is5325(dev) || is5365(dev) || is63xx(dev)) {
 +              dev->tag_protocol = DSA_TAG_PROTO_BRCM_LEGACY;
 +              goto out;
 +      }
 +
        /* Broadcom BCM58xx chips have a flow accelerator on Port 8
         * which requires us to use the prepended Broadcom tag type
         */
@@@ -2669,6 -2657,13 +2662,13 @@@ struct b53_device *b53_switch_alloc(str
        ds->ops = &b53_switch_ops;
        ds->untag_bridge_pvid = true;
        dev->vlan_enabled = true;
+       /* Let DSA handle the case were multiple bridges span the same switch
+        * device and different VLAN awareness settings are requested, which
+        * would be breaking filtering semantics for any of the other bridge
+        * devices. (not hardware supported)
+        */
+       ds->vlan_filtering_is_global = true;
        mutex_init(&dev->reg_mutex);
        mutex_init(&dev->stats_mutex);
  
diff --combined drivers/net/dsa/bcm_sf2.c
  #include "b53/b53_priv.h"
  #include "b53/b53_regs.h"
  
 +static u16 bcm_sf2_reg_rgmii_cntrl(struct bcm_sf2_priv *priv, int port)
 +{
 +      switch (priv->type) {
 +      case BCM4908_DEVICE_ID:
 +              switch (port) {
 +              case 7:
 +                      return REG_RGMII_11_CNTRL;
 +              default:
 +                      break;
 +              }
 +              break;
 +      default:
 +              switch (port) {
 +              case 0:
 +                      return REG_RGMII_0_CNTRL;
 +              case 1:
 +                      return REG_RGMII_1_CNTRL;
 +              case 2:
 +                      return REG_RGMII_2_CNTRL;
 +              default:
 +                      break;
 +              }
 +      }
 +
 +      WARN_ONCE(1, "Unsupported port %d\n", port);
 +
 +      /* RO fallback reg */
 +      return REG_SWITCH_STATUS;
 +}
 +
  /* Return the number of active ports, not counting the IMP (CPU) port */
  static unsigned int bcm_sf2_num_active_ports(struct dsa_switch *ds)
  {
@@@ -144,7 -114,10 +144,10 @@@ static void bcm_sf2_imp_setup(struct ds
                /* Force link status for IMP port */
                reg = core_readl(priv, offset);
                reg |= (MII_SW_OR | LINK_STS);
-               reg &= ~GMII_SPEED_UP_2G;
+               if (priv->type == BCM4908_DEVICE_ID)
+                       reg |= GMII_SPEED_UP_2G;
+               else
+                       reg &= ~GMII_SPEED_UP_2G;
                core_writel(priv, reg, offset);
  
                /* Enable Broadcast, Multicast, Unicast forwarding to IMP port */
@@@ -462,44 -435,6 +465,44 @@@ static int bcm_sf2_sw_rst(struct bcm_sf
        return 0;
  }
  
 +static void bcm_sf2_crossbar_setup(struct bcm_sf2_priv *priv)
 +{
 +      struct device *dev = priv->dev->ds->dev;
 +      int shift;
 +      u32 mask;
 +      u32 reg;
 +      int i;
 +
 +      mask = BIT(priv->num_crossbar_int_ports) - 1;
 +
 +      reg = reg_readl(priv, REG_CROSSBAR);
 +      switch (priv->type) {
 +      case BCM4908_DEVICE_ID:
 +              shift = CROSSBAR_BCM4908_INT_P7 * priv->num_crossbar_int_ports;
 +              reg &= ~(mask << shift);
 +              if (0) /* FIXME */
 +                      reg |= CROSSBAR_BCM4908_EXT_SERDES << shift;
 +              else if (priv->int_phy_mask & BIT(7))
 +                      reg |= CROSSBAR_BCM4908_EXT_GPHY4 << shift;
 +              else if (phy_interface_mode_is_rgmii(priv->port_sts[7].mode))
 +                      reg |= CROSSBAR_BCM4908_EXT_RGMII << shift;
 +              else if (WARN(1, "Invalid port mode\n"))
 +                      return;
 +              break;
 +      default:
 +              return;
 +      }
 +      reg_writel(priv, reg, REG_CROSSBAR);
 +
 +      reg = reg_readl(priv, REG_CROSSBAR);
 +      for (i = 0; i < priv->num_crossbar_int_ports; i++) {
 +              shift = i * priv->num_crossbar_int_ports;
 +
 +              dev_dbg(dev, "crossbar int port #%d - ext port #%d\n", i,
 +                      (reg >> shift) & mask);
 +      }
 +}
 +
  static void bcm_sf2_intr_disable(struct bcm_sf2_priv *priv)
  {
        intrl2_0_mask_set(priv, 0xffffffff);
  static void bcm_sf2_identify_ports(struct bcm_sf2_priv *priv,
                                   struct device_node *dn)
  {
 +      struct device *dev = priv->dev->ds->dev;
 +      struct bcm_sf2_port_status *port_st;
        struct device_node *port;
        unsigned int port_num;
        struct property *prop;
 -      phy_interface_t mode;
        int err;
  
        priv->moca_port = -1;
                if (of_property_read_u32(port, "reg", &port_num))
                        continue;
  
 +              if (port_num >= DSA_MAX_PORTS) {
 +                      dev_err(dev, "Invalid port number %d\n", port_num);
 +                      continue;
 +              }
 +
 +              port_st = &priv->port_sts[port_num];
 +
                /* Internal PHYs get assigned a specific 'phy-mode' property
                 * value: "internal" to help flag them before MDIO probing
                 * has completed, since they might be turned off at that
                 * time
                 */
 -              err = of_get_phy_mode(port, &mode);
 +              err = of_get_phy_mode(port, &port_st->mode);
                if (err)
                        continue;
  
 -              if (mode == PHY_INTERFACE_MODE_INTERNAL)
 +              if (port_st->mode == PHY_INTERFACE_MODE_INTERNAL)
                        priv->int_phy_mask |= 1 << port_num;
  
 -              if (mode == PHY_INTERFACE_MODE_MOCA)
 +              if (port_st->mode == PHY_INTERFACE_MODE_MOCA)
                        priv->moca_port = port_num;
  
                if (of_property_read_bool(port, "brcm,use-bcm-hdr"))
@@@ -661,8 -588,10 +664,10 @@@ static u32 bcm_sf2_sw_get_phy_flags(str
         * in bits 15:8 and the patch level in bits 7:0 which is exactly what
         * the REG_PHY_REVISION register layout is.
         */
-       return priv->hw_params.gphy_rev;
+       if (priv->int_phy_mask & BIT(port))
+               return priv->hw_params.gphy_rev;
+       else
+               return 0;
  }
  
  static void bcm_sf2_sw_validate(struct dsa_switch *ds, int port,
@@@ -718,7 -647,6 +723,7 @@@ static void bcm_sf2_sw_mac_config(struc
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        u32 id_mode_dis = 0, port_mode;
 +      u32 reg_rgmii_ctrl;
        u32 reg;
  
        if (port == core_readl(priv, CORE_IMP0_PRT_ID))
                return;
        }
  
 +      reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
        /* Clear id_mode_dis bit, and the existing port mode, let
         * RGMII_MODE_EN bet set by mac_link_{up,down}
         */
 -      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +      reg = reg_readl(priv, reg_rgmii_ctrl);
        reg &= ~ID_MODE_DIS;
        reg &= ~(PORT_MODE_MASK << PORT_MODE_SHIFT);
  
        if (id_mode_dis)
                reg |= ID_MODE_DIS;
  
 -      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +      reg_writel(priv, reg, reg_rgmii_ctrl);
  }
  
  static void bcm_sf2_sw_mac_link_set(struct dsa_switch *ds, int port,
                                    phy_interface_t interface, bool link)
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
 +      u32 reg_rgmii_ctrl;
        u32 reg;
  
        if (!phy_interface_mode_is_rgmii(interface) &&
            interface != PHY_INTERFACE_MODE_REVMII)
                return;
  
 +      reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
        /* If the link is down, just disable the interface to conserve power */
 -      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +      reg = reg_readl(priv, reg_rgmii_ctrl);
        if (link)
                reg |= RGMII_MODE_EN;
        else
                reg &= ~RGMII_MODE_EN;
 -      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +      reg_writel(priv, reg, reg_rgmii_ctrl);
  }
  
  static void bcm_sf2_sw_mac_link_down(struct dsa_switch *ds, int port,
@@@ -812,15 -735,11 +817,15 @@@ static void bcm_sf2_sw_mac_link_up(stru
  {
        struct bcm_sf2_priv *priv = bcm_sf2_to_priv(ds);
        struct ethtool_eee *p = &priv->dev->ports[port].eee;
 -      u32 reg, offset;
  
        bcm_sf2_sw_mac_link_set(ds, port, interface, true);
  
        if (port != core_readl(priv, CORE_IMP0_PRT_ID)) {
 +              u32 reg_rgmii_ctrl;
 +              u32 reg, offset;
 +
 +              reg_rgmii_ctrl = bcm_sf2_reg_rgmii_cntrl(priv, port);
 +
                if (priv->type == BCM4908_DEVICE_ID ||
                    priv->type == BCM7445_DEVICE_ID)
                        offset = CORE_STS_OVERRIDE_GMIIP_PORT(port);
                    interface == PHY_INTERFACE_MODE_RGMII_TXID ||
                    interface == PHY_INTERFACE_MODE_MII ||
                    interface == PHY_INTERFACE_MODE_REVMII) {
 -                      reg = reg_readl(priv, REG_RGMII_CNTRL_P(port));
 +                      reg = reg_readl(priv, reg_rgmii_ctrl);
                        reg &= ~(RX_PAUSE_EN | TX_PAUSE_EN);
  
                        if (tx_pause)
                        if (rx_pause)
                                reg |= RX_PAUSE_EN;
  
 -                      reg_writel(priv, reg, REG_RGMII_CNTRL_P(port));
 +                      reg_writel(priv, reg, reg_rgmii_ctrl);
                }
  
                reg = SW_OVERRIDE | LINK_STS;
@@@ -942,8 -861,6 +947,8 @@@ static int bcm_sf2_sw_resume(struct dsa
                return ret;
        }
  
 +      bcm_sf2_crossbar_setup(priv);
 +
        ret = bcm_sf2_cfp_resume(ds);
        if (ret)
                return ret;
@@@ -1216,7 -1133,6 +1221,7 @@@ struct bcm_sf2_of_data 
        const u16 *reg_offsets;
        unsigned int core_reg_align;
        unsigned int num_cfp_rules;
 +      unsigned int num_crossbar_int_ports;
  };
  
  static const u16 bcm_sf2_4908_reg_offsets[] = {
        [REG_PHY_REVISION]      = 0x14,
        [REG_SPHY_CNTRL]        = 0x24,
        [REG_CROSSBAR]          = 0xc8,
 -      [REG_RGMII_0_CNTRL]     = 0xe0,
 -      [REG_RGMII_1_CNTRL]     = 0xec,
 -      [REG_RGMII_2_CNTRL]     = 0xf8,
 +      [REG_RGMII_11_CNTRL]    = 0x014c,
        [REG_LED_0_CNTRL]       = 0x40,
        [REG_LED_1_CNTRL]       = 0x4c,
        [REG_LED_2_CNTRL]       = 0x58,
@@@ -1238,8 -1156,7 +1243,8 @@@ static const struct bcm_sf2_of_data bcm
        .type           = BCM4908_DEVICE_ID,
        .core_reg_align = 0,
        .reg_offsets    = bcm_sf2_4908_reg_offsets,
 -      .num_cfp_rules  = 0, /* FIXME */
 +      .num_cfp_rules  = 256,
 +      .num_crossbar_int_ports = 2,
  };
  
  /* Register offsets for the SWITCH_REG_* block */
@@@ -1350,7 -1267,6 +1355,7 @@@ static int bcm_sf2_sw_probe(struct plat
        priv->reg_offsets = data->reg_offsets;
        priv->core_reg_align = data->core_reg_align;
        priv->num_cfp_rules = data->num_cfp_rules;
 +      priv->num_crossbar_int_ports = data->num_crossbar_int_ports;
  
        priv->rcdev = devm_reset_control_get_optional_exclusive(&pdev->dev,
                                                                "switch");
                goto out_clk_mdiv;
        }
  
 +      bcm_sf2_crossbar_setup(priv);
 +
        bcm_sf2_gphy_enable_set(priv->dev->ds, true);
  
        ret = bcm_sf2_mdio_register(ds);
diff --combined drivers/net/dsa/mt7530.c
@@@ -436,34 -436,32 +436,32 @@@ mt7530_pad_clk_setup(struct dsa_switch 
                             TD_DM_DRVP(8) | TD_DM_DRVN(8));
  
        /* Setup core clock for MT7530 */
-       if (!trgint) {
-               /* Disable MT7530 core clock */
-               core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-               /* Disable PLL, since phy_device has not yet been created
-                * provided for phy_[read,write]_mmd_indirect is called, we
-                * provide our own core_write_mmd_indirect to complete this
-                * function.
-                */
-               core_write_mmd_indirect(priv,
-                                       CORE_GSWPLL_GRP1,
-                                       MDIO_MMD_VEND2,
-                                       0);
-               /* Set core clock into 500Mhz */
-               core_write(priv, CORE_GSWPLL_GRP2,
-                          RG_GSWPLL_POSDIV_500M(1) |
-                          RG_GSWPLL_FBKDIV_500M(25));
-               /* Enable PLL */
-               core_write(priv, CORE_GSWPLL_GRP1,
-                          RG_GSWPLL_EN_PRE |
-                          RG_GSWPLL_POSDIV_200M(2) |
-                          RG_GSWPLL_FBKDIV_200M(32));
-               /* Enable MT7530 core clock */
-               core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
-       }
+       /* Disable MT7530 core clock */
+       core_clear(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
+       /* Disable PLL, since phy_device has not yet been created
+        * provided for phy_[read,write]_mmd_indirect is called, we
+        * provide our own core_write_mmd_indirect to complete this
+        * function.
+        */
+       core_write_mmd_indirect(priv,
+                               CORE_GSWPLL_GRP1,
+                               MDIO_MMD_VEND2,
+                               0);
+       /* Set core clock into 500Mhz */
+       core_write(priv, CORE_GSWPLL_GRP2,
+                  RG_GSWPLL_POSDIV_500M(1) |
+                  RG_GSWPLL_FBKDIV_500M(25));
+       /* Enable PLL */
+       core_write(priv, CORE_GSWPLL_GRP1,
+                  RG_GSWPLL_EN_PRE |
+                  RG_GSWPLL_POSDIV_200M(2) |
+                  RG_GSWPLL_FBKDIV_200M(32));
+       /* Enable MT7530 core clock */
+       core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
  
        /* Setup the MT7530 TRGMII Tx Clock */
        core_set(priv, CORE_TRGMII_GSW_CLK_CG, REG_GSWCK_EN);
@@@ -999,9 -997,8 +997,9 @@@ mt753x_cpu_port_enable(struct dsa_switc
        mt7530_write(priv, MT7530_PVC_P(port),
                     PORT_SPEC_TAG);
  
 -      /* Unknown multicast frame forwarding to the cpu port */
 -      mt7530_rmw(priv, MT7530_MFC, UNM_FFP_MASK, UNM_FFP(BIT(port)));
 +      /* Disable flooding by default */
 +      mt7530_rmw(priv, MT7530_MFC, BC_FFP_MASK | UNM_FFP_MASK | UNU_FFP_MASK,
 +                 BC_FFP(BIT(port)) | UNM_FFP(BIT(port)) | UNU_FFP(BIT(port)));
  
        /* Set CPU port number */
        if (priv->id == ID_MT7621)
@@@ -1138,56 -1135,6 +1136,56 @@@ mt7530_stp_state_set(struct dsa_switch 
        mt7530_rmw(priv, MT7530_SSP_P(port), FID_PST_MASK, stp_state);
  }
  
 +static int
 +mt7530_port_pre_bridge_flags(struct dsa_switch *ds, int port,
 +                           struct switchdev_brport_flags flags,
 +                           struct netlink_ext_ack *extack)
 +{
 +      if (flags.mask & ~(BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD |
 +                         BR_BCAST_FLOOD))
 +              return -EINVAL;
 +
 +      return 0;
 +}
 +
 +static int
 +mt7530_port_bridge_flags(struct dsa_switch *ds, int port,
 +                       struct switchdev_brport_flags flags,
 +                       struct netlink_ext_ack *extack)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +
 +      if (flags.mask & BR_LEARNING)
 +              mt7530_rmw(priv, MT7530_PSC_P(port), SA_DIS,
 +                         flags.val & BR_LEARNING ? 0 : SA_DIS);
 +
 +      if (flags.mask & BR_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, UNU_FFP(BIT(port)),
 +                         flags.val & BR_FLOOD ? UNU_FFP(BIT(port)) : 0);
 +
 +      if (flags.mask & BR_MCAST_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
 +                         flags.val & BR_MCAST_FLOOD ? UNM_FFP(BIT(port)) : 0);
 +
 +      if (flags.mask & BR_BCAST_FLOOD)
 +              mt7530_rmw(priv, MT7530_MFC, BC_FFP(BIT(port)),
 +                         flags.val & BR_BCAST_FLOOD ? BC_FFP(BIT(port)) : 0);
 +
 +      return 0;
 +}
 +
 +static int
 +mt7530_port_set_mrouter(struct dsa_switch *ds, int port, bool mrouter,
 +                      struct netlink_ext_ack *extack)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +
 +      mt7530_rmw(priv, MT7530_MFC, UNM_FFP(BIT(port)),
 +                 mrouter ? UNM_FFP(BIT(port)) : 0);
 +
 +      return 0;
 +}
 +
  static int
  mt7530_port_bridge_join(struct dsa_switch *ds, int port,
                        struct net_device *bridge)
        return 0;
  }
  
 +static int
 +mt7530_port_mdb_add(struct dsa_switch *ds, int port,
 +                  const struct switchdev_obj_port_mdb *mdb)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +      const u8 *addr = mdb->addr;
 +      u16 vid = mdb->vid;
 +      u8 port_mask = 0;
 +      int ret;
 +
 +      mutex_lock(&priv->reg_mutex);
 +
 +      mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
 +      if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
 +              port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
 +                          & PORT_MAP_MASK;
 +
 +      port_mask |= BIT(port);
 +      mt7530_fdb_write(priv, vid, port_mask, addr, -1, STATIC_ENT);
 +      ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
 +
 +      mutex_unlock(&priv->reg_mutex);
 +
 +      return ret;
 +}
 +
 +static int
 +mt7530_port_mdb_del(struct dsa_switch *ds, int port,
 +                  const struct switchdev_obj_port_mdb *mdb)
 +{
 +      struct mt7530_priv *priv = ds->priv;
 +      const u8 *addr = mdb->addr;
 +      u16 vid = mdb->vid;
 +      u8 port_mask = 0;
 +      int ret;
 +
 +      mutex_lock(&priv->reg_mutex);
 +
 +      mt7530_fdb_write(priv, vid, 0, addr, 0, STATIC_EMP);
 +      if (!mt7530_fdb_cmd(priv, MT7530_FDB_READ, NULL))
 +              port_mask = (mt7530_read(priv, MT7530_ATRD) >> PORT_MAP)
 +                          & PORT_MAP_MASK;
 +
 +      port_mask &= ~BIT(port);
 +      mt7530_fdb_write(priv, vid, port_mask, addr, -1,
 +                       port_mask ? STATIC_ENT : STATIC_EMP);
 +      ret = mt7530_fdb_cmd(priv, MT7530_FDB_WRITE, NULL);
 +
 +      mutex_unlock(&priv->reg_mutex);
 +
 +      return ret;
 +}
 +
  static int
  mt7530_vlan_cmd(struct mt7530_priv *priv, enum mt7530_vlan_cmd cmd, u16 vid)
  {
@@@ -1924,12 -1818,9 +1922,12 @@@ mt7530_setup(struct dsa_switch *ds
                        ret = mt753x_cpu_port_enable(ds, i);
                        if (ret)
                                return ret;
 -              } else
 +              } else {
                        mt7530_port_disable(ds, i);
  
 +                      /* Disable learning by default on all user ports */
 +                      mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
 +              }
                /* Enable consistent egress tag */
                mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
                           PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
@@@ -2091,13 -1982,9 +2089,13 @@@ mt7531_setup(struct dsa_switch *ds
                        ret = mt753x_cpu_port_enable(ds, i);
                        if (ret)
                                return ret;
 -              } else
 +              } else {
                        mt7530_port_disable(ds, i);
  
 +                      /* Disable learning by default on all user ports */
 +                      mt7530_set(priv, MT7530_PSC_P(i), SA_DIS);
 +              }
 +
                /* Enable consistent egress tag */
                mt7530_rmw(priv, MT7530_PVC_P(i), PVC_EG_TAG_MASK,
                           PVC_EG_TAG(MT7530_VLAN_EG_CONSISTENT));
@@@ -2819,16 -2706,11 +2817,16 @@@ static const struct dsa_switch_ops mt75
        .port_change_mtu        = mt7530_port_change_mtu,
        .port_max_mtu           = mt7530_port_max_mtu,
        .port_stp_state_set     = mt7530_stp_state_set,
 +      .port_pre_bridge_flags  = mt7530_port_pre_bridge_flags,
 +      .port_bridge_flags      = mt7530_port_bridge_flags,
 +      .port_set_mrouter       = mt7530_port_set_mrouter,
        .port_bridge_join       = mt7530_port_bridge_join,
        .port_bridge_leave      = mt7530_port_bridge_leave,
        .port_fdb_add           = mt7530_port_fdb_add,
        .port_fdb_del           = mt7530_port_fdb_del,
        .port_fdb_dump          = mt7530_port_fdb_dump,
 +      .port_mdb_add           = mt7530_port_mdb_add,
 +      .port_mdb_del           = mt7530_port_mdb_del,
        .port_vlan_filtering    = mt7530_port_vlan_filtering,
        .port_vlan_add          = mt7530_port_vlan_add,
        .port_vlan_del          = mt7530_port_vlan_del,
diff --combined drivers/net/ethernet/intel/e1000e/netdev.c
@@@ -25,7 -25,6 +25,7 @@@
  #include <linux/pm_runtime.h>
  #include <linux/aer.h>
  #include <linux/prefetch.h>
 +#include <linux/suspend.h>
  
  #include "e1000.h"
  
@@@ -5975,19 -5974,23 +5975,23 @@@ static void e1000_reset_task(struct wor
        struct e1000_adapter *adapter;
        adapter = container_of(work, struct e1000_adapter, reset_task);
  
+       rtnl_lock();
        /* don't run the task if already down */
-       if (test_bit(__E1000_DOWN, &adapter->state))
+       if (test_bit(__E1000_DOWN, &adapter->state)) {
+               rtnl_unlock();
                return;
+       }
  
        if (!(adapter->flags & FLAG_RESTART_NOW)) {
                e1000e_dump(adapter);
                e_err("Reset adapter unexpectedly\n");
        }
        e1000e_reinit_locked(adapter);
+       rtnl_unlock();
  }
  
  /**
 - * e1000_get_stats64 - Get System Network Statistics
 + * e1000e_get_stats64 - Get System Network Statistics
   * @netdev: network interface device structure
   * @stats: rtnl_link_stats64 pointer
   *
@@@ -6160,7 -6163,7 +6164,7 @@@ static int e1000_mii_ioctl(struct net_d
  }
  
  /**
 - * e1000e_hwtstamp_ioctl - control hardware time stamping
 + * e1000e_hwtstamp_set - control hardware time stamping
   * @netdev: network interface device structure
   * @ifr: interface request
   *
@@@ -6818,7 -6821,7 +6822,7 @@@ static void e1000e_disable_aspm(struct 
  }
  
  /**
 - * e1000e_disable_aspm_locked   Disable ASPM states.
 + * e1000e_disable_aspm_locked - Disable ASPM states.
   * @pdev: pointer to PCI device struct
   * @state: bit-mask of ASPM states to disable
   *
@@@ -6919,12 -6922,6 +6923,12 @@@ static int __e1000_resume(struct pci_de
        return 0;
  }
  
 +static __maybe_unused int e1000e_pm_prepare(struct device *dev)
 +{
 +      return pm_runtime_suspended(dev) &&
 +              pm_suspend_via_firmware();
 +}
 +
  static __maybe_unused int e1000e_pm_suspend(struct device *dev)
  {
        struct net_device *netdev = pci_get_drvdata(to_pci_dev(dev));
@@@ -7633,9 -7630,9 +7637,9 @@@ static int e1000_probe(struct pci_dev *
  
        e1000_print_device_info(adapter);
  
 -      dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_NO_DIRECT_COMPLETE);
 +      dev_pm_set_driver_flags(&pdev->dev, DPM_FLAG_SMART_PREPARE);
  
 -      if (pci_dev_run_wake(pdev) && hw->mac.type < e1000_pch_cnp)
 +      if (pci_dev_run_wake(pdev) && hw->mac.type != e1000_pch_cnp)
                pm_runtime_put_noidle(&pdev->dev);
  
        return 0;
@@@ -7858,7 -7855,6 +7862,7 @@@ MODULE_DEVICE_TABLE(pci, e1000_pci_tbl)
  
  static const struct dev_pm_ops e1000_pm_ops = {
  #ifdef CONFIG_PM_SLEEP
 +      .prepare        = e1000e_pm_prepare,
        .suspend        = e1000e_pm_suspend,
        .resume         = e1000e_pm_resume,
        .freeze         = e1000e_pm_freeze,
diff --combined drivers/net/ethernet/intel/i40e/i40e_main.c
@@@ -2023,7 -2023,7 +2023,7 @@@ static void i40e_undo_add_filter_entrie
  }
  
  /**
 - * i40e_next_entry - Get the next non-broadcast filter from a list
 + * i40e_next_filter - Get the next non-broadcast filter from a list
   * @next: pointer to filter in list
   *
   * Returns the next non-broadcast filter in the list. Required so that we
@@@ -3258,6 -3258,17 +3258,17 @@@ static int i40e_configure_tx_ring(struc
        return 0;
  }
  
+ /**
+  * i40e_rx_offset - Return expected offset into page to access data
+  * @rx_ring: Ring we are requesting offset of
+  *
+  * Returns the offset value for ring into the data buffer.
+  */
+ static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
+ {
+       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
+ }
  /**
   * i40e_configure_rx_ring - Configure a receive ring context
   * @ring: The Rx ring to configure
@@@ -3369,6 -3380,8 +3380,8 @@@ static int i40e_configure_rx_ring(struc
        else
                set_ring_build_skb_enabled(ring);
  
+       ring->rx_offset = i40e_rx_offset(ring);
        /* cache tail for quicker writes, and clear the reg before use */
        ring->tail = hw->hw_addr + I40E_QRX_TAIL(pf_q);
        writel(0, ring->tail);
@@@ -5191,7 -5204,7 +5204,7 @@@ static u8 i40e_pf_get_num_tc(struct i40
  }
  
  /**
 - * i40e_pf_get_pf_tc_map - Get bitmap for enabled traffic classes
 + * i40e_pf_get_tc_map - Get bitmap for enabled traffic classes
   * @pf: PF being queried
   *
   * Return a bitmap for enabled traffic classes for this PF.
@@@ -9454,7 -9467,7 +9467,7 @@@ static void i40e_fdir_flush_and_replay(
  }
  
  /**
 - * i40e_get_current_atr_count - Get the count of total FD ATR filters programmed
 + * i40e_get_current_atr_cnt - Get the count of total FD ATR filters programmed
   * @pf: board private structure
   **/
  u32 i40e_get_current_atr_cnt(struct i40e_pf *pf)
diff --combined drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@@ -1569,17 -1569,6 +1569,6 @@@ void i40e_free_rx_resources(struct i40e
        }
  }
  
- /**
-  * i40e_rx_offset - Return expected offset into page to access data
-  * @rx_ring: Ring we are requesting offset of
-  *
-  * Returns the offset value for ring into the data buffer.
-  */
- static unsigned int i40e_rx_offset(struct i40e_ring *rx_ring)
- {
-       return ring_uses_build_skb(rx_ring) ? I40E_SKB_PAD : 0;
- }
  /**
   * i40e_setup_rx_descriptors - Allocate Rx descriptors
   * @rx_ring: Rx descriptor ring (for a specific queue) to setup
@@@ -1608,7 -1597,6 +1597,6 @@@ int i40e_setup_rx_descriptors(struct i4
        rx_ring->next_to_alloc = 0;
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = i40e_rx_offset(rx_ring);
  
        /* XDP RX-queue info only needed for RX rings exposed to XDP */
        if (rx_ring->vsi->type == I40E_VSI_MAIN) {
@@@ -3345,7 -3333,7 +3333,7 @@@ static int i40e_tx_enable_csum(struct s
  }
  
  /**
  - * i40e_create_tx_ctx Build the Tx context descriptor
  + * i40e_create_tx_ctx - Build the Tx context descriptor
   * @tx_ring:  ring to create the descriptor on
   * @cd_type_cmd_tso_mss: Quad Word 1
   * @cd_tunneling: Quad Word 0 - bits 0-31
diff --combined drivers/net/ethernet/intel/ice/ice_txrx.c
@@@ -443,22 -443,6 +443,6 @@@ void ice_free_rx_ring(struct ice_ring *
        }
  }
  
- /**
-  * ice_rx_offset - Return expected offset into page to access data
-  * @rx_ring: Ring we are requesting offset of
-  *
-  * Returns the offset value for ring into the data buffer.
-  */
- static unsigned int ice_rx_offset(struct ice_ring *rx_ring)
- {
-       if (ice_ring_uses_build_skb(rx_ring))
-               return ICE_SKB_PAD;
-       else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
-               return XDP_PACKET_HEADROOM;
-       return 0;
- }
  /**
   * ice_setup_rx_ring - Allocate the Rx descriptors
   * @rx_ring: the Rx ring to set up
@@@ -493,7 -477,6 +477,6 @@@ int ice_setup_rx_ring(struct ice_ring *
  
        rx_ring->next_to_use = 0;
        rx_ring->next_to_clean = 0;
-       rx_ring->rx_offset = ice_rx_offset(rx_ring);
  
        if (ice_is_xdp_ena_vsi(rx_ring->vsi))
                WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
@@@ -1115,11 -1098,6 +1098,11 @@@ int ice_clean_rx_irq(struct ice_ring *r
                dma_rmb();
  
                if (rx_desc->wb.rxdid == FDIR_DESC_RXDID || !rx_ring->netdev) {
 +                      struct ice_vsi *ctrl_vsi = rx_ring->vsi;
 +
 +                      if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
 +                          ctrl_vsi->vf_id != ICE_INVAL_VFID)
 +                              ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
                        ice_put_rx_buf(rx_ring, NULL, 0);
                        cleaned_count++;
                        continue;
diff --combined drivers/net/ethernet/intel/ice/ice_xsk.c
@@@ -358,18 -358,18 +358,18 @@@ xsk_pool_if_up
   * This function allocates a number of Rx buffers from the fill ring
   * or the internal recycle mechanism and places them on the Rx ring.
   *
-  * Returns false if all allocations were successful, true if any fail.
+  * Returns true if all allocations were successful, false if any fail.
   */
  bool ice_alloc_rx_bufs_zc(struct ice_ring *rx_ring, u16 count)
  {
        union ice_32b_rx_flex_desc *rx_desc;
        u16 ntu = rx_ring->next_to_use;
        struct ice_rx_buf *rx_buf;
-       bool ret = false;
+       bool ok = true;
        dma_addr_t dma;
  
        if (!count)
-               return false;
+               return true;
  
        rx_desc = ICE_RX_DESC(rx_ring, ntu);
        rx_buf = &rx_ring->rx_buf[ntu];
        do {
                rx_buf->xdp = xsk_buff_alloc(rx_ring->xsk_pool);
                if (!rx_buf->xdp) {
-                       ret = true;
+                       ok = false;
                        break;
                }
  
                ice_release_rx_desc(rx_ring, ntu);
        }
  
-       return ret;
+       return ok;
  }
  
  /**
@@@ -473,14 -473,6 +473,14 @@@ ice_run_xdp_zc(struct ice_ring *rx_ring
        xdp_prog = READ_ONCE(rx_ring->xdp_prog);
  
        act = bpf_prog_run_xdp(xdp_prog, xdp);
 +
 +      if (likely(act == XDP_REDIRECT)) {
 +              err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 +              result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
 +              rcu_read_unlock();
 +              return result;
 +      }
 +
        switch (act) {
        case XDP_PASS:
                break;
                xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->q_index];
                result = ice_xmit_xdp_buff(xdp, xdp_ring);
                break;
 -      case XDP_REDIRECT:
 -              err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
 -              result = !err ? ICE_XDP_REDIR : ICE_XDP_CONSUMED;
 -              break;
        default:
                bpf_warn_invalid_xdp_action(act);
                fallthrough;
diff --combined drivers/net/ethernet/intel/igb/igb_main.c
@@@ -2037,7 -2037,7 +2037,7 @@@ static void igb_power_down_link(struct 
  }
  
  /**
 - * Detect and switch function for Media Auto Sense
 + * igb_check_swap_media -  Detect and switch function for Media Auto Sense
   * @adapter: address of the board private structure
   **/
  static void igb_check_swap_media(struct igb_adapter *adapter)
@@@ -3115,7 -3115,7 +3115,7 @@@ static s32 igb_init_i2c(struct igb_adap
                return 0;
  
        /* Initialize the i2c bus which is controlled by the registers.
 -       * This bus will use the i2c_algo_bit structue that implements
 +       * This bus will use the i2c_algo_bit structure that implements
         * the protocol through toggling of the 4 bits in the register.
         */
        adapter->i2c_adap.owner = THIS_MODULE;
@@@ -4020,7 -4020,7 +4020,7 @@@ static int igb_sw_init(struct igb_adapt
  }
  
  /**
 - *  igb_open - Called when a network interface is made active
 + *  __igb_open - Called when a network interface is made active
   *  @netdev: network interface device structure
   *  @resuming: indicates whether we are in a resume call
   *
@@@ -4138,7 -4138,7 +4138,7 @@@ int igb_open(struct net_device *netdev
  }
  
  /**
 - *  igb_close - Disables a network interface
 + *  __igb_close - Disables a network interface
   *  @netdev: network interface device structure
   *  @suspending: indicates we are in a suspend call
   *
@@@ -5856,7 -5856,7 +5856,7 @@@ static void igb_tx_ctxtdesc(struct igb_
         */
        if (tx_ring->launchtime_enable) {
                ts = ktime_to_timespec64(first->skb->tstamp);
 -              first->skb->tstamp = ktime_set(0, 0);
 +              skb_txtime_consumed(first->skb);
                context_desc->seqnum_seed = cpu_to_le32(ts.tv_nsec / 32);
        } else {
                context_desc->seqnum_seed = 0;
@@@ -8214,7 -8214,8 +8214,8 @@@ static void igb_reuse_rx_page(struct ig
        new_buff->pagecnt_bias  = old_buff->pagecnt_bias;
  }
  
- static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer)
+ static bool igb_can_reuse_rx_page(struct igb_rx_buffer *rx_buffer,
+                                 int rx_buf_pgcnt)
  {
        unsigned int pagecnt_bias = rx_buffer->pagecnt_bias;
        struct page *page = rx_buffer->page;
  
  #if (PAGE_SIZE < 8192)
        /* if we are only owner of page we can reuse it */
-       if (unlikely((page_ref_count(page) - pagecnt_bias) > 1))
+       if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
                return false;
  #else
  #define IGB_LAST_OFFSET \
@@@ -8301,9 -8302,10 +8302,10 @@@ static struct sk_buff *igb_construct_sk
                return NULL;
  
        if (unlikely(igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb);
-               xdp->data += IGB_TS_HDR_LEN;
-               size -= IGB_TS_HDR_LEN;
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, xdp->data, skb)) {
+                       xdp->data += IGB_TS_HDR_LEN;
+                       size -= IGB_TS_HDR_LEN;
+               }
        }
  
        /* Determine available headroom for copy */
@@@ -8364,8 -8366,8 +8366,8 @@@ static struct sk_buff *igb_build_skb(st
  
        /* pull timestamp out of packet data */
        if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
-               igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb);
-               __skb_pull(skb, IGB_TS_HDR_LEN);
+               if (!igb_ptp_rx_pktstamp(rx_ring->q_vector, skb->data, skb))
+                       __skb_pull(skb, IGB_TS_HDR_LEN);
        }
  
        /* update buffer offset */
@@@ -8614,11 -8616,17 +8616,17 @@@ static unsigned int igb_rx_offset(struc
  }
  
  static struct igb_rx_buffer *igb_get_rx_buffer(struct igb_ring *rx_ring,
-                                              const unsigned int size)
+                                              const unsigned int size, int *rx_buf_pgcnt)
  {
        struct igb_rx_buffer *rx_buffer;
  
        rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+       *rx_buf_pgcnt =
+ #if (PAGE_SIZE < 8192)
+               page_count(rx_buffer->page);
+ #else
+               0;
+ #endif
        prefetchw(rx_buffer->page);
  
        /* we are reusing so sync this buffer for CPU use */
  }
  
  static void igb_put_rx_buffer(struct igb_ring *rx_ring,
-                             struct igb_rx_buffer *rx_buffer)
+                             struct igb_rx_buffer *rx_buffer, int rx_buf_pgcnt)
  {
-       if (igb_can_reuse_rx_page(rx_buffer)) {
+       if (igb_can_reuse_rx_page(rx_buffer, rx_buf_pgcnt)) {
                /* hand second half of page back to the ring */
                igb_reuse_rx_page(rx_ring, rx_buffer);
        } else {
@@@ -8664,6 -8672,7 +8672,7 @@@ static int igb_clean_rx_irq(struct igb_
        unsigned int xdp_xmit = 0;
        struct xdp_buff xdp;
        u32 frame_sz = 0;
+       int rx_buf_pgcnt;
  
        /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
  #if (PAGE_SIZE < 8192)
                 */
                dma_rmb();
  
-               rx_buffer = igb_get_rx_buffer(rx_ring, size);
+               rx_buffer = igb_get_rx_buffer(rx_ring, size, &rx_buf_pgcnt);
  
                /* retrieve a buffer from the ring */
                if (!skb) {
                        break;
                }
  
-               igb_put_rx_buffer(rx_ring, rx_buffer);
+               igb_put_rx_buffer(rx_ring, rx_buffer, rx_buf_pgcnt);
                cleaned_count++;
  
                /* fetch next buffer in frame if non-eop */
diff --combined drivers/net/ethernet/intel/igb/igb_ptp.c
@@@ -856,6 -856,9 +856,9 @@@ static void igb_ptp_tx_hwtstamp(struct 
        dev_kfree_skb_any(skb);
  }
  
+ #define IGB_RET_PTP_DISABLED 1
+ #define IGB_RET_PTP_INVALID 2
  /**
   * igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp
   * @q_vector: Pointer to interrupt specific structure
   *
   * This function is meant to retrieve a timestamp from the first buffer of an
   * incoming frame.  The value is stored in little endian format starting on
-  * byte 8.
+  * byte 8
+  *
+  * Returns: 0 if success, nonzero if failure
   **/
- void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
-                        struct sk_buff *skb)
+ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+                       struct sk_buff *skb)
  {
-       __le64 *regval = (__le64 *)va;
        struct igb_adapter *adapter = q_vector->adapter;
+       __le64 *regval = (__le64 *)va;
        int adjust = 0;
  
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return IGB_RET_PTP_DISABLED;
        /* The timestamp is recorded in little endian format.
         * DWORD: 0        1        2        3
         * Field: Reserved Reserved SYSTIML  SYSTIMH
         */
+       /* check reserved dwords are zero, be/le doesn't matter for zero */
+       if (regval[0])
+               return IGB_RET_PTP_INVALID;
        igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb),
                                   le64_to_cpu(regval[1]));
  
        }
        skb_hwtstamps(skb)->hwtstamp =
                ktime_sub_ns(skb_hwtstamps(skb)->hwtstamp, adjust);
+       return 0;
  }
  
  /**
   * This function is meant to retrieve a timestamp from the internal registers
   * of the adapter and store it in the skb.
   **/
- void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
-                        struct sk_buff *skb)
+ void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector, struct sk_buff *skb)
  {
        struct igb_adapter *adapter = q_vector->adapter;
        struct e1000_hw *hw = &adapter->hw;
-       u64 regval;
        int adjust = 0;
+       u64 regval;
+       if (!(adapter->ptp_flags & IGB_PTP_ENABLED))
+               return;
  
        /* If this bit is set, then the RX registers contain the time stamp. No
         * other packet will be time stamped until we read these registers, so
@@@ -1008,7 -1025,6 +1025,7 @@@ static int igb_ptp_set_timestamp_mode(s
        switch (config->tx_type) {
        case HWTSTAMP_TX_OFF:
                tsync_tx_ctl = 0;
 +              break;
        case HWTSTAMP_TX_ON:
                break;
        default:
diff --combined drivers/net/ethernet/intel/igc/igc_main.c
@@@ -941,7 -941,7 +941,7 @@@ static void igc_tx_ctxtdesc(struct igc_
                struct igc_adapter *adapter = netdev_priv(tx_ring->netdev);
                ktime_t txtime = first->skb->tstamp;
  
 -              first->skb->tstamp = ktime_set(0, 0);
 +              skb_txtime_consumed(first->skb);
                context_desc->launch_time = igc_tx_launchtime(adapter,
                                                              txtime);
        } else {
@@@ -3580,7 -3580,7 +3580,7 @@@ void igc_up(struct igc_adapter *adapter
        netif_tx_start_all_queues(adapter->netdev);
  
        /* start the watchdog. */
 -      hw->mac.get_link_status = 1;
 +      hw->mac.get_link_status = true;
        schedule_work(&adapter->watchdog_task);
  }
  
@@@ -3831,10 -3831,19 +3831,19 @@@ static void igc_reset_task(struct work_
  
        adapter = container_of(work, struct igc_adapter, reset_task);
  
+       rtnl_lock();
+       /* If we're already down or resetting, just bail */
+       if (test_bit(__IGC_DOWN, &adapter->state) ||
+           test_bit(__IGC_RESETTING, &adapter->state)) {
+               rtnl_unlock();
+               return;
+       }
        igc_rings_dump(adapter);
        igc_regs_dump(adapter);
        netdev_err(adapter->netdev, "Reset adapter\n");
        igc_reinit_locked(adapter);
+       rtnl_unlock();
  }
  
  /**
@@@ -4000,7 -4009,7 +4009,7 @@@ static irqreturn_t igc_msix_other(int i
        }
  
        if (icr & IGC_ICR_LSC) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                /* guard against interrupt when we're going down */
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
@@@ -4378,7 -4387,7 +4387,7 @@@ static irqreturn_t igc_intr_msi(int irq
        }
  
        if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
        }
@@@ -4420,7 -4429,7 +4429,7 @@@ static irqreturn_t igc_intr(int irq, vo
        }
  
        if (icr & (IGC_ICR_RXSEQ | IGC_ICR_LSC)) {
 -              hw->mac.get_link_status = 1;
 +              hw->mac.get_link_status = true;
                /* guard against interrupt when we're going down */
                if (!test_bit(__IGC_DOWN, &adapter->state))
                        mod_timer(&adapter->watchdog_timer, jiffies + 1);
@@@ -4574,7 -4583,7 +4583,7 @@@ static int __igc_open(struct net_devic
        netif_tx_start_all_queues(netdev);
  
        /* start the watchdog. */
 -      hw->mac.get_link_status = 1;
 +      hw->mac.get_link_status = true;
        schedule_work(&adapter->watchdog_task);
  
        return IGC_SUCCESS;
@@@ -4915,7 -4924,7 +4924,7 @@@ int igc_set_spd_dplx(struct igc_adapte
  {
        struct igc_mac_info *mac = &adapter->hw.mac;
  
 -      mac->autoneg = 0;
 +      mac->autoneg = false;
  
        /* Make sure dplx is at most 1 bit and lsb of speed is not set
         * for the switch() below to work
                mac->forced_speed_duplex = ADVERTISE_100_FULL;
                break;
        case SPEED_1000 + DUPLEX_FULL:
 -              mac->autoneg = 1;
 +              mac->autoneg = true;
                adapter->hw.phy.autoneg_advertised = ADVERTISE_1000_FULL;
                break;
        case SPEED_1000 + DUPLEX_HALF: /* not supported */
                goto err_inval;
        case SPEED_2500 + DUPLEX_FULL:
 -              mac->autoneg = 1;
 +              mac->autoneg = true;
                adapter->hw.phy.autoneg_advertised = ADVERTISE_2500_FULL;
                break;
        case SPEED_2500 + DUPLEX_HALF: /* not supported */
diff --combined drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@@ -225,7 -225,7 +225,7 @@@ static s32 ixgbe_get_parent_bus_info(st
  }
  
  /**
 - * ixgbe_check_from_parent - Determine whether PCIe info should come from parent
 + * ixgbe_pcie_from_parent - Determine whether PCIe info should come from parent
   * @hw: hw specific details
   *
   * This function is used by probe to determine whether a device's PCI-Express
@@@ -4118,6 -4118,8 +4118,8 @@@ void ixgbe_configure_rx_ring(struct ixg
  #endif
        }
  
+       ring->rx_offset = ixgbe_rx_offset(ring);
        if (ring->xsk_pool && hw->mac.type != ixgbe_mac_82599EB) {
                u32 xsk_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
  
@@@ -6156,7 -6158,7 +6158,7 @@@ void ixgbe_down(struct ixgbe_adapter *a
  }
  
  /**
 - * ixgbe_eee_capable - helper function to determine EEE support on X550
 + * ixgbe_set_eee_capable - helper function to determine EEE support on X550
   * @adapter: board private structure
   */
  static void ixgbe_set_eee_capable(struct ixgbe_adapter *adapter)
@@@ -6578,7 -6580,6 +6580,6 @@@ int ixgbe_setup_rx_resources(struct ixg
  
        rx_ring->next_to_clean = 0;
        rx_ring->next_to_use = 0;
-       rx_ring->rx_offset = ixgbe_rx_offset(rx_ring);
  
        /* XDP RX-queue info */
        if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, adapter->netdev,
diff --combined drivers/net/ethernet/marvell/octeontx2/af/rvu.h
@@@ -548,12 -548,6 +548,12 @@@ static inline int is_afvf(u16 pcifunc
        return !(pcifunc & ~RVU_PFVF_FUNC_MASK);
  }
  
 +/* check if PF_FUNC is AF */
 +static inline bool is_pffunc_af(u16 pcifunc)
 +{
 +      return !pcifunc;
 +}
 +
  static inline bool is_rvu_fwdata_valid(struct rvu *rvu)
  {
        return (rvu->fwdata->header_magic == RVU_FWDATA_HEADER_MAGIC) &&
@@@ -646,8 -640,7 +646,8 @@@ int npc_config_ts_kpuaction(struct rvu 
  void rvu_npc_install_ucast_entry(struct rvu *rvu, u16 pcifunc,
                                 int nixlf, u64 chan, u8 *mac_addr);
  void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 -                                 int nixlf, u64 chan, bool allmulti);
 +                                 int nixlf, u64 chan, u8 chan_cnt,
 +                                 bool allmulti);
  void rvu_npc_disable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
  void rvu_npc_enable_promisc_entry(struct rvu *rvu, u16 pcifunc, int nixlf);
  void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
@@@ -672,6 -665,9 +672,6 @@@ int rvu_npc_get_tx_nibble_cfg(struct rv
  int npc_mcam_verify_channel(struct rvu *rvu, u16 pcifunc, u8 intf, u16 channel);
  int npc_flow_steering_init(struct rvu *rvu, int blkaddr);
  const char *npc_get_field_name(u8 hdr);
 -bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
 -                              u16 pcifunc, u8 intf, struct mcam_entry *entry,
 -                              int *entry_index);
  int npc_get_bank(struct npc_mcam *mcam, int index);
  void npc_mcam_enable_flows(struct rvu *rvu, u16 target);
  void npc_mcam_disable_flows(struct rvu *rvu, u16 target);
@@@ -682,12 -678,8 +682,13 @@@ void npc_read_mcam_entry(struct rvu *rv
                         u8 *intf, u8 *ena);
  bool is_mac_feature_supported(struct rvu *rvu, int pf, int feature);
  u32  rvu_cgx_get_fifolen(struct rvu *rvu);
+ void *rvu_first_cgx_pdata(struct rvu *rvu);
  
 +int npc_get_nixlf_mcam_index(struct npc_mcam *mcam, u16 pcifunc, int nixlf,
 +                           int type);
 +bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam, int blkaddr,
 +                         int index);
 +
  /* CPT APIs */
  int rvu_cpt_lf_teardown(struct rvu *rvu, u16 pcifunc, int lf, int slot);
  
@@@ -234,12 -234,14 +234,14 @@@ static ssize_t rvu_dbg_rsrc_attach_stat
                                          char __user *buffer,
                                          size_t count, loff_t *ppos)
  {
-       int index, off = 0, flag = 0, go_back = 0, off_prev;
+       int index, off = 0, flag = 0, go_back = 0, len = 0;
        struct rvu *rvu = filp->private_data;
        int lf, pf, vf, pcifunc;
        struct rvu_block block;
        int bytes_not_copied;
+       int lf_str_size = 12;
        int buf_size = 2048;
+       char *lfs;
        char *buf;
  
        /* don't allow partial reads */
        buf = kzalloc(buf_size, GFP_KERNEL);
        if (!buf)
                return -ENOSPC;
-       off +=  scnprintf(&buf[off], buf_size - 1 - off, "\npcifunc\t\t");
+       lfs = kzalloc(lf_str_size, GFP_KERNEL);
+       if (!lfs) {
+               kfree(buf);
+               return -ENOMEM;
+       }
+       off +=  scnprintf(&buf[off], buf_size - 1 - off, "%-*s", lf_str_size,
+                         "pcifunc");
        for (index = 0; index < BLK_COUNT; index++)
-               if (strlen(rvu->hw->block[index].name))
-                       off +=  scnprintf(&buf[off], buf_size - 1 - off,
-                                         "%*s\t", (index - 1) * 2,
-                                         rvu->hw->block[index].name);
+               if (strlen(rvu->hw->block[index].name)) {
+                       off += scnprintf(&buf[off], buf_size - 1 - off,
+                                        "%-*s", lf_str_size,
+                                        rvu->hw->block[index].name);
+               }
        off += scnprintf(&buf[off], buf_size - 1 - off, "\n");
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
                for (vf = 0; vf <= rvu->hw->total_vfs; vf++) {
                                continue;
  
                        if (vf) {
+                               sprintf(lfs, "PF%d:VF%d", pf, vf - 1);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d:VF%d\t\t", pf,
-                                                   vf - 1);
+                                                   "%-*s", lf_str_size, lfs);
                        } else {
+                               sprintf(lfs, "PF%d", pf);
                                go_back = scnprintf(&buf[off],
                                                    buf_size - 1 - off,
-                                                   "PF%d\t\t", pf);
+                                                   "%-*s", lf_str_size, lfs);
                        }
  
                        off += go_back;
                                block = rvu->hw->block[index];
                                if (!strlen(block.name))
                                        continue;
-                               off_prev = off;
+                               len = 0;
+                               lfs[len] = '\0';
                                for (lf = 0; lf < block.lf.max; lf++) {
                                        if (block.fn_map[lf] != pcifunc)
                                                continue;
                                        flag = 1;
-                                       off += scnprintf(&buf[off], buf_size - 1
-                                                       - off, "%3d,", lf);
+                                       len += sprintf(&lfs[len], "%d,", lf);
                                }
-                               if (flag && off_prev != off)
-                                       off--;
-                               else
-                                       go_back++;
+                               if (flag)
+                                       len--;
+                               lfs[len] = '\0';
                                off += scnprintf(&buf[off], buf_size - 1 - off,
-                                               "\t");
+                                                "%-*s", lf_str_size, lfs);
+                               if (!strlen(lfs))
+                                       go_back += lf_str_size;
                        }
                        if (!flag)
                                off -= go_back;
        }
  
        bytes_not_copied = copy_to_user(buffer, buf, off);
+       kfree(lfs);
        kfree(buf);
  
        if (bytes_not_copied)
@@@ -319,7 -333,6 +333,6 @@@ static int rvu_dbg_rvu_pf_cgx_map_displ
        struct rvu *rvu = filp->private;
        struct pci_dev *pdev = NULL;
        struct mac_ops *mac_ops;
-       int rvu_def_cgx_id = 0;
        char cgx[10], lmac[10];
        struct rvu_pfvf *pfvf;
        int pf, domain, blkid;
        u16 pcifunc;
  
        domain = 2;
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
+       /* There can be no CGX devices at all */
+       if (!mac_ops)
+               return 0;
        seq_printf(filp, "PCI dev\t\tRVU PF Func\tNIX block\t%s\tLMAC\n",
                   mac_ops->name);
        for (pf = 0; pf < rvu->hw->total_pfs; pf++) {
@@@ -1818,7 -1834,6 +1834,6 @@@ static void rvu_dbg_cgx_init(struct rv
  {
        struct mac_ops *mac_ops;
        unsigned long lmac_bmap;
-       int rvu_def_cgx_id = 0;
        int i, lmac_id;
        char dname[20];
        void *cgx;
        if (!cgx_get_cgxcnt_max())
                return;
  
-       mac_ops = get_mac_ops(rvu_cgx_pdata(rvu_def_cgx_id, rvu));
+       mac_ops = get_mac_ops(rvu_first_cgx_pdata(rvu));
        if (!mac_ops)
                return;
  
@@@ -2002,7 -2017,7 +2017,7 @@@ static void rvu_dbg_npc_mcam_show_flows
                        seq_printf(s, "mask 0x%x\n", ntohs(rule->mask.etype));
                        break;
                case NPC_OUTER_VID:
 -                      seq_printf(s, "%d ", ntohs(rule->packet.vlan_tci));
 +                      seq_printf(s, "0x%x ", ntohs(rule->packet.vlan_tci));
                        seq_printf(s, "mask 0x%x\n",
                                   ntohs(rule->mask.vlan_tci));
                        break;
@@@ -2145,7 -2160,7 +2160,7 @@@ static int rvu_dbg_npc_mcam_show_rules(
                seq_printf(s, "\tmcam entry: %d\n", iter->entry);
  
                rvu_dbg_npc_mcam_show_flows(s, iter);
 -              if (iter->intf == NIX_INTF_RX) {
 +              if (is_npc_intf_rx(iter->intf)) {
                        target = iter->rx_action.pf_func;
                        pf = (target >> RVU_PFVF_PF_SHIFT) & RVU_PFVF_PF_MASK;
                        seq_printf(s, "\tForward to: PF%d ", pf);
@@@ -273,8 -273,7 +273,8 @@@ static int nix_interface_init(struct rv
                pfvf->rx_chan_cnt = 1;
                pfvf->tx_chan_cnt = 1;
                rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
 -                                            pfvf->rx_chan_base, false);
 +                                            pfvf->rx_chan_base,
 +                                            pfvf->rx_chan_cnt, false);
                break;
        }
  
@@@ -2630,7 -2629,7 +2630,7 @@@ static int set_flowkey_fields(struct ni
        struct nix_rx_flowkey_alg *field;
        struct nix_rx_flowkey_alg tmp;
        u32 key_type, valid_key;
-       int l4_key_offset;
+       int l4_key_offset = 0;
  
        if (!alg)
                return -EINVAL;
@@@ -3089,8 -3088,7 +3089,8 @@@ int rvu_mbox_handler_nix_set_rx_mode(st
                rvu_npc_disable_promisc_entry(rvu, pcifunc, nixlf);
        else
                rvu_npc_install_promisc_entry(rvu, pcifunc, nixlf,
 -                                            pfvf->rx_chan_base, allmulti);
 +                                            pfvf->rx_chan_base,
 +                                            pfvf->rx_chan_cnt, allmulti);
        return 0;
  }
  
@@@ -3637,7 -3635,9 +3637,7 @@@ int rvu_mbox_handler_nix_lf_stop_rx(str
        if (err)
                return err;
  
 -      rvu_npc_disable_default_entries(rvu, pcifunc, nixlf);
 -
 -      npc_mcam_disable_flows(rvu, pcifunc);
 +      rvu_npc_disable_mcam_entries(rvu, pcifunc, nixlf);
  
        return rvu_cgx_start_stop_io(rvu, pcifunc, false);
  }
  #define RSVD_MCAM_ENTRIES_PER_PF      2 /* Bcast & Promisc */
  #define RSVD_MCAM_ENTRIES_PER_NIXLF   1 /* Ucast for LFs */
  
 -#define NIXLF_UCAST_ENTRY     0
 -#define NIXLF_BCAST_ENTRY     1
 -#define NIXLF_PROMISC_ENTRY   2
 -
  #define NPC_PARSE_RESULT_DMAC_OFFSET  8
  #define NPC_HW_TSTAMP_OFFSET          8
  #define NPC_KEX_CHAN_MASK             0xFFFULL
@@@ -92,10 -96,6 +92,10 @@@ int npc_mcam_verify_channel(struct rvu 
        if (is_npc_intf_tx(intf))
                return 0;
  
 +      /* return early for AF-installed rules */
 +      if (is_pffunc_af(pcifunc))
 +              return 0;
 +
        if (is_afvf(pcifunc)) {
                end = rvu_get_num_lbk_chans();
                if (end < 0)
@@@ -196,8 -196,8 +196,8 @@@ static int npc_get_ucast_mcam_index(str
        return mcam->nixlf_offset + (max + nixlf) * RSVD_MCAM_ENTRIES_PER_NIXLF;
  }
  
 -static int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
 -                                  u16 pcifunc, int nixlf, int type)
 +int npc_get_nixlf_mcam_index(struct npc_mcam *mcam,
 +                           u16 pcifunc, int nixlf, int type)
  {
        int pf = rvu_get_pf(pcifunc);
        int index;
@@@ -230,8 -230,8 +230,8 @@@ int npc_get_bank(struct npc_mcam *mcam
        return bank;
  }
  
 -static bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
 -                                int blkaddr, int index)
 +bool is_mcam_entry_enabled(struct rvu *rvu, struct npc_mcam *mcam,
 +                         int blkaddr, int index)
  {
        int bank = npc_get_bank(mcam, index);
        u64 cfg;
@@@ -647,17 -647,13 +647,17 @@@ void rvu_npc_install_ucast_entry(struc
  }
  
  void rvu_npc_install_promisc_entry(struct rvu *rvu, u16 pcifunc,
 -                                 int nixlf, u64 chan, bool allmulti)
 +                                 int nixlf, u64 chan, u8 chan_cnt,
 +                                 bool allmulti)
  {
        struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 +      struct npc_install_flow_req req = { 0 };
 +      struct npc_install_flow_rsp rsp = { 0 };
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      int blkaddr, ucast_idx, index, kwi;
 -      struct mcam_entry entry = { {0} };
 -      struct nix_rx_action action = { };
 +      int blkaddr, ucast_idx, index;
 +      u8 mac_addr[ETH_ALEN] = { 0 };
 +      struct nix_rx_action action;
 +      u64 relaxed_mask;
  
        /* Only PF or AF VF can add a promiscuous entry */
        if ((pcifunc & RVU_PFVF_FUNC_MASK) && !is_afvf(pcifunc))
        if (blkaddr < 0)
                return;
  
 +      *(u64 *)&action = 0x00;
        index = npc_get_nixlf_mcam_index(mcam, pcifunc,
                                         nixlf, NIXLF_PROMISC_ENTRY);
  
 -      entry.kw[0] = chan;
 -      entry.kw_mask[0] = 0xFFFULL;
 -
 -      if (allmulti) {
 -              kwi = NPC_KEXOF_DMAC / sizeof(u64);
 -              entry.kw[kwi] = BIT_ULL(40); /* LSB bit of 1st byte in DMAC */
 -              entry.kw_mask[kwi] = BIT_ULL(40);
 -      }
 -
 -      ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
 -                                           nixlf, NIXLF_UCAST_ENTRY);
 -
        /* If the corresponding PF's ucast action is RSS,
         * use the same action for promisc also
         */
 +      ucast_idx = npc_get_nixlf_mcam_index(mcam, pcifunc,
 +                                           nixlf, NIXLF_UCAST_ENTRY);
        if (is_mcam_entry_enabled(rvu, mcam, blkaddr, ucast_idx))
                *(u64 *)&action = npc_get_mcam_action(rvu, mcam,
                                                        blkaddr, ucast_idx);
                action.pf_func = pcifunc;
        }
  
 -      entry.action = *(u64 *)&action;
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, index,
 -                            pfvf->nix_rx_intf, &entry, true);
 +      if (allmulti) {
 +              mac_addr[0] = 0x01;     /* LSB bit of 1st byte in DMAC */
 +              ether_addr_copy(req.packet.dmac, mac_addr);
 +              ether_addr_copy(req.mask.dmac, mac_addr);
 +              req.features = BIT_ULL(NPC_DMAC);
 +      }
 +
 +      req.chan_mask = 0xFFFU;
 +      if (chan_cnt > 1) {
 +              if (!is_power_of_2(chan_cnt)) {
 +                      dev_err(rvu->dev,
 +                              "%s: channel count more than 1, must be power of 2\n", __func__);
 +                      return;
 +              }
 +              relaxed_mask = GENMASK_ULL(BITS_PER_LONG_LONG - 1,
 +                                         ilog2(chan_cnt));
 +              req.chan_mask &= relaxed_mask;
 +      }
 +
 +      req.channel = chan;
 +      req.intf = pfvf->nix_rx_intf;
 +      req.entry = index;
 +      req.op = action.op;
 +      req.hdr.pcifunc = 0; /* AF is requester */
 +      req.vf = pcifunc;
 +      req.index = action.index;
 +      req.match_id = action.match_id;
 +      req.flow_key_alg = action.flow_key_alg;
 +
 +      rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
  }
  
  static void npc_enadis_promisc_entry(struct rvu *rvu, u16 pcifunc,
@@@ -750,14 -728,12 +750,14 @@@ void rvu_npc_enable_promisc_entry(struc
  void rvu_npc_install_bcast_match_entry(struct rvu *rvu, u16 pcifunc,
                                       int nixlf, u64 chan)
  {
 +      struct rvu_pfvf *pfvf;
 +      struct npc_install_flow_req req = { 0 };
 +      struct npc_install_flow_rsp rsp = { 0 };
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      struct mcam_entry entry = { {0} };
        struct rvu_hwinfo *hw = rvu->hw;
 -      struct nix_rx_action action;
 -      struct rvu_pfvf *pfvf;
        int blkaddr, index;
 +      u32 req_index = 0;
 +      u8 op;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
        if (blkaddr < 0)
        index = npc_get_nixlf_mcam_index(mcam, pcifunc,
                                         nixlf, NIXLF_BCAST_ENTRY);
  
 -      /* Match ingress channel */
 -      entry.kw[0] = chan;
 -      entry.kw_mask[0] = 0xfffull;
 -
 -      /* Match broadcast MAC address.
 -       * DMAC is extracted at 0th bit of PARSE_KEX::KW1
 -       */
 -      entry.kw[1] = 0xffffffffffffull;
 -      entry.kw_mask[1] = 0xffffffffffffull;
 -
 -      *(u64 *)&action = 0x00;
        if (!hw->cap.nix_rx_multicast) {
                /* Early silicon doesn't support pkt replication,
                 * so install entry with UCAST action, so that PF
                 * receives all broadcast packets.
                 */
 -              action.op = NIX_RX_ACTIONOP_UCAST;
 -              action.pf_func = pcifunc;
 +              op = NIX_RX_ACTIONOP_UCAST;
        } else {
 -              action.index = pfvf->bcast_mce_idx;
 -              action.op = NIX_RX_ACTIONOP_MCAST;
 +              op = NIX_RX_ACTIONOP_MCAST;
 +              req_index = pfvf->bcast_mce_idx;
        }
  
 -      entry.action = *(u64 *)&action;
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, index,
 -                            pfvf->nix_rx_intf, &entry, true);
 +      eth_broadcast_addr((u8 *)&req.packet.dmac);
 +      eth_broadcast_addr((u8 *)&req.mask.dmac);
 +      req.features = BIT_ULL(NPC_DMAC);
 +      req.channel = chan;
 +      req.intf = pfvf->nix_rx_intf;
 +      req.entry = index;
 +      req.op = op;
 +      req.hdr.pcifunc = 0; /* AF is requester */
 +      req.vf = pcifunc;
 +      req.index = req_index;
 +
 +      rvu_mbox_handler_npc_install_flow(rvu, &req, &rsp);
  }
  
  void rvu_npc_enable_bcast_entry(struct rvu *rvu, u16 pcifunc, bool enable)
@@@ -988,7 -967,7 +988,7 @@@ void rvu_npc_disable_mcam_entries(struc
  {
        struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
        struct npc_mcam *mcam = &rvu->hw->mcam;
 -      struct rvu_npc_mcam_rule *rule;
 +      struct rvu_npc_mcam_rule *rule, *tmp;
        int blkaddr;
  
        blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
        mutex_lock(&mcam->lock);
  
        /* Disable MCAM entries directing traffic to this 'pcifunc' */
 -      list_for_each_entry(rule, &mcam->mcam_rules, list) {
 +      list_for_each_entry_safe(rule, tmp, &mcam->mcam_rules, list) {
                if (is_npc_intf_rx(rule->intf) &&
                    rule->rx_action.pf_func == pcifunc) {
                        npc_enable_mcam_entry(rvu, mcam, blkaddr,
                                              rule->entry, false);
                        rule->enable = false;
                        /* Indicate that default rule is disabled */
 -                      if (rule->default_rule)
 +                      if (rule->default_rule) {
                                pfvf->def_ucast_rule = NULL;
 +                              list_del(&rule->list);
 +                              kfree(rule);
 +                      }
                }
        }
  
@@@ -1698,9 -1674,6 +1698,9 @@@ void rvu_npc_get_mcam_counter_alloc_inf
  static int npc_mcam_verify_entry(struct npc_mcam *mcam,
                                 u16 pcifunc, int entry)
  {
 +      /* verify AF installed entries */
 +      if (is_pffunc_af(pcifunc))
 +              return 0;
        /* Verify if entry is valid and if it is indeed
         * allocated to the requesting PFFUNC.
         */
@@@ -2295,10 -2268,6 +2295,10 @@@ int rvu_mbox_handler_npc_mcam_write_ent
                goto exit;
        }
  
 +      /* For AF installed rules, the nix_intf should be set to target NIX */
 +      if (is_pffunc_af(req->hdr.pcifunc))
 +              nix_intf = req->intf;
 +
        npc_config_mcam_entry(rvu, mcam, blkaddr, req->entry, nix_intf,
                              &req->entry_data, req->enable_entry);
  
@@@ -2521,10 -2490,10 +2521,10 @@@ int rvu_mbox_handler_npc_mcam_free_coun
                index = find_next_bit(mcam->bmap, mcam->bmap_entries, entry);
                if (index >= mcam->bmap_entries)
                        break;
+               entry = index + 1;
                if (mcam->entry2cntr_map[index] != req->cntr)
                        continue;
  
-               entry = index + 1;
                npc_unmap_mcam_entry_and_cntr(rvu, mcam, blkaddr,
                                              index, req->cntr);
        }
@@@ -2761,6 -2730,30 +2761,6 @@@ int rvu_mbox_handler_npc_get_kex_cfg(st
        return 0;
  }
  
 -bool rvu_npc_write_default_rule(struct rvu *rvu, int blkaddr, int nixlf,
 -                              u16 pcifunc, u8 intf, struct mcam_entry *entry,
 -                              int *index)
 -{
 -      struct rvu_pfvf *pfvf = rvu_get_pfvf(rvu, pcifunc);
 -      struct npc_mcam *mcam = &rvu->hw->mcam;
 -      bool enable;
 -      u8 nix_intf;
 -
 -      if (is_npc_intf_tx(intf))
 -              nix_intf = pfvf->nix_tx_intf;
 -      else
 -              nix_intf = pfvf->nix_rx_intf;
 -
 -      *index = npc_get_nixlf_mcam_index(mcam, pcifunc,
 -                                        nixlf, NIXLF_UCAST_ENTRY);
 -      /* dont force enable unicast entry  */
 -      enable = is_mcam_entry_enabled(rvu, mcam, blkaddr, *index);
 -      npc_config_mcam_entry(rvu, mcam, blkaddr, *index, nix_intf,
 -                            entry, enable);
 -
 -      return enable;
 -}
 -
  int rvu_mbox_handler_npc_read_base_steer_rule(struct rvu *rvu,
                                              struct msg_req *req,
                                              struct npc_mcam_read_base_rule_rsp *rsp)
@@@ -2806,42 -2799,3 +2806,42 @@@ read_entry
  out:
        return rc;
  }
 +
 +int rvu_mbox_handler_npc_mcam_entry_stats(struct rvu *rvu,
 +                                        struct npc_mcam_get_stats_req *req,
 +                                        struct npc_mcam_get_stats_rsp *rsp)
 +{
 +      struct npc_mcam *mcam = &rvu->hw->mcam;
 +      u16 index, cntr;
 +      int blkaddr;
 +      u64 regval;
 +      u32 bank;
 +
 +      blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NPC, 0);
 +      if (blkaddr < 0)
 +              return NPC_MCAM_INVALID_REQ;
 +
 +      mutex_lock(&mcam->lock);
 +
 +      index = req->entry & (mcam->banksize - 1);
 +      bank = npc_get_bank(mcam, req->entry);
 +
 +      /* read MCAM entry STAT_ACT register */
 +      regval = rvu_read64(rvu, blkaddr, NPC_AF_MCAMEX_BANKX_STAT_ACT(index, bank));
 +
 +      if (!(regval & BIT_ULL(9))) {
 +              rsp->stat_ena = 0;
 +              mutex_unlock(&mcam->lock);
 +              return 0;
 +      }
 +
 +      cntr = regval & 0x1FF;
 +
 +      rsp->stat_ena = 1;
 +      rsp->stat = rvu_read64(rvu, blkaddr, NPC_AF_MATCH_STATX(cntr));
 +      rsp->stat &= BIT_ULL(48) - 1;
 +
 +      mutex_unlock(&mcam->lock);
 +
 +      return 0;
 +}
@@@ -57,13 -57,10 +57,13 @@@ int otx2_alloc_mcam_entries(struct otx2
                flow_cfg->ntuple_max_flows = rsp->count;
                flow_cfg->ntuple_offset = 0;
                pfvf->flags |= OTX2_FLAG_NTUPLE_SUPPORT;
 +              flow_cfg->tc_max_flows = flow_cfg->ntuple_max_flows;
 +              pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
        } else {
                flow_cfg->vf_vlan_offset = 0;
                flow_cfg->ntuple_offset = flow_cfg->vf_vlan_offset +
                                                vf_vlan_max_flows;
 +              flow_cfg->tc_flower_offset = flow_cfg->ntuple_offset;
                flow_cfg->unicast_offset = flow_cfg->ntuple_offset +
                                                OTX2_MAX_NTUPLE_FLOWS;
                flow_cfg->rx_vlan_offset = flow_cfg->unicast_offset +
@@@ -72,7 -69,6 +72,7 @@@
                pfvf->flags |= OTX2_FLAG_UCAST_FLTR_SUPPORT;
                pfvf->flags |= OTX2_FLAG_RX_VLAN_SUPPORT;
                pfvf->flags |= OTX2_FLAG_VF_VLAN_SUPPORT;
 +              pfvf->flags |= OTX2_FLAG_TC_FLOWER_SUPPORT;
        }
  
        for (i = 0; i < rsp->count; i++)
@@@ -97,7 -93,6 +97,7 @@@ int otx2_mcam_flow_init(struct otx2_ni
        INIT_LIST_HEAD(&pf->flow_cfg->flow_list);
  
        pf->flow_cfg->ntuple_max_flows = OTX2_MAX_NTUPLE_FLOWS;
 +      pf->flow_cfg->tc_max_flows = pf->flow_cfg->ntuple_max_flows;
  
        err = otx2_alloc_mcam_entries(pf);
        if (err)
@@@ -262,17 -257,19 +262,19 @@@ int otx2_get_flow(struct otx2_nic *pfvf
  int otx2_get_all_flows(struct otx2_nic *pfvf, struct ethtool_rxnfc *nfc,
                       u32 *rule_locs)
  {
+       u32 rule_cnt = nfc->rule_cnt;
        u32 location = 0;
        int idx = 0;
        int err = 0;
  
        nfc->data = pfvf->flow_cfg->ntuple_max_flows;
-       while ((!err || err == -ENOENT) && idx < nfc->rule_cnt) {
+       while ((!err || err == -ENOENT) && idx < rule_cnt) {
                err = otx2_get_flow(pfvf, nfc, location);
                if (!err)
                        rule_locs[idx++] = location;
                location++;
        }
+       nfc->rule_cnt = rule_cnt;
  
        return err;
  }
@@@ -306,35 -303,6 +308,35 @@@ static int otx2_prepare_ipv4_flow(struc
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ipv4_usr_mask->tos) {
 +                      pkt->tos = ipv4_usr_hdr->tos;
 +                      pmask->tos = ipv4_usr_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
 +              if (ipv4_usr_mask->proto) {
 +                      switch (ipv4_usr_hdr->proto) {
 +                      case IPPROTO_ICMP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_ICMP);
 +                              break;
 +                      case IPPROTO_TCP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_TCP);
 +                              break;
 +                      case IPPROTO_UDP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_UDP);
 +                              break;
 +                      case IPPROTO_SCTP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_SCTP);
 +                              break;
 +                      case IPPROTO_AH:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_AH);
 +                              break;
 +                      case IPPROTO_ESP:
 +                              req->features |= BIT_ULL(NPC_IPPROTO_ESP);
 +                              break;
 +                      default:
 +                              return -EOPNOTSUPP;
 +                      }
 +              }
                pkt->etype = cpu_to_be16(ETH_P_IP);
                pmask->etype = cpu_to_be16(0xFFFF);
                req->features |= BIT_ULL(NPC_ETYPE);
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ipv4_l4_mask->tos) {
 +                      pkt->tos = ipv4_l4_hdr->tos;
 +                      pmask->tos = ipv4_l4_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
                if (ipv4_l4_mask->psrc) {
                        memcpy(&pkt->sport, &ipv4_l4_hdr->psrc,
                               sizeof(pkt->sport));
                               sizeof(pmask->ip4dst));
                        req->features |= BIT_ULL(NPC_DIP_IPV4);
                }
 +              if (ah_esp_mask->tos) {
 +                      pkt->tos = ah_esp_hdr->tos;
 +                      pmask->tos = ah_esp_mask->tos;
 +                      req->features |= BIT_ULL(NPC_TOS);
 +              }
  
                /* NPC profile doesn't extract AH/ESP header fields */
 -              if ((ah_esp_mask->spi & ah_esp_hdr->spi) ||
 -                  (ah_esp_mask->tos & ah_esp_mask->tos))
 +              if (ah_esp_mask->spi & ah_esp_hdr->spi)
                        return -EOPNOTSUPP;
  
                if (flow_type == AH_V4_FLOW)
@@@ -1672,6 -1672,7 +1672,7 @@@ int otx2_stop(struct net_device *netdev
        struct otx2_nic *pf = netdev_priv(netdev);
        struct otx2_cq_poll *cq_poll = NULL;
        struct otx2_qset *qset = &pf->qset;
+       struct otx2_rss_info *rss;
        int qidx, vec, wrk;
  
        netif_carrier_off(netdev);
        /* First stop packet Rx/Tx */
        otx2_rxtx_enable(pf, false);
  
+       /* Clear RSS enable flag */
+       rss = &pf->hw.rss_info;
+       rss->enable = false;
        /* Cleanup Queue IRQ */
        vec = pci_irq_vector(pf->pdev,
                             pf->hw.nix_msixoff + NIX_LF_QINT_VEC_START);
@@@ -1760,24 -1765,6 +1765,24 @@@ static netdev_tx_t otx2_xmit(struct sk_
        return NETDEV_TX_OK;
  }
  
 +static netdev_features_t otx2_fix_features(struct net_device *dev,
 +                                         netdev_features_t features)
 +{
 +      /* check if n-tuple filters are ON */
 +      if ((features & NETIF_F_HW_TC) && (dev->features & NETIF_F_NTUPLE)) {
 +              netdev_info(dev, "Disabling n-tuple filters\n");
 +              features &= ~NETIF_F_NTUPLE;
 +      }
 +
 +      /* check if tc hw offload is ON */
 +      if ((features & NETIF_F_NTUPLE) && (dev->features & NETIF_F_HW_TC)) {
 +              netdev_info(dev, "Disabling TC hardware offload\n");
 +              features &= ~NETIF_F_HW_TC;
 +      }
 +
 +      return features;
 +}
 +
  static void otx2_set_rx_mode(struct net_device *netdev)
  {
        struct otx2_nic *pf = netdev_priv(netdev);
@@@ -1840,12 -1827,6 +1845,12 @@@ static int otx2_set_features(struct net
        if ((changed & NETIF_F_NTUPLE) && !ntuple)
                otx2_destroy_ntuple_flows(pf);
  
 +      if ((netdev->features & NETIF_F_HW_TC) > (features & NETIF_F_HW_TC) &&
 +          pf->tc_info.num_entries) {
 +              netdev_err(netdev, "Can't disable TC hardware offload while flows are active\n");
 +              return -EBUSY;
 +      }
 +
        return 0;
  }
  
@@@ -2244,7 -2225,6 +2249,7 @@@ static const struct net_device_ops otx2
        .ndo_open               = otx2_open,
        .ndo_stop               = otx2_stop,
        .ndo_start_xmit         = otx2_xmit,
 +      .ndo_fix_features       = otx2_fix_features,
        .ndo_set_mac_address    = otx2_set_mac_address,
        .ndo_change_mtu         = otx2_change_mtu,
        .ndo_set_rx_mode        = otx2_set_rx_mode,
        .ndo_set_vf_mac         = otx2_set_vf_mac,
        .ndo_set_vf_vlan        = otx2_set_vf_vlan,
        .ndo_get_vf_config      = otx2_get_vf_config,
 +      .ndo_setup_tc           = otx2_setup_tc,
  };
  
  static int otx2_wq_init(struct otx2_nic *pf)
@@@ -2475,10 -2454,6 +2480,10 @@@ static int otx2_probe(struct pci_dev *p
                                       NETIF_F_HW_VLAN_STAG_RX;
        netdev->features |= netdev->hw_features;
  
 +      /* HW supports tc offload, but it is mutually exclusive with n-tuple filters */
 +      if (pf->flags & OTX2_FLAG_TC_FLOWER_SUPPORT)
 +              netdev->hw_features |= NETIF_F_HW_TC;
 +
        netdev->gso_max_segs = OTX2_MAX_GSO_SEGS;
        netdev->watchdog_timeo = OTX2_TX_TIMEOUT;
  
  
        otx2_set_ethtool_ops(netdev);
  
 +      err = otx2_init_tc(pf);
 +      if (err)
 +              goto err_mcam_flow_del;
 +
        /* Enable link notifications */
        otx2_cgx_config_linkevents(pf, true);
  
  
        return 0;
  
 +err_mcam_flow_del:
 +      otx2_mcam_flow_del(pf);
  err_unreg_netdev:
        unregister_netdev(netdev);
  err_del_mcam_entries:
@@@ -2682,7 -2651,6 +2687,7 @@@ static void otx2_remove(struct pci_dev 
  
        otx2_ptp_destroy(pf);
        otx2_mcam_flow_del(pf);
 +      otx2_shutdown_tc(pf);
        otx2_detach_resources(&pf->mbox);
        if (pf->hw.lmt_base)
                iounmap(pf->hw.lmt_base);
@@@ -92,14 -92,15 +92,15 @@@ struct page_pool
                                    MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT : 0)
  #define MLX5_MPWRQ_PAGES_PER_WQE              BIT(MLX5_MPWRQ_WQE_PAGE_ORDER)
  
- #define MLX5_MTT_OCTW(npages) (ALIGN(npages, 8) / 2)
+ #define MLX5_ALIGN_MTTS(mtts)         (ALIGN(mtts, 8))
+ #define MLX5_ALIGNED_MTTS_OCTW(mtts)  ((mtts) / 2)
+ #define MLX5_MTT_OCTW(mtts)           (MLX5_ALIGNED_MTTS_OCTW(MLX5_ALIGN_MTTS(mtts)))
  /* Add another page to MLX5E_REQUIRED_WQE_MTTS as a buffer between
   * WQEs. This page will absorb write overflow by the hardware when
   * receiving packets larger than MTU. These oversize packets are
   * dropped by the driver at a later stage.
   */
- #define MLX5E_REQUIRED_WQE_MTTS               (ALIGN(MLX5_MPWRQ_PAGES_PER_WQE + 1, 8))
- #define MLX5E_LOG_ALIGNED_MPWQE_PPW   (ilog2(MLX5E_REQUIRED_WQE_MTTS))
+ #define MLX5E_REQUIRED_WQE_MTTS               (MLX5_ALIGN_MTTS(MLX5_MPWRQ_PAGES_PER_WQE + 1))
  #define MLX5E_REQUIRED_MTTS(wqes)     (wqes * MLX5E_REQUIRED_WQE_MTTS)
  #define MLX5E_MAX_RQ_NUM_MTTS \
        ((1 << 16) * 2) /* So that MLX5_MTT_OCTW(num_mtts) fits into u16 */
@@@ -880,6 -881,7 +881,6 @@@ struct mlx5e_priv 
  #endif
        struct devlink_health_reporter *tx_reporter;
        struct devlink_health_reporter *rx_reporter;
 -      struct devlink_port            dl_port;
        struct mlx5e_xsk           xsk;
  #if IS_ENABLED(CONFIG_PCI_HYPERV_INTERFACE)
        struct mlx5e_hv_vhca_stats_agent stats_agent;
@@@ -1173,7 -1175,6 +1174,7 @@@ void mlx5e_detach_netdev(struct mlx5e_p
  void mlx5e_destroy_netdev(struct mlx5e_priv *priv);
  int mlx5e_netdev_change_profile(struct mlx5e_priv *priv,
                                const struct mlx5e_profile *new_profile, void *new_ppriv);
 +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv);
  void mlx5e_set_netdev_mtu_boundaries(struct mlx5e_priv *priv);
  void mlx5e_build_nic_params(struct mlx5e_priv *priv, struct mlx5e_xsk *xsk, u16 mtu);
  void mlx5e_build_rq_params(struct mlx5_core_dev *mdev,
@@@ -695,7 -695,7 +695,7 @@@ mlx5_tc_ct_entry_add_rule(struct mlx5_t
  
        zone_rule->nat = nat;
  
 -      spec = kzalloc(sizeof(*spec), GFP_KERNEL);
 +      spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec)
                return -ENOMEM;
  
  
        zone_rule->attr = attr;
  
 -      kfree(spec);
 +      kvfree(spec);
        ct_dbg("Offloaded ct entry rule in zone %d", entry->tuple.zone);
  
        return 0;
@@@ -749,7 -749,7 +749,7 @@@ err_rule
  err_mod_hdr:
        kfree(attr);
  err_attr:
 -      kfree(spec);
 +      kvfree(spec);
        return err;
  }
  
@@@ -1181,7 -1181,8 +1181,8 @@@ int mlx5_tc_ct_add_no_trk_match(struct 
  
        mlx5e_tc_match_to_reg_get_match(spec, CTSTATE_TO_REG,
                                        &ctstate, &ctstate_mask);
-       if (ctstate_mask)
+       if ((ctstate & ctstate_mask) == MLX5_CT_STATE_TRK_BIT)
                return -EOPNOTSUPP;
  
        ctstate_mask |= MLX5_CT_STATE_TRK_BIT;
@@@ -1539,14 -1540,6 +1540,14 @@@ mlx5_tc_ct_free_pre_ct_tables(struct ml
        mlx5_tc_ct_free_pre_ct(ft, &ft->pre_ct);
  }
  
 +/* To avoid a false lock dependency warning, give the ct_entries_ht lock a
 + * class different from the lock class of the ht used when deleting the last
 + * flow from a group and then deleting the group: that path reaches
 + * del_sw_flow_group(), which calls rhashtable_destroy() on fg->ftes_hash and
 + * takes its ht->mutex, which is different from the ht->mutex here.
 + */
 +static struct lock_class_key ct_entries_ht_lock_key;
 +
  static struct mlx5_ct_ft *
  mlx5_tc_ct_add_ft_cb(struct mlx5_tc_ct_priv *ct_priv, u16 zone,
                     struct nf_flowtable *nf_ft)
        if (err)
                goto err_init;
  
 +      lockdep_set_class(&ft->ct_entries_ht.mutex, &ct_entries_ht_lock_key);
 +
        err = rhashtable_insert_fast(&ct_priv->zone_ht, &ft->node,
                                     zone_params);
        if (err)
@@@ -1684,10 -1675,10 +1685,10 @@@ __mlx5_tc_ct_flow_offload(struct mlx5_t
        struct mlx5_ct_ft *ft;
        u32 fte_id = 1;
  
 -      post_ct_spec = kzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
 +      post_ct_spec = kvzalloc(sizeof(*post_ct_spec), GFP_KERNEL);
        ct_flow = kzalloc(sizeof(*ct_flow), GFP_KERNEL);
        if (!post_ct_spec || !ct_flow) {
 -              kfree(post_ct_spec);
 +              kvfree(post_ct_spec);
                kfree(ct_flow);
                return ERR_PTR(-ENOMEM);
        }
        ct_flow->post_ct_attr->prio = 0;
        ct_flow->post_ct_attr->ft = ct_priv->post_ct;
  
 +      /* Splits were handled before CT */
 +      if (ct_priv->ns_type == MLX5_FLOW_NAMESPACE_FDB)
 +              ct_flow->post_ct_attr->esw_attr->split_count = 0;
 +
        ct_flow->post_ct_attr->inner_match_level = MLX5_MATCH_NONE;
        ct_flow->post_ct_attr->outer_match_level = MLX5_MATCH_NONE;
        ct_flow->post_ct_attr->action &= ~(MLX5_FLOW_CONTEXT_ACTION_DECAP);
  
        attr->ct_attr.ct_flow = ct_flow;
        dealloc_mod_hdr_actions(&pre_mod_acts);
 -      kfree(post_ct_spec);
 +      kvfree(post_ct_spec);
  
        return rule;
  
@@@ -1847,7 -1834,7 +1848,7 @@@ err_alloc_pre
  err_idr:
        mlx5_tc_ct_del_ft_cb(ct_priv, ft);
  err_ft:
 -      kfree(post_ct_spec);
 +      kvfree(post_ct_spec);
        kfree(ct_flow);
        netdev_warn(priv->netdev, "Failed to offload ct flow, err %d\n", err);
        return ERR_PTR(err);
@@@ -2,7 -2,6 +2,7 @@@
  /* Copyright (c) 2021 Mellanox Technologies. */
  
  #include <net/fib_notifier.h>
 +#include <net/nexthop.h>
  #include "tc_tun_encap.h"
  #include "en_tc.h"
  #include "tc_tun.h"
@@@ -90,6 -89,7 +90,7 @@@ int mlx5e_tc_set_attr_rx_tun(struct mlx
         * required to establish routing.
         */
        flow_flag_set(flow, TUN_RX);
+       flow->attr->tun_ip_version = ip_version;
        return 0;
  }
  
@@@ -1092,7 -1092,7 +1093,7 @@@ int mlx5e_attach_decap_route(struct mlx
        if (err || !esw_attr->rx_tun_attr->decap_vport)
                goto out;
  
-       key.ip_version = attr->ip_version;
+       key.ip_version = attr->tun_ip_version;
        if (key.ip_version == 4)
                key.endpoint_ip.v4 = esw_attr->rx_tun_attr->dst_ip.v4;
        else
@@@ -302,7 -302,7 +302,7 @@@ static int mlx5e_create_umr_mkey(struc
        MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
        mlx5e_mkey_set_relaxed_ordering(mdev, mkc);
        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 -      MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.pdn);
 +      MLX5_SET(mkc, mkc, pd, mdev->mlx5e_res.hw_objs.pdn);
        MLX5_SET64(mkc, mkc, len, npages << page_shift);
        MLX5_SET(mkc, mkc, translations_octword_size,
                 MLX5_MTT_OCTW(npages));
@@@ -334,9 -334,9 +334,9 @@@ static int mlx5e_create_rq_umr_mkey(str
                                     rq->wqe_overflow.addr);
  }
  
- static inline u64 mlx5e_get_mpwqe_offset(struct mlx5e_rq *rq, u16 wqe_ix)
+ static u64 mlx5e_get_mpwqe_offset(u16 wqe_ix)
  {
-       return (wqe_ix << MLX5E_LOG_ALIGNED_MPWQE_PPW) << PAGE_SHIFT;
+       return MLX5E_REQUIRED_MTTS(wqe_ix) << PAGE_SHIFT;
  }
  
  static void mlx5e_init_frags_partition(struct mlx5e_rq *rq)
@@@ -577,7 -577,7 +577,7 @@@ static int mlx5e_alloc_rq(struct mlx5e_
                                mlx5_wq_ll_get_wqe(&rq->mpwqe.wq, i);
                        u32 byte_count =
                                rq->mpwqe.num_strides << rq->mpwqe.log_stride_sz;
-                       u64 dma_offset = mlx5e_get_mpwqe_offset(rq, i);
+                       u64 dma_offset = mlx5e_get_mpwqe_offset(i);
  
                        wqe->data[0].addr = cpu_to_be64(dma_offset + rq->buff.headroom);
                        wqe->data[0].byte_count = cpu_to_be32(byte_count);
@@@ -1019,7 -1019,7 +1019,7 @@@ static int mlx5e_alloc_xdpsq(struct mlx
        sq->pdev      = c->pdev;
        sq->mkey_be   = c->mkey_be;
        sq->channel   = c;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
        sq->min_inline_mode = params->tx_min_inline_mode;
        sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
        sq->xsk_pool  = xsk_pool;
@@@ -1090,7 -1090,7 +1090,7 @@@ static int mlx5e_alloc_icosq(struct mlx
        int err;
  
        sq->channel   = c;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
  
        param->wq.db_numa_node = cpu_to_node(c->cpu);
        err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, wq, &sq->wq_ctrl);
@@@ -1174,7 -1174,7 +1174,7 @@@ static int mlx5e_alloc_txqsq(struct mlx
        sq->priv      = c->priv;
        sq->ch_ix     = c->ix;
        sq->txq_ix    = txq_ix;
 -      sq->uar_map   = mdev->mlx5e_res.bfreg.map;
 +      sq->uar_map   = mdev->mlx5e_res.hw_objs.bfreg.map;
        sq->min_inline_mode = params->tx_min_inline_mode;
        sq->hw_mtu    = MLX5E_SW2HW_MTU(params, params->sw_mtu);
        INIT_WORK(&sq->recover_work, mlx5e_tx_err_cqe_work);
@@@ -1257,7 -1257,7 +1257,7 @@@ static int mlx5e_create_sq(struct mlx5_
        MLX5_SET(sqc,  sqc, flush_in_error_en, 1);
  
        MLX5_SET(wq,   wq, wq_type,       MLX5_WQ_TYPE_CYCLIC);
 -      MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.bfreg.index);
 +      MLX5_SET(wq,   wq, uar_page,      mdev->mlx5e_res.hw_objs.bfreg.index);
        MLX5_SET(wq,   wq, log_wq_pg_sz,  csp->wq_ctrl->buf.page_shift -
                                          MLX5_ADAPTER_PAGE_SHIFT);
        MLX5_SET64(wq, wq, dbr_addr,      csp->wq_ctrl->db.dma);
@@@ -2032,7 -2032,7 +2032,7 @@@ static int mlx5e_open_channel(struct ml
        c->cpu      = cpu;
        c->pdev     = mlx5_core_dma_dev(priv->mdev);
        c->netdev   = priv->netdev;
 -      c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
 +      c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.hw_objs.mkey.key);
        c->num_tc   = params->num_tc;
        c->xdp      = !!params->xdp_prog;
        c->stats    = &priv->channel_stats[ix].ch;
@@@ -2217,7 -2217,7 +2217,7 @@@ void mlx5e_build_rq_param(struct mlx5e_
        MLX5_SET(wq, wq, end_padding_mode, MLX5_WQ_END_PAD_MODE_ALIGN);
        MLX5_SET(wq, wq, log_wq_stride,
                 mlx5e_get_rqwq_log_stride(params->rq_wq_type, ndsegs));
 -      MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.pdn);
 +      MLX5_SET(wq, wq, pd,               mdev->mlx5e_res.hw_objs.pdn);
        MLX5_SET(rqc, rqc, counter_set_id, priv->q_counter);
        MLX5_SET(rqc, rqc, vsd,            params->vlan_strip_disable);
        MLX5_SET(rqc, rqc, scatter_fcs,    params->scatter_fcs_en);
@@@ -2248,7 -2248,7 +2248,7 @@@ void mlx5e_build_sq_param_common(struc
        void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
  
        MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
 -      MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.pdn);
 +      MLX5_SET(wq, wq, pd,            priv->mdev->mlx5e_res.hw_objs.pdn);
  
        param->wq.buf_numa_node = dev_to_node(mlx5_core_dma_dev(priv->mdev));
  }
@@@ -2368,8 -2368,9 +2368,9 @@@ static u8 mlx5e_build_icosq_log_wq_sz(s
  {
        switch (params->rq_wq_type) {
        case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
-               return order_base_2(MLX5E_UMR_WQEBBS) +
-                       mlx5e_get_rq_log_wq_sz(rqp->rqc);
+               return max_t(u8, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE,
+                            order_base_2(MLX5E_UMR_WQEBBS) +
+                            mlx5e_get_rq_log_wq_sz(rqp->rqc));
        default: /* MLX5_WQ_TYPE_CYCLIC */
                return MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE;
        }
@@@ -2502,8 -2503,10 +2503,10 @@@ void mlx5e_close_channels(struct mlx5e_
  {
        int i;
  
-       if (chs->port_ptp)
+       if (chs->port_ptp) {
                mlx5e_port_ptp_close(chs->port_ptp);
+               chs->port_ptp = NULL;
+       }
  
        for (i = 0; i < chs->num; i++)
                mlx5e_close_channel(chs->c[i]);
@@@ -3421,10 -3424,10 +3424,10 @@@ int mlx5e_create_tis(struct mlx5_core_d
  {
        void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
  
 -      MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.td.tdn);
 +      MLX5_SET(tisc, tisc, transport_domain, mdev->mlx5e_res.hw_objs.td.tdn);
  
        if (MLX5_GET(tisc, tisc, tls_en))
 -              MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.pdn);
 +              MLX5_SET(tisc, tisc, pd, mdev->mlx5e_res.hw_objs.pdn);
  
        if (mlx5_lag_is_lacp_owner(mdev))
                MLX5_SET(tisc, tisc, strict_lag_tx_port_affinity, 1);
@@@ -3494,7 -3497,7 +3497,7 @@@ static void mlx5e_cleanup_nic_tx(struc
  static void mlx5e_build_indir_tir_ctx_common(struct mlx5e_priv *priv,
                                             u32 rqtn, u32 *tirc)
  {
 -      MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.td.tdn);
 +      MLX5_SET(tirc, tirc, transport_domain, priv->mdev->mlx5e_res.hw_objs.td.tdn);
        MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
        MLX5_SET(tirc, tirc, indirect_table, rqtn);
        MLX5_SET(tirc, tirc, tunneled_offload_en,
@@@ -3769,16 -3772,8 +3772,16 @@@ static int mlx5e_setup_tc(struct net_de
                          void *type_data)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
 +      bool tc_unbind = false;
        int err;
  
 +      if (type == TC_SETUP_BLOCK &&
 +          ((struct flow_block_offload *)type_data)->command == FLOW_BLOCK_UNBIND)
 +              tc_unbind = true;
 +
 +      if (!netif_device_present(dev) && !tc_unbind)
 +              return -ENODEV;
 +
        switch (type) {
        case TC_SETUP_BLOCK: {
                struct flow_block_offload *f = type_data;
@@@ -3818,6 -3813,15 +3821,15 @@@ void mlx5e_fold_sw_stats64(struct mlx5e
                for (j = 0; j < priv->max_opened_tc; j++) {
                        struct mlx5e_sq_stats *sq_stats = &channel_stats->sq[j];
  
+                       s->tx_packets    += sq_stats->packets;
+                       s->tx_bytes      += sq_stats->bytes;
+                       s->tx_dropped    += sq_stats->dropped;
+               }
+       }
+       if (priv->port_ptp_opened) {
+               for (i = 0; i < priv->max_opened_tc; i++) {
+                       struct mlx5e_sq_stats *sq_stats = &priv->port_ptp_stats.sq[i];
                        s->tx_packets    += sq_stats->packets;
                        s->tx_bytes      += sq_stats->bytes;
                        s->tx_dropped    += sq_stats->dropped;
@@@ -3831,9 -3835,6 +3843,9 @@@ mlx5e_get_stats(struct net_device *dev
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5e_pport_stats *pstats = &priv->stats.pport;
  
 +      if (!netif_device_present(dev))
 +              return;
 +
        /* In switchdev mode, the monitor counters don't monitor
         * rx/tx stats of 802_3. The update stats mechanism
         * should keep the 802_3 layout counters updated
        }
  
        if (mlx5e_is_uplink_rep(priv)) {
+               struct mlx5e_vport_stats *vstats = &priv->stats.vport;
                stats->rx_packets = PPORT_802_3_GET(pstats, a_frames_received_ok);
                stats->rx_bytes   = PPORT_802_3_GET(pstats, a_octets_received_ok);
                stats->tx_packets = PPORT_802_3_GET(pstats, a_frames_transmitted_ok);
                stats->tx_bytes   = PPORT_802_3_GET(pstats, a_octets_transmitted_ok);
+               /* vport multicast also counts packets that are dropped due to steering
+                * or rx out of buffer
+                */
+               stats->multicast = VPORT_COUNTER_GET(vstats, received_eth_multicast.packets);
        } else {
                mlx5e_fold_sw_stats64(priv, stats);
        }
        stats->tx_errors = stats->tx_aborted_errors + stats->tx_carrier_errors;
  }
  
 +static void mlx5e_nic_set_rx_mode(struct mlx5e_priv *priv)
 +{
 +      if (mlx5e_is_uplink_rep(priv))
 +              return; /* no rx mode for uplink rep */
 +
 +      queue_work(priv->wq, &priv->set_rx_mode_work);
 +}
 +
  static void mlx5e_set_rx_mode(struct net_device *dev)
  {
        struct mlx5e_priv *priv = netdev_priv(dev);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  }
  
  static int mlx5e_set_mac(struct net_device *netdev, void *addr)
        ether_addr_copy(netdev->dev_addr, saddr->sa_data);
        netif_addr_unlock_bh(netdev);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        return 0;
  }
@@@ -4433,9 -4433,6 +4452,9 @@@ static int mlx5e_set_vf_link_state(stru
        struct mlx5e_priv *priv = netdev_priv(dev);
        struct mlx5_core_dev *mdev = priv->mdev;
  
 +      if (mlx5e_is_uplink_rep(priv))
 +              return -EOPNOTSUPP;
 +
        return mlx5_eswitch_set_vport_state(mdev->priv.eswitch, vf + 1,
                                            mlx5_ifla_link2vport(link_state));
  }
@@@ -4447,9 -4444,6 +4466,9 @@@ int mlx5e_get_vf_config(struct net_devi
        struct mlx5_core_dev *mdev = priv->mdev;
        int err;
  
 +      if (!netif_device_present(dev))
 +              return -EOPNOTSUPP;
 +
        err = mlx5_eswitch_get_vport_config(mdev->priv.eswitch, vf + 1, ivi);
        if (err)
                return err;
@@@ -4466,32 -4460,6 +4485,32 @@@ int mlx5e_get_vf_stats(struct net_devic
        return mlx5_eswitch_get_vport_stats(mdev->priv.eswitch, vf + 1,
                                            vf_stats);
  }
 +
 +static bool
 +mlx5e_has_offload_stats(const struct net_device *dev, int attr_id)
 +{
 +      struct mlx5e_priv *priv = netdev_priv(dev);
 +
 +      if (!netif_device_present(dev))
 +              return false;
 +
 +      if (!mlx5e_is_uplink_rep(priv))
 +              return false;
 +
 +      return mlx5e_rep_has_offload_stats(dev, attr_id);
 +}
 +
 +static int
 +mlx5e_get_offload_stats(int attr_id, const struct net_device *dev,
 +                      void *sp)
 +{
 +      struct mlx5e_priv *priv = netdev_priv(dev);
 +
 +      if (!mlx5e_is_uplink_rep(priv))
 +              return -EOPNOTSUPP;
 +
 +      return mlx5e_rep_get_offload_stats(attr_id, dev, sp);
 +}
  #endif
  
  static bool mlx5e_tunnel_proto_supported_tx(struct mlx5_core_dev *mdev, u8 proto_type)
@@@ -4734,8 -4702,10 +4753,10 @@@ static int mlx5e_xdp_set(struct net_dev
                struct mlx5e_channel *c = priv->channels.c[i];
  
                mlx5e_rq_replace_xdp_prog(&c->rq, prog);
-               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state))
+               if (test_bit(MLX5E_CHANNEL_STATE_XSK, c->state)) {
+                       bpf_prog_inc(prog);
                        mlx5e_rq_replace_xdp_prog(&c->xskrq, prog);
+               }
        }
  
  unlock:
@@@ -4848,8 -4818,6 +4869,8 @@@ const struct net_device_ops mlx5e_netde
        .ndo_get_vf_config       = mlx5e_get_vf_config,
        .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
        .ndo_get_vf_stats        = mlx5e_get_vf_stats,
 +      .ndo_has_offload_stats   = mlx5e_has_offload_stats,
 +      .ndo_get_offload_stats   = mlx5e_get_offload_stats,
  #endif
        .ndo_get_devlink_port    = mlx5e_get_devlink_port,
  };
@@@ -5011,6 -4979,11 +5032,11 @@@ void mlx5e_build_nic_params(struct mlx5
                                     priv->max_nch);
        params->num_tc       = 1;
  
+       /* Set an initial non-zero value, so that mlx5e_select_queue won't
+        * divide by zero if called before first activating channels.
+        */
+       priv->num_tc_x_num_ch = params->num_channels * params->num_tc;
        /* SQ */
        params->log_sq_size = is_kdump_kernel() ?
                MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE :
@@@ -5306,6 -5279,10 +5332,6 @@@ static int mlx5e_nic_init(struct mlx5_c
        if (err)
                mlx5_core_err(mdev, "TLS initialization failed, %d\n", err);
  
 -      err = mlx5e_devlink_port_register(priv);
 -      if (err)
 -              mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
 -
        mlx5e_health_create_reporters(priv);
  
        return 0;
  static void mlx5e_nic_cleanup(struct mlx5e_priv *priv)
  {
        mlx5e_health_destroy_reporters(priv);
 -      mlx5e_devlink_port_unregister(priv);
        mlx5e_tls_cleanup(priv);
        mlx5e_ipsec_cleanup(priv);
  }
@@@ -5453,7 -5431,7 +5479,7 @@@ static void mlx5e_nic_enable(struct mlx
                return;
        mlx5e_dcbnl_init_app(priv);
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        rtnl_lock();
        if (netif_running(netdev))
@@@ -5476,7 -5454,7 +5502,7 @@@ static void mlx5e_nic_disable(struct ml
        netif_device_detach(priv->netdev);
        rtnl_unlock();
  
 -      queue_work(priv->wq, &priv->set_rx_mode_work);
 +      mlx5e_nic_set_rx_mode(priv);
  
        mlx5e_hv_vhca_stats_destroy(priv);
        if (mlx5e_monitor_counter_supported(priv))
@@@ -5522,8 -5500,6 +5548,6 @@@ int mlx5e_priv_init(struct mlx5e_priv *
                    struct net_device *netdev,
                    struct mlx5_core_dev *mdev)
  {
-       memset(priv, 0, sizeof(*priv));
        /* priv init */
        priv->mdev        = mdev;
        priv->netdev      = netdev;
@@@ -5556,12 -5532,18 +5580,18 @@@ void mlx5e_priv_cleanup(struct mlx5e_pr
  {
        int i;
  
+       /* bail if change profile failed and also rollback failed */
+       if (!priv->mdev)
+               return;
        destroy_workqueue(priv->wq);
        free_cpumask_var(priv->scratchpad.cpumask);
  
        for (i = 0; i < priv->htb.max_qos_sqs; i++)
                kfree(priv->htb.qos_sq_stats[i]);
        kvfree(priv->htb.qos_sq_stats);
+       memset(priv, 0, sizeof(*priv));
  }
  
  struct net_device *
@@@ -5678,11 -5660,10 +5708,10 @@@ void mlx5e_detach_netdev(struct mlx5e_p
  }
  
  static int
- mlx5e_netdev_attach_profile(struct mlx5e_priv *priv,
+ mlx5e_netdev_attach_profile(struct net_device *netdev, struct mlx5_core_dev *mdev,
                            const struct mlx5e_profile *new_profile, void *new_ppriv)
  {
-       struct net_device *netdev = priv->netdev;
-       struct mlx5_core_dev *mdev = priv->mdev;
+       struct mlx5e_priv *priv = netdev_priv(netdev);
        int err;
  
        err = mlx5e_priv_init(priv, netdev, mdev);
        priv->ppriv = new_ppriv;
        err = new_profile->init(priv->mdev, priv->netdev);
        if (err)
-               return err;
+               goto priv_cleanup;
        err = mlx5e_attach_netdev(priv);
        if (err)
-               new_profile->cleanup(priv);
+               goto profile_cleanup;
+       return err;
+ profile_cleanup:
+       new_profile->cleanup(priv);
+ priv_cleanup:
+       mlx5e_priv_cleanup(priv);
        return err;
  }
  
@@@ -5707,13 -5694,14 +5742,14 @@@ int mlx5e_netdev_change_profile(struct 
  {
        unsigned int new_max_nch = mlx5e_calc_max_nch(priv, new_profile);
        const struct mlx5e_profile *orig_profile = priv->profile;
+       struct net_device *netdev = priv->netdev;
+       struct mlx5_core_dev *mdev = priv->mdev;
        void *orig_ppriv = priv->ppriv;
        int err, rollback_err;
  
        /* sanity */
        if (new_max_nch != priv->max_nch) {
-               netdev_warn(priv->netdev,
-                           "%s: Replacing profile with different max channels\n",
+               netdev_warn(netdev, "%s: Replacing profile with different max channels\n",
                            __func__);
                return -EINVAL;
        }
        priv->profile->cleanup(priv);
        mlx5e_priv_cleanup(priv);
  
-       err = mlx5e_netdev_attach_profile(priv, new_profile, new_ppriv);
+       err = mlx5e_netdev_attach_profile(netdev, mdev, new_profile, new_ppriv);
        if (err) { /* roll back to original profile */
-               netdev_warn(priv->netdev, "%s: new profile init failed, %d\n",
-                           __func__, err);
+               netdev_warn(netdev, "%s: new profile init failed, %d\n", __func__, err);
                goto rollback;
        }
  
        return 0;
  
  rollback:
-       rollback_err = mlx5e_netdev_attach_profile(priv, orig_profile, orig_ppriv);
-       if (rollback_err) {
-               netdev_err(priv->netdev,
-                          "%s: failed to rollback to orig profile, %d\n",
+       rollback_err = mlx5e_netdev_attach_profile(netdev, mdev, orig_profile, orig_ppriv);
+       if (rollback_err)
+               netdev_err(netdev, "%s: failed to rollback to orig profile, %d\n",
                           __func__, rollback_err);
-       }
        return err;
  }
  
 +void mlx5e_netdev_attach_nic_profile(struct mlx5e_priv *priv)
 +{
 +      mlx5e_netdev_change_profile(priv, &mlx5e_nic_profile, NULL);
 +}
 +
  void mlx5e_destroy_netdev(struct mlx5e_priv *priv)
  {
        struct net_device *netdev = priv->netdev;
@@@ -5829,17 -5809,10 +5862,17 @@@ static int mlx5e_probe(struct auxiliary
  
        priv->profile = profile;
        priv->ppriv = NULL;
 +
 +      err = mlx5e_devlink_port_register(priv);
 +      if (err) {
 +              mlx5_core_err(mdev, "mlx5e_devlink_port_register failed, %d\n", err);
 +              goto err_destroy_netdev;
 +      }
 +
        err = profile->init(mdev, netdev);
        if (err) {
                mlx5_core_err(mdev, "mlx5e_nic_profile init failed, %d\n", err);
 -              goto err_destroy_netdev;
 +              goto err_devlink_cleanup;
        }
  
        err = mlx5e_resume(adev);
        mlx5e_devlink_port_type_eth_set(priv);
  
        mlx5e_dcbnl_init_app(priv);
 +      mlx5_uplink_netdev_set(mdev, netdev);
        return 0;
  
  err_resume:
        mlx5e_suspend(adev, state);
  err_profile_cleanup:
        profile->cleanup(priv);
 +err_devlink_cleanup:
 +      mlx5e_devlink_port_unregister(priv);
  err_destroy_netdev:
        mlx5e_destroy_netdev(priv);
        return err;
@@@ -5880,7 -5850,6 +5913,7 @@@ static void mlx5e_remove(struct auxilia
        unregister_netdev(priv->netdev);
        mlx5e_suspend(adev, state);
        priv->profile->cleanup(priv);
 +      mlx5e_devlink_port_unregister(priv);
        mlx5e_destroy_netdev(priv);
  }
  
@@@ -5906,18 -5875,18 +5939,18 @@@ int mlx5e_init(void
  
        mlx5e_ipsec_build_inverse_table();
        mlx5e_build_ptys2ethtool_map();
 -      ret = mlx5e_rep_init();
 +      ret = auxiliary_driver_register(&mlx5e_driver);
        if (ret)
                return ret;
  
 -      ret = auxiliary_driver_register(&mlx5e_driver);
 +      ret = mlx5e_rep_init();
        if (ret)
 -              mlx5e_rep_cleanup();
 +              auxiliary_driver_unregister(&mlx5e_driver);
        return ret;
  }
  
  void mlx5e_cleanup(void)
  {
 -      auxiliary_driver_unregister(&mlx5e_driver);
        mlx5e_rep_cleanup();
 +      auxiliary_driver_unregister(&mlx5e_driver);
  }
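
The reordering above makes module init and cleanup symmetric: the auxiliary driver is registered first and the rep support second, the failure path undoes only the earlier step, and mlx5e_cleanup() tears the two down in reverse order. A small stand-alone sketch of that register/unregister symmetry (placeholder functions, not the real mlx5e calls):

#include <stdio.h>

/* Placeholder registration steps, standing in for the two real calls. */
static int register_driver(void)    { puts("register driver");   return 0; }
static void unregister_driver(void) { puts("unregister driver"); }
static int register_reps(void)      { puts("register reps");     return 0; }
static void unregister_reps(void)   { puts("unregister reps"); }

static int init_sketch(void)
{
	int ret;

	ret = register_driver();
	if (ret)
		return ret;

	ret = register_reps();
	if (ret)
		unregister_driver();	/* failure undoes the earlier step */
	return ret;
}

static void cleanup_sketch(void)
{
	/* tear down in reverse registration order */
	unregister_reps();
	unregister_driver();
}

int main(void)
{
	if (!init_sketch())
		cleanup_sketch();
	return 0;
}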
@@@ -52,7 -52,6 +52,7 @@@
  #include "en/health.h"
  #include "en/params.h"
  #include "devlink.h"
 +#include "en/devlink.h"
  
  static struct sk_buff *
  mlx5e_skb_from_cqe_mpwrq_linear(struct mlx5e_rq *rq, struct mlx5e_mpw_info *wi,
@@@ -501,7 -500,6 +501,6 @@@ static int mlx5e_alloc_rx_mpwqe(struct 
        struct mlx5e_icosq *sq = rq->icosq;
        struct mlx5_wq_cyc *wq = &sq->wq;
        struct mlx5e_umr_wqe *umr_wqe;
-       u16 xlt_offset = ix << (MLX5E_LOG_ALIGNED_MPWQE_PPW - 1);
        u16 pi;
        int err;
        int i;
        umr_wqe->ctrl.opmod_idx_opcode =
                cpu_to_be32((sq->pc << MLX5_WQE_CTRL_WQE_INDEX_SHIFT) |
                            MLX5_OPCODE_UMR);
-       umr_wqe->uctrl.xlt_offset = cpu_to_be16(xlt_offset);
+       umr_wqe->uctrl.xlt_offset =
+               cpu_to_be16(MLX5_ALIGNED_MTTS_OCTW(MLX5E_REQUIRED_MTTS(ix)));
  
        sq->db.wqe_info[pi] = (struct mlx5e_icosq_wqe_info) {
                .wqe_type   = MLX5E_ICOSQ_WQE_UMR_RX,
@@@ -670,7 -669,6 +670,7 @@@ int mlx5e_poll_ico_cq(struct mlx5e_cq *
                                                 get_cqe_opcode(cqe));
                                mlx5e_dump_error_cqe(&sq->cq, sq->sqn,
                                                     (struct mlx5_err_cqe *)cqe);
 +                              mlx5_wq_cyc_wqe_dump(&sq->wq, ci, wi->num_wqebbs);
                                if (!test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
                                        queue_work(cq->priv->wq, &sq->recover_work);
                                break;
@@@ -1824,7 -1822,6 +1824,7 @@@ static void mlx5e_trap_handle_rx_cqe(st
        struct mlx5e_priv *priv = netdev_priv(rq->netdev);
        struct mlx5_wq_cyc *wq = &rq->wqe.wq;
        struct mlx5e_wqe_frag_info *wi;
 +      struct devlink_port *dl_port;
        struct sk_buff *skb;
        u32 cqe_bcnt;
        u16 trap_id;
        mlx5e_complete_rx_cqe(rq, cqe, cqe_bcnt, skb);
        skb_push(skb, ETH_HLEN);
  
 -      mlx5_devlink_trap_report(rq->mdev, trap_id, skb, &priv->dl_port);
 +      dl_port = mlx5e_devlink_get_dl_port(priv);
 +      mlx5_devlink_trap_report(rq->mdev, trap_id, skb, dl_port);
        dev_kfree_skb_any(skb);
  
  free_wqe:
@@@ -445,16 -445,12 +445,16 @@@ static void mlx5e_hairpin_destroy_trans
        mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
  }
  
 -static void mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
 +static int mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
  {
 -      u32 indirection_rqt[MLX5E_INDIR_RQT_SIZE], rqn;
 +      u32 *indirection_rqt, rqn;
        struct mlx5e_priv *priv = hp->func_priv;
        int i, ix, sz = MLX5E_INDIR_RQT_SIZE;
  
 +      indirection_rqt = kcalloc(sz, sizeof(*indirection_rqt), GFP_KERNEL);
 +      if (!indirection_rqt)
 +              return -ENOMEM;
 +
        mlx5e_build_default_indir_rqt(indirection_rqt, sz,
                                      hp->num_channels);
  
                rqn = hp->pair->rqn[ix];
                MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
        }
 +
 +      kfree(indirection_rqt);
 +      return 0;
  }
  
  static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
        MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
        MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
  
 -      mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
 +      err = mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
 +      if (err)
 +              goto out;
  
        err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn);
        if (!err)
                hp->indir_rqt.enabled = true;
  
 +out:
        kvfree(in);
        return err;
  }
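
The hairpin change above moves the MLX5E_INDIR_RQT_SIZE-entry indirection table off the stack onto the heap and lets the fill helper return -ENOMEM, which the caller turns into an early exit before creating the RQT. A userspace sketch of the same count-times-element-size allocation plus a default round-robin spread of channels across the table (ENTRIES, NUM_CHANNELS and build_default_indir() are stand-ins, not the mlx5e API):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define ENTRIES      256	/* stand-in for MLX5E_INDIR_RQT_SIZE */
#define NUM_CHANNELS 6

/* Spread channel indices round-robin across the indirection table. */
static void build_default_indir(uint32_t *tbl, int len, int num_channels)
{
	for (int i = 0; i < len; i++)
		tbl[i] = i % num_channels;
}

int main(void)
{
	/* count * element-size allocation, the same shape as kcalloc() */
	uint32_t *tbl = calloc(ENTRIES, sizeof(*tbl));

	if (!tbl)
		return 1;	/* analogous to returning -ENOMEM */

	build_default_indir(tbl, ENTRIES, NUM_CHANNELS);
	printf("entry[0]=%u entry[255]=%u\n", tbl[0], tbl[ENTRIES - 1]);
	free(tbl);
	return 0;
}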
@@@ -1087,23 -1077,19 +1087,23 @@@ mlx5e_tc_offload_fdb_rules(struct mlx5_
        if (flow_flag_test(flow, CT)) {
                mod_hdr_acts = &attr->parse_attr->mod_hdr_acts;
  
 -              return mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
 +              rule = mlx5_tc_ct_flow_offload(get_ct_priv(flow->priv),
                                               flow, spec, attr,
                                               mod_hdr_acts);
 +      } else {
 +              rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
        }
  
 -      rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
        if (IS_ERR(rule))
                return rule;
  
        if (attr->esw_attr->split_count) {
                flow->rule[1] = mlx5_eswitch_add_fwd_rule(esw, spec, attr);
                if (IS_ERR(flow->rule[1])) {
 -                      mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
 +                      if (flow_flag_test(flow, CT))
 +                              mlx5_tc_ct_delete_flow(get_ct_priv(flow->priv), flow, attr);
 +                      else
 +                              mlx5_eswitch_del_offloaded_rule(esw, rule, attr);
                        return flow->rule[1];
                }
        }
@@@ -1961,10 -1947,6 +1961,10 @@@ static int __parse_cls_flower(struct ml
                                    misc_parameters);
        void *misc_v = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                                    misc_parameters);
 +      void *misc_c_3 = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
 +                                  misc_parameters_3);
 +      void *misc_v_3 = MLX5_ADDR_OF(fte_match_param, spec->match_value,
 +                                  misc_parameters_3);
        struct flow_rule *rule = flow_cls_offload_flow_rule(f);
        struct flow_dissector *dissector = rule->match.dissector;
        u16 addr_type = 0;
              BIT(FLOW_DISSECTOR_KEY_CT) |
              BIT(FLOW_DISSECTOR_KEY_ENC_IP) |
              BIT(FLOW_DISSECTOR_KEY_ENC_OPTS) |
 +            BIT(FLOW_DISSECTOR_KEY_ICMP) |
              BIT(FLOW_DISSECTOR_KEY_MPLS))) {
                NL_SET_ERR_MSG_MOD(extack, "Unsupported key");
                netdev_dbg(priv->netdev, "Unsupported key used: 0x%x\n",
                if (match.mask->flags)
                        *match_level = MLX5_MATCH_L4;
        }
 +      if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_ICMP)) {
 +              struct flow_match_icmp match;
  
 +              flow_rule_match_icmp(rule, &match);
 +              switch (ip_proto) {
 +              case IPPROTO_ICMP:
 +                      if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
 +                            MLX5_FLEX_PROTO_ICMP))
 +                              return -EOPNOTSUPP;
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_type,
 +                               match.mask->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_type,
 +                               match.key->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmp_code,
 +                               match.mask->code);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmp_code,
 +                               match.key->code);
 +                      break;
 +              case IPPROTO_ICMPV6:
 +                      if (!(MLX5_CAP_GEN(priv->mdev, flex_parser_protocols) &
 +                            MLX5_FLEX_PROTO_ICMPV6))
 +                              return -EOPNOTSUPP;
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_type,
 +                               match.mask->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_type,
 +                               match.key->type);
 +                      MLX5_SET(fte_match_set_misc3, misc_c_3, icmpv6_code,
 +                               match.mask->code);
 +                      MLX5_SET(fte_match_set_misc3, misc_v_3, icmpv6_code,
 +                               match.key->code);
 +                      break;
 +              default:
 +                      NL_SET_ERR_MSG_MOD(extack,
 +                                         "Code and type matching only with ICMP and ICMPv6");
 +                      netdev_err(priv->netdev,
 +                                 "Code and type matching only with ICMP and ICMPv6\n");
 +                      return -EINVAL;
 +              }
 +              if (match.mask->code || match.mask->type) {
 +                      *match_level = MLX5_MATCH_L4;
 +                      spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
 +              }
 +      }
+       /* Currently supported only for MPLS over UDP */
+       if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
+           !netif_is_bareudp(filter_dev)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "Matching on MPLS is supported only for MPLS over UDP");
+               netdev_err(priv->netdev,
+                          "Matching on MPLS is supported only for MPLS over UDP\n");
+               return -EOPNOTSUPP;
+       }
        return 0;
  }
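
The new FLOW_DISSECTOR_KEY_ICMP handling above only offloads ICMP/ICMPv6 type and code matches when the device advertises the matching flex-parser capability, and rejects type/code matching for any other IP protocol. A self-contained sketch of just that gating decision (the constants are simplified stand-ins for the MLX5_FLEX_PROTO_* capability bits):

#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the flex-parser capability bits and protocols. */
#define FLEX_PROTO_ICMP    (1u << 8)
#define FLEX_PROTO_ICMPV6  (1u << 9)
#define PROTO_ICMP   1
#define PROTO_ICMPV6 58

/* 0 when a type/code match can be offloaded for this protocol with these
 * capabilities, negative otherwise, mirroring the decisions in the hunk.
 */
static int icmp_match_supported(uint8_t ip_proto, uint32_t flex_caps)
{
	switch (ip_proto) {
	case PROTO_ICMP:
		return (flex_caps & FLEX_PROTO_ICMP) ? 0 : -95;	/* -EOPNOTSUPP */
	case PROTO_ICMPV6:
		return (flex_caps & FLEX_PROTO_ICMPV6) ? 0 : -95;
	default:
		return -22;	/* -EINVAL: type/code only exist for ICMP/ICMPv6 */
	}
}

int main(void)
{
	printf("icmp with cap:      %d\n",
	       icmp_match_supported(PROTO_ICMP, FLEX_PROTO_ICMP));
	printf("icmpv6 without cap: %d\n",
	       icmp_match_supported(PROTO_ICMPV6, FLEX_PROTO_ICMP));
	printf("tcp:                %d\n",
	       icmp_match_supported(6, FLEX_PROTO_ICMP | FLEX_PROTO_ICMPV6));
	return 0;
}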
  
@@@ -2960,6 -2909,37 +2970,37 @@@ static int is_action_keys_supported(con
        return 0;
  }
  
+ static bool modify_tuple_supported(bool modify_tuple, bool ct_clear,
+                                  bool ct_flow, struct netlink_ext_ack *extack,
+                                  struct mlx5e_priv *priv,
+                                  struct mlx5_flow_spec *spec)
+ {
+       if (!modify_tuple || ct_clear)
+               return true;
+       if (ct_flow) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with non-clear ct()");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with non-clear ct()");
+               return false;
+       }
+       /* Add ct_state=-trk match so it will be offloaded for non ct flows
+        * (or after clear action), as otherwise, since the tuple is changed,
+        * we can't restore ct state
+        */
+       if (mlx5_tc_ct_add_no_trk_match(spec)) {
+               NL_SET_ERR_MSG_MOD(extack,
+                                  "can't offload tuple modification with ct matches and no ct(clear) action");
+               netdev_info(priv->netdev,
+                           "can't offload tuple modification with ct matches and no ct(clear) action");
+               return false;
+       }
+       return true;
+ }
+
  static bool modify_header_match_supported(struct mlx5e_priv *priv,
                                          struct mlx5_flow_spec *spec,
                                          struct flow_action *flow_action,
                        return err;
        }
  
-       /* Add ct_state=-trk match so it will be offloaded for non ct flows
-        * (or after clear action), as otherwise, since the tuple is changed,
-        *  we can't restore ct state
-        */
-       if (!ct_clear && modify_tuple &&
-           mlx5_tc_ct_add_no_trk_match(spec)) {
-               NL_SET_ERR_MSG_MOD(extack,
-                                  "can't offload tuple modify header with ct matches");
-               netdev_info(priv->netdev,
-                           "can't offload tuple modify header with ct matches");
+       if (!modify_tuple_supported(modify_tuple, ct_clear, ct_flow, extack,
+                                   priv, spec))
                return false;
-       }
  
        ip_proto = MLX5_GET(fte_match_set_lyr_2_4, headers_v, ip_protocol);
        if (modify_ip_header && ip_proto != IPPROTO_TCP &&
@@@ -3040,8 -3011,7 +3072,8 @@@ static bool actions_match_supported(str
        actions = flow->attr->action;
  
        if (mlx5e_is_eswitch_flow(flow)) {
 -              if (flow->attr->esw_attr->split_count && ct_flow) {
 +              if (flow->attr->esw_attr->split_count && ct_flow &&
 +                  !MLX5_CAP_GEN(flow->attr->esw_attr->in_mdev, reg_c_preserve)) {
                        /* All registers used by ct are cleared when using
                         * split rules.
                         */
@@@ -3841,7 -3811,6 +3873,7 @@@ static int parse_tc_fdb_actions(struct 
                                return err;
  
                        flow_flag_set(flow, CT);
 +                      esw_attr->split_count = esw_attr->out_count;
                        break;
                default:
                        NL_SET_ERR_MSG_MOD(extack, "The offload action is not supported");
                        return -EOPNOTSUPP;
                }
  
 -              if (attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) {
 -                      NL_SET_ERR_MSG_MOD(extack,
 -                                         "Mirroring goto chain rules isn't supported");
 -                      return -EOPNOTSUPP;
 -              }
                attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
        }
  
@@@ -4323,11 -4297,6 +4355,11 @@@ int mlx5e_configure_flower(struct net_d
        struct mlx5e_tc_flow *flow;
        int err = 0;
  
 +      if (!mlx5_esw_hold(priv->mdev))
 +              return -EAGAIN;
 +
 +      mlx5_esw_get(priv->mdev);
 +
        rcu_read_lock();
        flow = rhashtable_lookup(tc_ht, &f->cookie, tc_ht_params);
        if (flow) {
@@@ -4365,14 -4334,11 +4397,14 @@@ rcu_unlock
        if (err)
                goto err_free;
  
 +      mlx5_esw_release(priv->mdev);
        return 0;
  
  err_free:
        mlx5e_flow_put(priv, flow);
  out:
 +      mlx5_esw_put(priv->mdev);
 +      mlx5_esw_release(priv->mdev);
        return err;
  }
  
@@@ -4412,7 -4378,6 +4444,7 @@@ int mlx5e_delete_flower(struct net_devi
        trace_mlx5e_delete_flower(f);
        mlx5e_flow_put(priv, flow);
  
 +      mlx5_esw_put(priv->mdev);
        return 0;
  
  errout:
@@@ -4512,7 -4477,8 +4544,8 @@@ static int apply_police_params(struct m
         */
        if (rate) {
                rate = (rate * BITS_PER_BYTE) + 500000;
-               rate_mbps = max_t(u64, do_div(rate, 1000000), 1);
+               do_div(rate, 1000000);
+               rate_mbps = max_t(u32, rate, 1);
        }
  
        err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps);
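
The police-rate hunk above matters because do_div() divides its first argument in place and evaluates to the remainder, so the old max_t(u64, do_div(rate, 1000000), 1) fed the remainder rather than the Mbit/s quotient into the clamp. A userspace sketch with a local stand-in for the kernel macro:

#include <stdint.h>
#include <stdio.h>

/* Local stand-in for the kernel do_div(): divides *n in place by base and
 * returns the remainder, which is exactly why its return value must not be
 * treated as the quotient.
 */
static uint32_t sketch_do_div(uint64_t *n, uint32_t base)
{
	uint32_t rem = (uint32_t)(*n % base);

	*n /= base;
	return rem;
}

int main(void)
{
	uint64_t bytes_ps = 125000000ULL;		/* 1 Gbit/s expressed in bytes/s */
	uint64_t rate = bytes_ps * 8 + 500000;		/* bits/s plus rounding term */
	uint64_t old_style = rate;
	uint32_t wrong, right;

	/* old pattern: the remainder ends up being used as the rate */
	wrong = sketch_do_div(&old_style, 1000000);
	wrong = wrong ? wrong : 1;

	/* fixed pattern: divide first, then clamp the quotient to at least 1 */
	sketch_do_div(&rate, 1000000);
	right = rate ? (uint32_t)rate : 1;

	printf("wrong=%u Mbit/s  right=%u Mbit/s\n", wrong, right);
	return 0;
}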
@@@ -4547,10 -4513,6 +4580,10 @@@ static int scan_tc_matchall_fdb_actions
        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                case FLOW_ACTION_POLICE:
 +                      if (act->police.rate_pkt_ps) {
 +                              NL_SET_ERR_MSG_MOD(extack, "QoS offload not support packets per second");
 +                              return -EOPNOTSUPP;
 +                      }
                        err = apply_police_params(priv, act->police.rate_bytes_ps, extack);
                        if (err)
                                return err;
@@@ -4717,6 -4679,10 +4750,6 @@@ int mlx5e_tc_nic_init(struct mlx5e_pri
  
        tc->ct = mlx5_tc_ct_init(priv, tc->chains, &priv->fs.tc.mod_hdr,
                                 MLX5_FLOW_NAMESPACE_KERNEL);
 -      if (IS_ERR(tc->ct)) {
 -              err = PTR_ERR(tc->ct);
 -              goto err_ct;
 -      }
  
        tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event;
        err = register_netdevice_notifier_dev_net(priv->netdev,
  
  err_reg:
        mlx5_tc_ct_clean(tc->ct);
 -err_ct:
        mlx5_chains_destroy(tc->chains);
  err_chains:
        rhashtable_destroy(&tc->ht);
@@@ -4790,6 -4757,8 +4823,6 @@@ int mlx5e_tc_esw_init(struct rhashtabl
                                               esw_chains(esw),
                                               &esw->offloads.mod_hdr,
                                               MLX5_FLOW_NAMESPACE_FDB);
 -      if (IS_ERR(uplink_priv->ct_priv))
 -              goto err_ct;
  
        mapping = mapping_create(sizeof(struct tunnel_match_key),
                                 TUNNEL_INFO_BITS_MASK, true);
@@@ -4829,6 -4798,7 +4862,6 @@@ err_enc_opts_mapping
        mapping_destroy(uplink_priv->tunnel_mapping);
  err_tun_mapping:
        mlx5_tc_ct_clean(uplink_priv->ct_priv);
 -err_ct:
        netdev_warn(priv->netdev,
                    "Failed to initialize tc (eswitch), err: %d", err);
        return err;
@@@ -4901,17 -4871,9 +4934,17 @@@ static int mlx5e_setup_tc_cls_flower(st
  int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
                            void *cb_priv)
  {
 -      unsigned long flags = MLX5_TC_FLAG(INGRESS) | MLX5_TC_FLAG(NIC_OFFLOAD);
 +      unsigned long flags = MLX5_TC_FLAG(INGRESS);
        struct mlx5e_priv *priv = cb_priv;
  
 +      if (!priv->netdev || !netif_device_present(priv->netdev))
 +              return -EOPNOTSUPP;
 +
 +      if (mlx5e_is_uplink_rep(priv))
 +              flags |= MLX5_TC_FLAG(ESW_OFFLOAD);
 +      else
 +              flags |= MLX5_TC_FLAG(NIC_OFFLOAD);
 +
        switch (type) {
        case TC_SETUP_CLSFLOWER:
                return mlx5e_setup_tc_cls_flower(priv, type_data, flags);
@@@ -40,6 -40,7 +40,6 @@@
  #include "eswitch.h"
  #include "esw/indir_table.h"
  #include "esw/acl/ofld.h"
 -#include "esw/indir_table.h"
  #include "rdma.h"
  #include "en.h"
  #include "fs_core.h"
@@@ -550,7 -551,8 +550,8 @@@ esw_setup_dests(struct mlx5_flow_destin
  
        if (!mlx5_eswitch_termtbl_required(esw, attr, flow_act, spec) &&
            MLX5_CAP_GEN(esw_attr->in_mdev, reg_c_preserve) &&
-           mlx5_eswitch_vport_match_metadata_enabled(esw))
+           mlx5_eswitch_vport_match_metadata_enabled(esw) &&
+           MLX5_CAP_ESW_FLOWTABLE_FDB(esw->dev, ignore_flow_level))
                attr->flags |= MLX5_ESW_ATTR_FLAG_SRC_REWRITE;
  
        if (attr->dest_ft) {
@@@ -1445,7 -1447,7 +1446,7 @@@ esw_add_restore_rule(struct mlx5_eswitc
        if (!mlx5_eswitch_reg_c1_loopback_supported(esw))
                return ERR_PTR(-EOPNOTSUPP);
  
 -      spec = kzalloc(sizeof(*spec), GFP_KERNEL);
 +      spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
        if (!spec)
                return ERR_PTR(-ENOMEM);
  
        dest.ft = esw->offloads.ft_offloads;
  
        flow_rule = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
 -      kfree(spec);
 +      kvfree(spec);
  
        if (IS_ERR(flow_rule))
                esw_warn(esw->dev,
@@@ -1853,7 -1855,6 +1854,7 @@@ static void esw_destroy_offloads_fdb_ta
        /* Holds true only as long as DMFS is the default */
        mlx5_flow_namespace_set_mode(esw->fdb_table.offloads.ns,
                                     MLX5_FLOW_STEERING_MODE_DMFS);
 +      atomic64_set(&esw->user_count, 0);
  }
  
  static int esw_create_offloads_table(struct mlx5_eswitch *esw)
@@@ -2259,11 -2260,9 +2260,11 @@@ int esw_offloads_load_rep(struct mlx5_e
        if (esw->mode != MLX5_ESWITCH_OFFLOADS)
                return 0;
  
 -      err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
 -      if (err)
 -              return err;
 +      if (vport_num != MLX5_VPORT_UPLINK) {
 +              err = mlx5_esw_offloads_devlink_port_register(esw, vport_num);
 +              if (err)
 +                      return err;
 +      }
  
        err = mlx5_esw_offloads_rep_load(esw, vport_num);
        if (err)
        return err;
  
  load_err:
 -      mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
 +      if (vport_num != MLX5_VPORT_UPLINK)
 +              mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
        return err;
  }
  
@@@ -2282,9 -2280,7 +2283,9 @@@ void esw_offloads_unload_rep(struct mlx
                return;
  
        mlx5_esw_offloads_rep_unload(esw, vport_num);
 -      mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
 +
 +      if (vport_num != MLX5_VPORT_UPLINK)
 +              mlx5_esw_offloads_devlink_port_unregister(esw, vport_num);
  }
  
  #define ESW_OFFLOADS_DEVCOM_PAIR      (0)
@@@ -2559,9 -2555,6 +2560,9 @@@ static int esw_create_uplink_offloads_a
        struct mlx5_vport *vport;
  
        vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
 +      if (IS_ERR(vport))
 +              return PTR_ERR(vport);
 +
        return esw_vport_create_offloads_acl_tables(esw, vport);
  }
  
@@@ -2570,9 -2563,6 +2571,9 @@@ static void esw_destroy_uplink_offloads
        struct mlx5_vport *vport;
  
        vport = mlx5_eswitch_get_vport(esw, MLX5_VPORT_UPLINK);
 +      if (IS_ERR(vport))
 +              return;
 +
        esw_vport_destroy_offloads_acl_tables(esw, vport);
  }
  
@@@ -2584,7 -2574,6 +2585,7 @@@ static int esw_offloads_steering_init(s
        memset(&esw->fdb_table.offloads, 0, sizeof(struct offloads_fdb));
        mutex_init(&esw->fdb_table.offloads.vports.lock);
        hash_init(esw->fdb_table.offloads.vports.table);
 +      atomic64_set(&esw->user_count, 0);
  
        indir = mlx5_esw_indir_table_init();
        if (IS_ERR(indir)) {
@@@ -2926,14 -2915,8 +2927,14 @@@ int mlx5_devlink_eswitch_mode_set(struc
        if (esw_mode_from_devlink(mode, &mlx5_mode))
                return -EINVAL;
  
 -      mutex_lock(&esw->mode_lock);
 -      cur_mlx5_mode = esw->mode;
 +      err = mlx5_esw_try_lock(esw);
 +      if (err < 0) {
 +              NL_SET_ERR_MSG_MOD(extack, "Can't change mode, E-Switch is busy");
 +              return err;
 +      }
 +      cur_mlx5_mode = err;
 +      err = 0;
 +
        if (cur_mlx5_mode == mlx5_mode)
                goto unlock;
  
                err = -EINVAL;
  
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      mlx5_esw_unlock(esw);
        return err;
  }
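
The devlink mode-set hunk above replaces an unconditional mutex_lock() with mlx5_esw_try_lock(), which hands back the current eswitch mode when the lock is taken and a negative error when the eswitch is busy, so a mode change fails fast instead of blocking. A minimal sketch of that return-value convention (the names below are placeholders, not the mlx5 API):

#include <stdio.h>

#define MODE_LEGACY    0
#define MODE_SWITCHDEV 1
#define ERR_BUSY       (-16)

struct esw_sketch {
	int mode;
	int busy;	/* models outstanding users that block a mode change */
};

/* Either "take the lock" and return the current (non-negative) mode, or
 * report busy with a negative error, like the convention used above.
 */
static int try_lock_sketch(struct esw_sketch *esw)
{
	if (esw->busy)
		return ERR_BUSY;
	return esw->mode;
}

static int set_mode_sketch(struct esw_sketch *esw, int new_mode)
{
	int cur = try_lock_sketch(esw);

	if (cur < 0)
		return cur;		/* tell the caller to retry later */
	if (cur != new_mode)
		esw->mode = new_mode;	/* the actual switch happens under the lock */
	return 0;			/* the unlock would happen here */
}

int main(void)
{
	struct esw_sketch esw = { .mode = MODE_LEGACY, .busy = 0 };

	printf("switch: %d (mode now %d)\n",
	       set_mode_sketch(&esw, MODE_SWITCHDEV), esw.mode);
	esw.busy = 1;
	printf("busy:   %d\n", set_mode_sketch(&esw, MODE_LEGACY));
	return 0;
}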
  
@@@ -2958,14 -2941,14 +2959,14 @@@ int mlx5_devlink_eswitch_mode_get(struc
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        err = esw_mode_to_devlink(esw->mode, mode);
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -2981,7 -2964,7 +2982,7 @@@ int mlx5_devlink_eswitch_inline_mode_se
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto out;
        }
  
        esw->offloads.inline_mode = mlx5_mode;
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return 0;
  
  revert_inline_mode:
                                                 vport,
                                                 esw->offloads.inline_mode);
  out:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3043,14 -3026,14 +3044,14 @@@ int mlx5_devlink_eswitch_inline_mode_ge
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        err = esw_inline_mode_to_devlink(esw->offloads.inline_mode, mode);
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3066,7 -3049,7 +3067,7 @@@ int mlx5_devlink_eswitch_encap_mode_set
        if (IS_ERR(esw))
                return PTR_ERR(esw);
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
        }
  
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return err;
  }
  
@@@ -3127,14 -3110,14 +3128,14 @@@ int mlx5_devlink_eswitch_encap_mode_get
                return PTR_ERR(esw);
  
  
 -      mutex_lock(&esw->mode_lock);
 +      down_write(&esw->mode_lock);
        err = eswitch_devlink_esw_mode_check(esw);
        if (err)
                goto unlock;
  
        *encap = esw->offloads.encap;
  unlock:
 -      mutex_unlock(&esw->mode_lock);
 +      up_write(&esw->mode_lock);
        return 0;
  }
  
@@@ -233,6 -233,7 +233,7 @@@ int mlx5i_create_underlay_qp(struct mlx
        }
  
        qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
+       MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(priv->mdev));
        MLX5_SET(qpc, qpc, st, MLX5_QP_ST_UD);
        MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
        MLX5_SET(qpc, qpc, ulp_stateless_offload_mode,
@@@ -694,6 -695,7 +695,7 @@@ static int mlx5i_check_required_hca_cap
  static void mlx5_rdma_netdev_free(struct net_device *netdev)
  {
        struct mlx5e_priv *priv = mlx5i_epriv(netdev);
+       struct mlx5_core_dev *mdev = priv->mdev;
        struct mlx5i_priv *ipriv = priv->ppriv;
        const struct mlx5e_profile *profile = priv->profile;
  
  
        if (!ipriv->sub_interface) {
                mlx5i_pkey_qpn_ht_cleanup(netdev);
-               mlx5e_destroy_mdev_resources(priv->mdev);
+               mlx5e_destroy_mdev_resources(mdev);
        }
  }
  
  static bool mlx5_is_sub_interface(struct mlx5_core_dev *mdev)
  {
 -      return mdev->mlx5e_res.pdn != 0;
 +      return mdev->mlx5e_res.hw_objs.pdn != 0;
  }
  
  static const struct mlx5e_profile *mlx5_get_profile(struct mlx5_core_dev *mdev)
@@@ -5,7 -5,8 +5,9 @@@
  #include "priv.h"
  #include "sf.h"
  #include "mlx5_ifc_vhca_event.h"
 +#include "ecpf.h"
+ #include "vhca_event.h"
+ #include "mlx5_core.h"
  
  struct mlx5_sf_hw {
        u32 usr_sfnum;
@@@ -17,7 -18,6 +19,6 @@@ struct mlx5_sf_hw_table 
        struct mlx5_core_dev *dev;
        struct mlx5_sf_hw *sfs;
        int max_local_functions;
-       u8 ecpu: 1;
        struct mutex table_lock; /* Serializes sf deletion and vhca state change handler. */
        struct notifier_block vhca_nb;
  };
@@@ -63,7 -63,7 +64,7 @@@ int mlx5_sf_hw_table_sf_alloc(struct ml
        }
        if (sw_id == -ENOSPC) {
                err = -ENOSPC;
-               goto err;
+               goto exist_err;
        }
  
        hw_fn_id = mlx5_sf_sw_to_hw_id(table->dev, sw_id);
@@@ -71,7 -71,7 +72,7 @@@
        if (err)
                goto err;
  
-       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, table->ecpu, usr_sfnum);
+       err = mlx5_modify_vhca_sw_id(dev, hw_fn_id, usr_sfnum);
        if (err)
                goto vhca_err;
  
@@@ -117,7 -117,7 +118,7 @@@ void mlx5_sf_hw_table_sf_deferred_free(
  
        hw_fn_id = mlx5_sf_sw_to_hw_id(dev, id);
        mutex_lock(&table->table_lock);
-       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, table->ecpu, out, sizeof(out));
+       err = mlx5_cmd_query_vhca_state(dev, hw_fn_id, out, sizeof(out));
        if (err)
                goto err;
        state = MLX5_GET(query_vhca_state_out, out, vhca_state_context.vhca_state);
@@@ -163,7 -163,6 +164,6 @@@ int mlx5_sf_hw_table_init(struct mlx5_c
        table->dev = dev;
        table->sfs = sfs;
        table->max_local_functions = max_functions;
-       table->ecpu = mlx5_read_embedded_cpu(dev);
        dev->priv.sf_hw_table = table;
        mlx5_core_dbg(dev, "SF HW table: max sfs = %d\n", max_functions);
        return 0;
@@@ -264,8 -264,8 +264,8 @@@ static void dr_ste_v1_set_miss_addr(u8 
  static u64 dr_ste_v1_get_miss_addr(u8 *hw_ste_p)
  {
        u64 index =
-               (MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
-                MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32) << 26);
+               ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_31_6) |
+                ((u64)MLX5_GET(ste_match_bwc_v1, hw_ste_p, miss_address_39_32)) << 26);
  
        return index << 6;
  }
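
The dr_ste_v1_get_miss_addr() fix above is needed because the two MLX5_GET() reads are 32-bit values: OR-ing the high byte shifted left by 26 in 32-bit arithmetic drops the upper bits before the result is widened to u64. A small userspace illustration with made-up register values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Hypothetical register contents: low 26 bits and high 8 bits of a
	 * 40-bit miss address, each read back as a 32-bit value.
	 */
	unsigned int lo26 = 0x2abcdef;
	unsigned int hi8  = 0xff;

	/* Without casts the shift and the OR happen in 32-bit arithmetic,
	 * so the high byte is lost before the result reaches the u64.
	 */
	uint64_t truncated = lo26 | (hi8 << 26);
	uint64_t correct   = (uint64_t)lo26 | ((uint64_t)hi8 << 26);

	printf("truncated=0x%llx correct=0x%llx\n",
	       (unsigned long long)(truncated << 6),
	       (unsigned long long)(correct << 6));
	return 0;
}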
@@@ -437,6 -437,21 +437,6 @@@ static void dr_ste_v1_set_rx_decap(u8 *
        dr_ste_v1_set_reparse(hw_ste_p);
  }
  
 -static void dr_ste_v1_set_rx_decap_l3(u8 *hw_ste_p,
 -                                    u8 *s_action,
 -                                    u16 decap_actions,
 -                                    u32 decap_index)
 -{
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, action_id,
 -               DR_STE_V1_ACTION_ID_MODIFY_LIST);
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, num_of_modify_actions,
 -               decap_actions);
 -      MLX5_SET(ste_single_action_modify_list_v1, s_action, modify_actions_ptr,
 -               decap_index);
 -
 -      dr_ste_v1_set_reparse(hw_ste_p);
 -}
 -
  static void dr_ste_v1_set_rewrite_actions(u8 *hw_ste_p,
                                          u8 *s_action,
                                          u16 num_of_actions,
@@@ -556,6 -571,9 +556,6 @@@ static void dr_ste_v1_set_actions_rx(st
        bool allow_ctr = true;
  
        if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
 -              dr_ste_v1_set_rx_decap_l3(last_ste, action,
 -                                        attr->decap_actions,
 -                                        attr->decap_index);
                dr_ste_v1_set_rewrite_actions(last_ste, action,
                                              attr->decap_actions,
                                              attr->decap_index);
@@@ -1514,7 -1532,6 +1514,7 @@@ static void dr_ste_v1_build_src_gvmi_qp
  
        DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_gvmi, misc_mask, source_port);
        DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_qp, misc_mask, source_sqn);
 +      misc_mask->source_eswitch_owner_vhca_id = 0;
  }
  
  static int dr_ste_v1_build_src_gvmi_qpn_tag(struct mlx5dr_match_param *value,
  #include "ionic_lif.h"
  #include "ionic_txrx.h"
  
 -static void ionic_rx_clean(struct ionic_queue *q,
 -                         struct ionic_desc_info *desc_info,
 -                         struct ionic_cq_info *cq_info,
 -                         void *cb_arg);
 -
 -static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
  
  static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info);
  
@@@ -34,149 -40,72 +34,149 @@@ static inline struct netdev_queue *q_to
        return netdev_get_tx_queue(q->lif->netdev, q->index);
  }
  
 -static struct sk_buff *ionic_rx_skb_alloc(struct ionic_queue *q,
 -                                        unsigned int len, bool frags)
 +static void ionic_rx_buf_reset(struct ionic_buf_info *buf_info)
 +{
 +      buf_info->page = NULL;
 +      buf_info->page_offset = 0;
 +      buf_info->dma_addr = 0;
 +}
 +
 +static int ionic_rx_page_alloc(struct ionic_queue *q,
 +                             struct ionic_buf_info *buf_info)
  {
 -      struct ionic_lif *lif = q->lif;
 +      struct net_device *netdev = q->lif->netdev;
        struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
 -      struct sk_buff *skb;
 +      struct device *dev;
  
 -      netdev = lif->netdev;
 -      stats = &q->lif->rxqstats[q->index];
 +      dev = q->dev;
 +      stats = q_to_rx_stats(q);
  
 -      if (frags)
 -              skb = napi_get_frags(&q_to_qcq(q)->napi);
 -      else
 -              skb = netdev_alloc_skb_ip_align(netdev, len);
 +      if (unlikely(!buf_info)) {
 +              net_err_ratelimited("%s: %s invalid buf_info in alloc\n",
 +                                  netdev->name, q->name);
 +              return -EINVAL;
 +      }
  
 -      if (unlikely(!skb)) {
 -              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 -                                   netdev->name, q->name);
 +      buf_info->page = alloc_pages(IONIC_PAGE_GFP_MASK, 0);
 +      if (unlikely(!buf_info->page)) {
 +              net_err_ratelimited("%s: %s page alloc failed\n",
 +                                  netdev->name, q->name);
                stats->alloc_err++;
 -              return NULL;
 +              return -ENOMEM;
        }
 +      buf_info->page_offset = 0;
  
 -      return skb;
 +      buf_info->dma_addr = dma_map_page(dev, buf_info->page, buf_info->page_offset,
 +                                        IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +      if (unlikely(dma_mapping_error(dev, buf_info->dma_addr))) {
 +              __free_pages(buf_info->page, 0);
 +              ionic_rx_buf_reset(buf_info);
 +              net_err_ratelimited("%s: %s dma map failed\n",
 +                                  netdev->name, q->name);
 +              stats->dma_map_err++;
 +              return -EIO;
 +      }
 +
 +      return 0;
 +}
 +
 +static void ionic_rx_page_free(struct ionic_queue *q,
 +                             struct ionic_buf_info *buf_info)
 +{
 +      struct net_device *netdev = q->lif->netdev;
 +      struct device *dev = q->dev;
 +
 +      if (unlikely(!buf_info)) {
 +              net_err_ratelimited("%s: %s invalid buf_info in free\n",
 +                                  netdev->name, q->name);
 +              return;
 +      }
 +
 +      if (!buf_info->page)
 +              return;
 +
 +      dma_unmap_page(dev, buf_info->dma_addr, IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +      __free_pages(buf_info->page, 0);
 +      ionic_rx_buf_reset(buf_info);
 +}
 +
 +static bool ionic_rx_buf_recycle(struct ionic_queue *q,
 +                               struct ionic_buf_info *buf_info, u32 used)
 +{
 +      u32 size;
 +
 +      /* don't re-use pages allocated in low-mem condition */
 +      if (page_is_pfmemalloc(buf_info->page))
 +              return false;
 +
 +      /* don't re-use buffers from non-local numa nodes */
 +      if (page_to_nid(buf_info->page) != numa_mem_id())
 +              return false;
 +
 +      size = ALIGN(used, IONIC_PAGE_SPLIT_SZ);
 +      buf_info->page_offset += size;
 +      if (buf_info->page_offset >= IONIC_PAGE_SIZE)
 +              return false;
 +
 +      get_page(buf_info->page);
 +
 +      return true;
  }
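
The recycle helper above implements the new page-splitting scheme: a received buffer's page offset is advanced by the used length rounded up to IONIC_PAGE_SPLIT_SZ, and the page is reused with an extra reference until the offset runs past IONIC_PAGE_SIZE (pfmemalloc and remote-NUMA pages are never reused). A toy model of the offset-advance decision, with made-up sizes:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ       4096u	/* stand-in for IONIC_PAGE_SIZE */
#define PAGE_SPLIT_SZ 2048u	/* stand-in for IONIC_PAGE_SPLIT_SZ */

struct buf {
	uint32_t page_offset;
	int refs;
};

/* Round x up to the next multiple of a power-of-two 'a', like ALIGN(). */
static uint32_t align_up(uint32_t x, uint32_t a)
{
	return (x + a - 1) & ~(a - 1);
}

/* Advance into the page by the aligned used size; recycle (take a ref)
 * only while the next chunk still fits inside the page.
 */
static bool buf_recycle(struct buf *b, uint32_t used)
{
	b->page_offset += align_up(used, PAGE_SPLIT_SZ);
	if (b->page_offset >= PAGE_SZ)
		return false;	/* page exhausted, caller unmaps and refills */
	b->refs++;		/* models get_page() on the recycled half */
	return true;
}

int main(void)
{
	struct buf b = { .page_offset = 0, .refs = 1 };

	printf("first  use: recycle=%d offset=%u\n", buf_recycle(&b, 1500), b.page_offset);
	printf("second use: recycle=%d offset=%u\n", buf_recycle(&b, 1500), b.page_offset);
	return 0;
}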
  
  static struct sk_buff *ionic_rx_frags(struct ionic_queue *q,
                                      struct ionic_desc_info *desc_info,
 -                                    struct ionic_cq_info *cq_info)
 +                                    struct ionic_rxq_comp *comp)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      struct ionic_page_info *page_info;
 +      struct net_device *netdev = q->lif->netdev;
 +      struct ionic_buf_info *buf_info;
 +      struct ionic_rx_stats *stats;
 +      struct device *dev = q->dev;
        struct sk_buff *skb;
        unsigned int i;
        u16 frag_len;
        u16 len;
  
 -      page_info = &desc_info->pages[0];
 +      stats = q_to_rx_stats(q);
 +
 +      buf_info = &desc_info->bufs[0];
        len = le16_to_cpu(comp->len);
  
 -      prefetch(page_address(page_info->page) + NET_IP_ALIGN);
 +      prefetch(buf_info->page);
  
 -      skb = ionic_rx_skb_alloc(q, len, true);
 -      if (unlikely(!skb))
 +      skb = napi_get_frags(&q_to_qcq(q)->napi);
 +      if (unlikely(!skb)) {
 +              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 +                                   netdev->name, q->name);
 +              stats->alloc_err++;
                return NULL;
 +      }
  
        i = comp->num_sg_elems + 1;
        do {
 -              if (unlikely(!page_info->page)) {
 -                      struct napi_struct *napi = &q_to_qcq(q)->napi;
 -
 -                      napi->skb = NULL;
 +              if (unlikely(!buf_info->page)) {
                        dev_kfree_skb(skb);
                        return NULL;
                }
  
 -              frag_len = min(len, (u16)PAGE_SIZE);
 +              frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
                len -= frag_len;
  
 -              dma_unmap_page(dev, dma_unmap_addr(page_info, dma_addr),
 -                             PAGE_SIZE, DMA_FROM_DEVICE);
 +              dma_sync_single_for_cpu(dev,
 +                                      buf_info->dma_addr + buf_info->page_offset,
 +                                      frag_len, DMA_FROM_DEVICE);
 +
                skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags,
 -                              page_info->page, 0, frag_len, PAGE_SIZE);
 -              page_info->page = NULL;
 -              page_info++;
 +                              buf_info->page, buf_info->page_offset, frag_len,
 +                              IONIC_PAGE_SIZE);
 +
 +              if (!ionic_rx_buf_recycle(q, buf_info, frag_len)) {
 +                      dma_unmap_page(dev, buf_info->dma_addr,
 +                                     IONIC_PAGE_SIZE, DMA_FROM_DEVICE);
 +                      ionic_rx_buf_reset(buf_info);
 +              }
 +
 +              buf_info++;
 +
                i--;
        } while (i > 0);
  
  
  static struct sk_buff *ionic_rx_copybreak(struct ionic_queue *q,
                                          struct ionic_desc_info *desc_info,
 -                                        struct ionic_cq_info *cq_info)
 +                                        struct ionic_rxq_comp *comp)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      struct ionic_page_info *page_info;
 +      struct net_device *netdev = q->lif->netdev;
 +      struct ionic_buf_info *buf_info;
 +      struct ionic_rx_stats *stats;
 +      struct device *dev = q->dev;
        struct sk_buff *skb;
        u16 len;
  
 -      page_info = &desc_info->pages[0];
 +      stats = q_to_rx_stats(q);
 +
 +      buf_info = &desc_info->bufs[0];
        len = le16_to_cpu(comp->len);
  
 -      skb = ionic_rx_skb_alloc(q, len, false);
 -      if (unlikely(!skb))
 +      skb = napi_alloc_skb(&q_to_qcq(q)->napi, len);
 +      if (unlikely(!skb)) {
 +              net_warn_ratelimited("%s: SKB alloc failed on %s!\n",
 +                                   netdev->name, q->name);
 +              stats->alloc_err++;
                return NULL;
 +      }
  
 -      if (unlikely(!page_info->page)) {
 +      if (unlikely(!buf_info->page)) {
                dev_kfree_skb(skb);
                return NULL;
        }
  
 -      dma_sync_single_for_cpu(dev, dma_unmap_addr(page_info, dma_addr),
 +      dma_sync_single_for_cpu(dev, buf_info->dma_addr + buf_info->page_offset,
                                len, DMA_FROM_DEVICE);
 -      skb_copy_to_linear_data(skb, page_address(page_info->page), len);
 -      dma_sync_single_for_device(dev, dma_unmap_addr(page_info, dma_addr),
 +      skb_copy_to_linear_data(skb, page_address(buf_info->page) + buf_info->page_offset, len);
 +      dma_sync_single_for_device(dev, buf_info->dma_addr + buf_info->page_offset,
                                   len, DMA_FROM_DEVICE);
  
        skb_put(skb, len);
@@@ -229,13 -151,14 +229,13 @@@ static void ionic_rx_clean(struct ionic
                           struct ionic_cq_info *cq_info,
                           void *cb_arg)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 +      struct ionic_rxq_comp *comp = cq_info->rxcq;
 +      struct net_device *netdev = q->lif->netdev;
        struct ionic_qcq *qcq = q_to_qcq(q);
        struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
        struct sk_buff *skb;
  
        stats = q_to_rx_stats(q);
 -      netdev = q->lif->netdev;
  
        if (comp->status) {
                stats->dropped++;
        stats->bytes += le16_to_cpu(comp->len);
  
        if (le16_to_cpu(comp->len) <= q->lif->rx_copybreak)
 -              skb = ionic_rx_copybreak(q, desc_info, cq_info);
 +              skb = ionic_rx_copybreak(q, desc_info, comp);
        else
 -              skb = ionic_rx_frags(q, desc_info, cq_info);
 +              skb = ionic_rx_frags(q, desc_info, comp);
  
        if (unlikely(!skb)) {
                stats->dropped++;
  
  static bool ionic_rx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
  {
 -      struct ionic_rxq_comp *comp = cq_info->cq_desc;
 +      struct ionic_rxq_comp *comp = cq_info->rxcq;
        struct ionic_queue *q = cq->bound_q;
        struct ionic_desc_info *desc_info;
  
        return true;
  }
  
 -static int ionic_rx_page_alloc(struct ionic_queue *q,
 -                             struct ionic_page_info *page_info)
 -{
 -      struct ionic_lif *lif = q->lif;
 -      struct ionic_rx_stats *stats;
 -      struct net_device *netdev;
 -      struct device *dev;
 -
 -      netdev = lif->netdev;
 -      dev = lif->ionic->dev;
 -      stats = q_to_rx_stats(q);
 -
 -      if (unlikely(!page_info)) {
 -              net_err_ratelimited("%s: %s invalid page_info in alloc\n",
 -                                  netdev->name, q->name);
 -              return -EINVAL;
 -      }
 -
 -      page_info->page = dev_alloc_page();
 -      if (unlikely(!page_info->page)) {
 -              net_err_ratelimited("%s: %s page alloc failed\n",
 -                                  netdev->name, q->name);
 -              stats->alloc_err++;
 -              return -ENOMEM;
 -      }
 -
 -      page_info->dma_addr = dma_map_page(dev, page_info->page, 0, PAGE_SIZE,
 -                                         DMA_FROM_DEVICE);
 -      if (unlikely(dma_mapping_error(dev, page_info->dma_addr))) {
 -              put_page(page_info->page);
 -              page_info->dma_addr = 0;
 -              page_info->page = NULL;
 -              net_err_ratelimited("%s: %s dma map failed\n",
 -                                  netdev->name, q->name);
 -              stats->dma_map_err++;
 -              return -EIO;
 -      }
 -
 -      return 0;
 -}
 -
 -static void ionic_rx_page_free(struct ionic_queue *q,
 -                             struct ionic_page_info *page_info)
 -{
 -      struct ionic_lif *lif = q->lif;
 -      struct net_device *netdev;
 -      struct device *dev;
 -
 -      netdev = lif->netdev;
 -      dev = lif->ionic->dev;
 -
 -      if (unlikely(!page_info)) {
 -              net_err_ratelimited("%s: %s invalid page_info in free\n",
 -                                  netdev->name, q->name);
 -              return;
 -      }
 -
 -      if (unlikely(!page_info->page)) {
 -              net_err_ratelimited("%s: %s invalid page in free\n",
 -                                  netdev->name, q->name);
 -              return;
 -      }
 -
 -      dma_unmap_page(dev, page_info->dma_addr, PAGE_SIZE, DMA_FROM_DEVICE);
 -
 -      put_page(page_info->page);
 -      page_info->dma_addr = 0;
 -      page_info->page = NULL;
 -}
 -
  void ionic_rx_fill(struct ionic_queue *q)
  {
        struct net_device *netdev = q->lif->netdev;
        struct ionic_desc_info *desc_info;
 -      struct ionic_page_info *page_info;
        struct ionic_rxq_sg_desc *sg_desc;
        struct ionic_rxq_sg_elem *sg_elem;
 +      struct ionic_buf_info *buf_info;
        struct ionic_rxq_desc *desc;
        unsigned int remain_len;
 -      unsigned int seg_len;
 +      unsigned int frag_len;
        unsigned int nfrags;
        unsigned int i, j;
        unsigned int len;
  
        len = netdev->mtu + ETH_HLEN + VLAN_HLEN;
 -      nfrags = round_up(len, PAGE_SIZE) / PAGE_SIZE;
  
        for (i = ionic_q_space_avail(q); i; i--) {
 +              nfrags = 0;
                remain_len = len;
                desc_info = &q->info[q->head_idx];
                desc = desc_info->desc;
 -              sg_desc = desc_info->sg_desc;
 -              page_info = &desc_info->pages[0];
 +              buf_info = &desc_info->bufs[0];
  
 -              if (page_info->page) { /* recycle the buffer */
 -                      ionic_rxq_post(q, false, ionic_rx_clean, NULL);
 -                      continue;
 -              }
 -
 -              /* fill main descriptor - pages[0] */
 -              desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
 -                                            IONIC_RXQ_DESC_OPCODE_SIMPLE;
 -              desc_info->npages = nfrags;
 -              if (unlikely(ionic_rx_page_alloc(q, page_info))) {
 -                      desc->addr = 0;
 -                      desc->len = 0;
 -                      return;
 +              if (!buf_info->page) { /* alloc a new buffer? */
 +                      if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
 +                              desc->addr = 0;
 +                              desc->len = 0;
 +                              return;
 +                      }
                }
 -              desc->addr = cpu_to_le64(page_info->dma_addr);
 -              seg_len = min_t(unsigned int, PAGE_SIZE, len);
 -              desc->len = cpu_to_le16(seg_len);
 -              remain_len -= seg_len;
 -              page_info++;
  
 -              /* fill sg descriptors - pages[1..n] */
 -              for (j = 0; j < nfrags - 1; j++) {
 -                      if (page_info->page) /* recycle the sg buffer */
 -                              continue;
 +              /* fill main descriptor - buf[0] */
 +              desc->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
 +              frag_len = min_t(u16, len, IONIC_PAGE_SIZE - buf_info->page_offset);
 +              desc->len = cpu_to_le16(frag_len);
 +              remain_len -= frag_len;
 +              buf_info++;
 +              nfrags++;
  
 +              /* fill sg descriptors - buf[1..n] */
 +              sg_desc = desc_info->sg_desc;
 +              for (j = 0; remain_len > 0 && j < q->max_sg_elems; j++) {
                        sg_elem = &sg_desc->elems[j];
 -                      if (unlikely(ionic_rx_page_alloc(q, page_info))) {
 -                              sg_elem->addr = 0;
 -                              sg_elem->len = 0;
 -                              return;
 +                      if (!buf_info->page) { /* alloc a new sg buffer? */
 +                              if (unlikely(ionic_rx_page_alloc(q, buf_info))) {
 +                                      sg_elem->addr = 0;
 +                                      sg_elem->len = 0;
 +                                      return;
 +                              }
                        }
 -                      sg_elem->addr = cpu_to_le64(page_info->dma_addr);
 -                      seg_len = min_t(unsigned int, PAGE_SIZE, remain_len);
 -                      sg_elem->len = cpu_to_le16(seg_len);
 -                      remain_len -= seg_len;
 -                      page_info++;
 +
 +                      sg_elem->addr = cpu_to_le64(buf_info->dma_addr + buf_info->page_offset);
 +                      frag_len = min_t(u16, remain_len, IONIC_PAGE_SIZE - buf_info->page_offset);
 +                      sg_elem->len = cpu_to_le16(frag_len);
 +                      remain_len -= frag_len;
 +                      buf_info++;
 +                      nfrags++;
 +              }
 +
 +              /* clear end sg element as a sentinel */
 +              if (j < q->max_sg_elems) {
 +                      sg_elem = &sg_desc->elems[j];
 +                      memset(sg_elem, 0, sizeof(*sg_elem));
                }
  
 +              desc->opcode = (nfrags > 1) ? IONIC_RXQ_DESC_OPCODE_SG :
 +                                            IONIC_RXQ_DESC_OPCODE_SIMPLE;
 +              desc_info->nbufs = nfrags;
 +
                ionic_rxq_post(q, false, ionic_rx_clean, NULL);
        }
  
  void ionic_rx_empty(struct ionic_queue *q)
  {
        struct ionic_desc_info *desc_info;
 -      struct ionic_page_info *page_info;
 +      struct ionic_buf_info *buf_info;
        unsigned int i, j;
  
        for (i = 0; i < q->num_descs; i++) {
                desc_info = &q->info[i];
                for (j = 0; j < IONIC_RX_MAX_SG_ELEMS + 1; j++) {
 -                      page_info = &desc_info->pages[j];
 -                      if (page_info->page)
 -                              ionic_rx_page_free(q, page_info);
 +                      buf_info = &desc_info->bufs[j];
 +                      if (buf_info->page)
 +                              ionic_rx_page_free(q, buf_info);
                }
  
 -              desc_info->npages = 0;
 +              desc_info->nbufs = 0;
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        }
 +
 +      q->head_idx = 0;
 +      q->tail_idx = 0;
  }
  
  static void ionic_dim_update(struct ionic_qcq *qcq)
@@@ -542,7 -525,7 +542,7 @@@ int ionic_txrx_napi(struct napi_struct 
        idev = &lif->ionic->idev;
        txcq = &lif->txqcqs[qi]->cq;
  
 -      tx_work_done = ionic_cq_service(txcq, lif->tx_budget,
 +      tx_work_done = ionic_cq_service(txcq, IONIC_TX_BUDGET_DEFAULT,
                                        ionic_tx_service, NULL, NULL);
  
        rx_work_done = ionic_cq_service(rxcq, budget,
@@@ -575,7 -558,7 +575,7 @@@ static dma_addr_t ionic_tx_map_single(s
                                      void *data, size_t len)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 +      struct device *dev = q->dev;
        dma_addr_t dma_addr;
  
        dma_addr = dma_map_single(dev, data, len, DMA_TO_DEVICE);
@@@ -593,7 -576,7 +593,7 @@@ static dma_addr_t ionic_tx_map_frag(str
                                    size_t offset, size_t len)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 +      struct device *dev = q->dev;
        dma_addr_t dma_addr;
  
        dma_addr = skb_frag_dma_map(dev, frag, offset, len, DMA_TO_DEVICE);
        return dma_addr;
  }
  
 +static int ionic_tx_map_skb(struct ionic_queue *q, struct sk_buff *skb,
 +                          struct ionic_desc_info *desc_info)
 +{
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
 +      struct device *dev = q->dev;
 +      dma_addr_t dma_addr;
 +      unsigned int nfrags;
 +      skb_frag_t *frag;
 +      int frag_idx;
 +
 +      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 +      if (dma_mapping_error(dev, dma_addr))
 +              return -EIO;
 +      buf_info->dma_addr = dma_addr;
 +      buf_info->len = skb_headlen(skb);
 +      buf_info++;
 +
 +      frag = skb_shinfo(skb)->frags;
 +      nfrags = skb_shinfo(skb)->nr_frags;
 +      for (frag_idx = 0; frag_idx < nfrags; frag_idx++, frag++) {
 +              dma_addr = ionic_tx_map_frag(q, frag, 0, skb_frag_size(frag));
 +              if (dma_mapping_error(dev, dma_addr))
 +                      goto dma_fail;
 +              buf_info->dma_addr = dma_addr;
 +              buf_info->len = skb_frag_size(frag);
 +              buf_info++;
 +      }
 +
 +      desc_info->nbufs = 1 + nfrags;
 +
 +      return 0;
 +
 +dma_fail:
 +      /* unwind the frag mappings and the head mapping */
 +      while (frag_idx > 0) {
 +              frag_idx--;
 +              buf_info--;
 +              dma_unmap_page(dev, buf_info->dma_addr,
 +                             buf_info->len, DMA_TO_DEVICE);
 +      }
 +      dma_unmap_single(dev, buf_info->dma_addr, buf_info->len, DMA_TO_DEVICE);
 +      return -EIO;
 +}
 +
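
ionic_tx_map_skb() above maps the skb head and then each fragment, records the DMA address and length of each in the descriptor's buf_info[] array, and on a mapping failure unwinds only the mappings that already succeeded. A simplified stand-alone analog of that map-or-unwind flow (toy_map/toy_unmap are placeholders for the DMA mapping calls):

#include <stdio.h>

#define MAX_BUFS 8

/* Toy mapping table standing in for the per-descriptor buf_info[] array. */
struct toy_buf { int mapped; };

static int toy_map(struct toy_buf *b, int should_fail)
{
	if (should_fail)
		return -1;
	b->mapped = 1;
	return 0;
}

static void toy_unmap(struct toy_buf *b)
{
	b->mapped = 0;
}

/* Map the head plus nfrags fragments; on failure unwind only what was
 * actually mapped, a simplified analog of the dma_fail: path above.
 */
static int map_skb_sketch(struct toy_buf *bufs, int nfrags, int fail_at)
{
	int i;

	if (toy_map(&bufs[0], fail_at == 0))	/* head */
		return -1;

	for (i = 0; i < nfrags; i++) {
		if (toy_map(&bufs[1 + i], fail_at == 1 + i))
			goto unwind;
	}
	return 1 + nfrags;	/* nbufs recorded in the descriptor */

unwind:
	while (i-- > 0)
		toy_unmap(&bufs[1 + i]);	/* fragments, newest first */
	toy_unmap(&bufs[0]);			/* then the head mapping */
	return -1;
}

int main(void)
{
	struct toy_buf bufs[MAX_BUFS] = { 0 };

	printf("ok path:   %d\n", map_skb_sketch(bufs, 3, -1));
	printf("fail path: %d\n", map_skb_sketch(bufs, 3, 2));
	return 0;
}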
  static void ionic_tx_clean(struct ionic_queue *q,
                           struct ionic_desc_info *desc_info,
                           struct ionic_cq_info *cq_info,
                           void *cb_arg)
  {
 -      struct ionic_txq_sg_desc *sg_desc = desc_info->sg_desc;
 -      struct ionic_txq_sg_elem *elem = sg_desc->elems;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct ionic_txq_desc *desc = desc_info->desc;
 -      struct device *dev = q->lif->ionic->dev;
 -      u8 opcode, flags, nsge;
 +      struct device *dev = q->dev;
        u16 queue_index;
        unsigned int i;
 -      u64 addr;
 -
 -      decode_txq_desc_cmd(le64_to_cpu(desc->cmd),
 -                          &opcode, &flags, &nsge, &addr);
 -
 -      /* use unmap_single only if either this is not TSO,
 -       * or this is first descriptor of a TSO
 -       */
 -      if (opcode != IONIC_TXQ_DESC_OPCODE_TSO ||
 -          flags & IONIC_TXQ_DESC_FLAG_TSO_SOT)
 -              dma_unmap_single(dev, (dma_addr_t)addr,
 -                               le16_to_cpu(desc->len), DMA_TO_DEVICE);
 -      else
 -              dma_unmap_page(dev, (dma_addr_t)addr,
 -                             le16_to_cpu(desc->len), DMA_TO_DEVICE);
  
 -      for (i = 0; i < nsge; i++, elem++)
 -              dma_unmap_page(dev, (dma_addr_t)le64_to_cpu(elem->addr),
 -                             le16_to_cpu(elem->len), DMA_TO_DEVICE);
 +      if (desc_info->nbufs) {
 +              dma_unmap_single(dev, (dma_addr_t)buf_info->dma_addr,
 +                               buf_info->len, DMA_TO_DEVICE);
 +              buf_info++;
 +              for (i = 1; i < desc_info->nbufs; i++, buf_info++)
 +                      dma_unmap_page(dev, (dma_addr_t)buf_info->dma_addr,
 +                                     buf_info->len, DMA_TO_DEVICE);
 +      }
  
        if (cb_arg) {
                struct sk_buff *skb = cb_arg;
 -              u32 len = skb->len;
  
                queue_index = skb_get_queue_mapping(skb);
                if (unlikely(__netif_subqueue_stopped(q->lif->netdev,
                        netif_wake_subqueue(q->lif->netdev, queue_index);
                        q->wake++;
                }
 -              dev_kfree_skb_any(skb);
 +
 +              desc_info->bytes = skb->len;
                stats->clean++;
 -              netdev_tx_completed_queue(q_to_ndq(q), 1, len);
 +
 +              dev_consume_skb_any(skb);
        }
  }
  
  static bool ionic_tx_service(struct ionic_cq *cq, struct ionic_cq_info *cq_info)
  {
 -      struct ionic_txq_comp *comp = cq_info->cq_desc;
 +      struct ionic_txq_comp *comp = cq_info->txcq;
        struct ionic_queue *q = cq->bound_q;
        struct ionic_desc_info *desc_info;
 +      int bytes = 0;
 +      int pkts = 0;
        u16 index;
  
        if (!color_match(comp->color, cq->done_color))
         */
        do {
                desc_info = &q->info[q->tail_idx];
 +              desc_info->bytes = 0;
                index = q->tail_idx;
                q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
                ionic_tx_clean(q, desc_info, cq_info, desc_info->cb_arg);
 +              if (desc_info->cb_arg) {
 +                      pkts++;
 +                      bytes += desc_info->bytes;
 +              }
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        } while (index != le16_to_cpu(comp->comp_index));
  
 +      if (pkts && bytes)
 +              netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
 +
        return true;
  }
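
The completion path above batches byte-queue-limit accounting: each cleaned skb's length is stashed in desc_info->bytes and a single netdev_tx_completed_queue() call reports the whole batch, pairing with the netdev_tx_sent_queue() done at post time. A toy model of keeping those sent/completed counters balanced (plain integers in place of the real BQL state):

#include <stdio.h>

/* Toy BQL state: what was queued to hardware vs. what completed. */
struct bql_sketch {
	unsigned long sent_bytes;
	unsigned long completed_bytes;
};

static void tx_sent(struct bql_sketch *b, unsigned int bytes)
{
	b->sent_bytes += bytes;		/* analogous to netdev_tx_sent_queue() */
}

static void tx_completed(struct bql_sketch *b, unsigned int pkts, unsigned int bytes)
{
	(void)pkts;
	b->completed_bytes += bytes;	/* analogous to netdev_tx_completed_queue() */
}

int main(void)
{
	struct bql_sketch b = { 0, 0 };
	unsigned int lens[] = { 1514, 60, 9000 };
	unsigned int pkts = 0, bytes = 0;

	for (unsigned int i = 0; i < 3; i++)
		tx_sent(&b, lens[i]);

	/* completion handler: accumulate per-skb bytes, report once per batch */
	for (unsigned int i = 0; i < 3; i++) {
		pkts++;
		bytes += lens[i];
	}
	if (pkts && bytes)
		tx_completed(&b, pkts, bytes);

	printf("in flight: %lu bytes\n", b.sent_bytes - b.completed_bytes);
	return 0;
}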
  
@@@ -736,25 -677,15 +736,25 @@@ void ionic_tx_flush(struct ionic_cq *cq
  void ionic_tx_empty(struct ionic_queue *q)
  {
        struct ionic_desc_info *desc_info;
 +      int bytes = 0;
 +      int pkts = 0;
  
        /* walk the not completed tx entries, if any */
        while (q->head_idx != q->tail_idx) {
                desc_info = &q->info[q->tail_idx];
 +              desc_info->bytes = 0;
                q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1);
                ionic_tx_clean(q, desc_info, NULL, desc_info->cb_arg);
 +              if (desc_info->cb_arg) {
 +                      pkts++;
 +                      bytes += desc_info->bytes;
 +              }
                desc_info->cb = NULL;
                desc_info->cb_arg = NULL;
        }
 +
 +      if (pkts && bytes)
 +              netdev_tx_completed_queue(q_to_ndq(q), pkts, bytes);
  }
  
  static int ionic_tx_tcp_inner_pseudo_csum(struct sk_buff *skb)
@@@ -825,33 -756,50 +825,33 @@@ static void ionic_tx_tso_post(struct io
        desc->hdr_len = cpu_to_le16(hdrlen);
        desc->mss = cpu_to_le16(mss);
  
 -      if (done) {
 +      if (start) {
                skb_tx_timestamp(skb);
                netdev_tx_sent_queue(q_to_ndq(q), skb->len);
 -              ionic_txq_post(q, !netdev_xmit_more(), ionic_tx_clean, skb);
 +              ionic_txq_post(q, false, ionic_tx_clean, skb);
        } else {
 -              ionic_txq_post(q, false, ionic_tx_clean, NULL);
 +              ionic_txq_post(q, done, NULL, NULL);
        }
  }
  
 -static struct ionic_txq_desc *ionic_tx_tso_next(struct ionic_queue *q,
 -                                              struct ionic_txq_sg_elem **elem)
 -{
 -      struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 -
 -      *elem = sg_desc->elems;
 -      return desc;
 -}
 -
  static int ionic_tx_tso(struct ionic_queue *q, struct sk_buff *skb)
  {
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct ionic_desc_info *rewind_desc_info;
 -      struct device *dev = q->lif->ionic->dev;
 +      struct ionic_desc_info *desc_info;
 +      struct ionic_buf_info *buf_info;
        struct ionic_txq_sg_elem *elem;
        struct ionic_txq_desc *desc;
 -      unsigned int frag_left = 0;
 -      unsigned int offset = 0;
 -      u16 abort = q->head_idx;
 -      unsigned int len_left;
 +      unsigned int chunk_len;
 +      unsigned int frag_rem;
 +      unsigned int tso_rem;
 +      unsigned int seg_rem;
        dma_addr_t desc_addr;
 +      dma_addr_t frag_addr;
        unsigned int hdrlen;
 -      unsigned int nfrags;
 -      unsigned int seglen;
 -      u64 total_bytes = 0;
 -      u64 total_pkts = 0;
 -      u16 rewind = abort;
 -      unsigned int left;
        unsigned int len;
        unsigned int mss;
 -      skb_frag_t *frag;
        bool start, done;
        bool outer_csum;
 -      dma_addr_t addr;
        bool has_vlan;
        u16 desc_len;
        u8 desc_nsge;
        bool encap;
        int err;
  
 +      desc_info = &q->info[q->head_idx];
 +      buf_info = desc_info->bufs;
 +
 +      if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
 +              return -EIO;
 +
 +      len = skb->len;
        mss = skb_shinfo(skb)->gso_size;
 -      nfrags = skb_shinfo(skb)->nr_frags;
 -      len_left = skb->len - skb_headlen(skb);
        outer_csum = (skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM) ||
                     (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
        has_vlan = !!skb_vlan_tag_present(skb);
        else
                hdrlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
  
 -      seglen = hdrlen + mss;
 -      left = skb_headlen(skb);
 +      tso_rem = len;
 +      seg_rem = min(tso_rem, hdrlen + mss);
  
 -      desc = ionic_tx_tso_next(q, &elem);
 -      start = true;
 +      frag_addr = 0;
 +      frag_rem = 0;
  
 -      /* Chop skb->data up into desc segments */
 +      start = true;
  
 -      while (left > 0) {
 -              len = min(seglen, left);
 -              frag_left = seglen - len;
 -              desc_addr = ionic_tx_map_single(q, skb->data + offset, len);
 -              if (dma_mapping_error(dev, desc_addr))
 -                      goto err_out_abort;
 -              desc_len = len;
 +      while (tso_rem > 0) {
 +              desc = NULL;
 +              elem = NULL;
 +              desc_addr = 0;
 +              desc_len = 0;
                desc_nsge = 0;
 -              left -= len;
 -              offset += len;
 -              if (nfrags > 0 && frag_left > 0)
 -                      continue;
 -              done = (nfrags == 0 && left == 0);
 -              ionic_tx_tso_post(q, desc, skb,
 -                                desc_addr, desc_nsge, desc_len,
 -                                hdrlen, mss,
 -                                outer_csum,
 -                                vlan_tci, has_vlan,
 -                                start, done);
 -              total_pkts++;
 -              total_bytes += start ? len : len + hdrlen;
 -              desc = ionic_tx_tso_next(q, &elem);
 -              start = false;
 -              seglen = mss;
 -      }
 -
 -      /* Chop skb frags into desc segments */
 -
 -      for (frag = skb_shinfo(skb)->frags; len_left; frag++) {
 -              offset = 0;
 -              left = skb_frag_size(frag);
 -              len_left -= left;
 -              nfrags--;
 -              stats->frags++;
 -
 -              while (left > 0) {
 -                      if (frag_left > 0) {
 -                              len = min(frag_left, left);
 -                              frag_left -= len;
 -                              addr = ionic_tx_map_frag(q, frag, offset, len);
 -                              if (dma_mapping_error(dev, addr))
 -                                      goto err_out_abort;
 -                              elem->addr = cpu_to_le64(addr);
 -                              elem->len = cpu_to_le16(len);
 +              /* use fragments until we have enough to post a single descriptor */
 +              while (seg_rem > 0) {
 +                      /* if the fragment is exhausted then move to the next one */
 +                      if (frag_rem == 0) {
 +                              /* grab the next fragment */
 +                              frag_addr = buf_info->dma_addr;
 +                              frag_rem = buf_info->len;
 +                              buf_info++;
 +                      }
 +                      chunk_len = min(frag_rem, seg_rem);
 +                      if (!desc) {
 +                              /* fill main descriptor */
 +                              desc = desc_info->txq_desc;
 +                              elem = desc_info->txq_sg_desc->elems;
 +                              desc_addr = frag_addr;
 +                              desc_len = chunk_len;
 +                      } else {
 +                              /* fill sg descriptor */
 +                              elem->addr = cpu_to_le64(frag_addr);
 +                              elem->len = cpu_to_le16(chunk_len);
                                elem++;
                                desc_nsge++;
 -                              left -= len;
 -                              offset += len;
 -                              if (nfrags > 0 && frag_left > 0)
 -                                      continue;
 -                              done = (nfrags == 0 && left == 0);
 -                              ionic_tx_tso_post(q, desc, skb, desc_addr,
 -                                                desc_nsge, desc_len,
 -                                                hdrlen, mss, outer_csum,
 -                                                vlan_tci, has_vlan,
 -                                                start, done);
 -                              total_pkts++;
 -                              total_bytes += start ? len : len + hdrlen;
 -                              desc = ionic_tx_tso_next(q, &elem);
 -                              start = false;
 -                      } else {
 -                              len = min(mss, left);
 -                              frag_left = mss - len;
 -                              desc_addr = ionic_tx_map_frag(q, frag,
 -                                                            offset, len);
 -                              if (dma_mapping_error(dev, desc_addr))
 -                                      goto err_out_abort;
 -                              desc_len = len;
 -                              desc_nsge = 0;
 -                              left -= len;
 -                              offset += len;
 -                              if (nfrags > 0 && frag_left > 0)
 -                                      continue;
 -                              done = (nfrags == 0 && left == 0);
 -                              ionic_tx_tso_post(q, desc, skb, desc_addr,
 -                                                desc_nsge, desc_len,
 -                                                hdrlen, mss, outer_csum,
 -                                                vlan_tci, has_vlan,
 -                                                start, done);
 -                              total_pkts++;
 -                              total_bytes += start ? len : len + hdrlen;
 -                              desc = ionic_tx_tso_next(q, &elem);
 -                              start = false;
                        }
 +                      frag_addr += chunk_len;
 +                      frag_rem -= chunk_len;
 +                      tso_rem -= chunk_len;
 +                      seg_rem -= chunk_len;
                }
 +              seg_rem = min(tso_rem, mss);
 +              done = (tso_rem == 0);
 +              /* post descriptor */
 +              ionic_tx_tso_post(q, desc, skb,
 +                                desc_addr, desc_nsge, desc_len,
 +                                hdrlen, mss, outer_csum, vlan_tci, has_vlan,
 +                                start, done);
 +              start = false;
 +              /* Buffer information is stored with the first tso descriptor */
 +              desc_info = &q->info[q->head_idx];
 +              desc_info->nbufs = 0;
        }
  
 -      stats->pkts += total_pkts;
 -      stats->bytes += total_bytes;
 +      stats->pkts += DIV_ROUND_UP(len - hdrlen, mss);
 +      stats->bytes += len;
        stats->tso++;
 -      stats->tso_bytes += total_bytes;
 +      stats->tso_bytes = len;
  
        return 0;
 -
 -err_out_abort:
 -      while (rewind != q->head_idx) {
 -              rewind_desc_info = &q->info[rewind];
 -              ionic_tx_clean(q, rewind_desc_info, NULL, NULL);
 -              rewind = (rewind + 1) & (q->num_descs - 1);
 -      }
 -      q->head_idx = abort;
 -
 -      return -ENOMEM;
  }
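/*
 * A minimal sketch of the segment/fragment remainder arithmetic used by the
 * rewritten TSO loop above, with plain integers instead of mapped buffers.
 * Assumes the hypothetical frag_len[] entries are non-zero and sum to at
 * least total.
 */
static unsigned int example_count_tso_descs(const unsigned int *frag_len,
					    unsigned int total,
					    unsigned int hdrlen,
					    unsigned int mss)
{
	unsigned int seg_rem = min(total, hdrlen + mss); /* first seg carries the headers */
	unsigned int tso_rem = total;
	unsigned int frag_rem = 0;
	unsigned int descs = 0;
	unsigned int chunk;
	int i = 0;

	while (tso_rem > 0) {
		while (seg_rem > 0) {
			if (frag_rem == 0)		/* fragment exhausted, take the next one */
				frag_rem = frag_len[i++];
			chunk = min(frag_rem, seg_rem);
			frag_rem -= chunk;
			seg_rem -= chunk;
			tso_rem -= chunk;
		}
		seg_rem = min(tso_rem, mss);	/* later segments are at most mss long */
		descs++;			/* one descriptor posted per segment */
	}

	return descs;
}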
  
 -static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_calc_csum(struct ionic_queue *q, struct sk_buff *skb,
 +                            struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 +      struct ionic_txq_desc *desc = desc_info->txq_desc;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
        bool has_vlan;
        u8 flags = 0;
        bool encap;
        has_vlan = !!skb_vlan_tag_present(skb);
        encap = skb->encapsulation;
  
 -      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 -      if (dma_mapping_error(dev, dma_addr))
 -              return -ENOMEM;
 -
        flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
        flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
  
        cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_PARTIAL,
 -                                flags, skb_shinfo(skb)->nr_frags, dma_addr);
 +                                flags, skb_shinfo(skb)->nr_frags,
 +                                buf_info->dma_addr);
        desc->cmd = cpu_to_le64(cmd);
 -      desc->len = cpu_to_le16(skb_headlen(skb));
 -      desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
 -      desc->csum_offset = cpu_to_le16(skb->csum_offset);
 +      desc->len = cpu_to_le16(buf_info->len);
        if (has_vlan) {
                desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
                stats->vlan_inserted++;
 +      } else {
 +              desc->vlan_tci = 0;
        }
 +      desc->csum_start = cpu_to_le16(skb_checksum_start_offset(skb));
 +      desc->csum_offset = cpu_to_le16(skb->csum_offset);
  
        if (skb_csum_is_sctp(skb))
                stats->crc32_csum++;
        return 0;
  }
  
 -static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_calc_no_csum(struct ionic_queue *q, struct sk_buff *skb,
 +                               struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_desc *desc = q->info[q->head_idx].txq_desc;
 +      struct ionic_txq_desc *desc = desc_info->txq_desc;
 +      struct ionic_buf_info *buf_info = desc_info->bufs;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
        bool has_vlan;
        u8 flags = 0;
        bool encap;
        has_vlan = !!skb_vlan_tag_present(skb);
        encap = skb->encapsulation;
  
 -      dma_addr = ionic_tx_map_single(q, skb->data, skb_headlen(skb));
 -      if (dma_mapping_error(dev, dma_addr))
 -              return -ENOMEM;
 -
        flags |= has_vlan ? IONIC_TXQ_DESC_FLAG_VLAN : 0;
        flags |= encap ? IONIC_TXQ_DESC_FLAG_ENCAP : 0;
  
        cmd = encode_txq_desc_cmd(IONIC_TXQ_DESC_OPCODE_CSUM_NONE,
 -                                flags, skb_shinfo(skb)->nr_frags, dma_addr);
 +                                flags, skb_shinfo(skb)->nr_frags,
 +                                buf_info->dma_addr);
        desc->cmd = cpu_to_le64(cmd);
 -      desc->len = cpu_to_le16(skb_headlen(skb));
 +      desc->len = cpu_to_le16(buf_info->len);
        if (has_vlan) {
                desc->vlan_tci = cpu_to_le16(skb_vlan_tag_get(skb));
                stats->vlan_inserted++;
 +      } else {
 +              desc->vlan_tci = 0;
        }
 +      desc->csum_start = 0;
 +      desc->csum_offset = 0;
  
        stats->csum_none++;
  
        return 0;
  }
  
 -static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb)
 +static int ionic_tx_skb_frags(struct ionic_queue *q, struct sk_buff *skb,
 +                            struct ionic_desc_info *desc_info)
  {
 -      struct ionic_txq_sg_desc *sg_desc = q->info[q->head_idx].txq_sg_desc;
 -      unsigned int len_left = skb->len - skb_headlen(skb);
 +      struct ionic_txq_sg_desc *sg_desc = desc_info->txq_sg_desc;
 +      struct ionic_buf_info *buf_info = &desc_info->bufs[1];
        struct ionic_txq_sg_elem *elem = sg_desc->elems;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
 -      struct device *dev = q->lif->ionic->dev;
 -      dma_addr_t dma_addr;
 -      skb_frag_t *frag;
 -      u16 len;
 +      unsigned int i;
  
 -      for (frag = skb_shinfo(skb)->frags; len_left; frag++, elem++) {
 -              len = skb_frag_size(frag);
 -              elem->len = cpu_to_le16(len);
 -              dma_addr = ionic_tx_map_frag(q, frag, 0, len);
 -              if (dma_mapping_error(dev, dma_addr))
 -                      return -ENOMEM;
 -              elem->addr = cpu_to_le64(dma_addr);
 -              len_left -= len;
 -              stats->frags++;
 +      for (i = 0; i < skb_shinfo(skb)->nr_frags; i++, buf_info++, elem++) {
 +              elem->addr = cpu_to_le64(buf_info->dma_addr);
 +              elem->len = cpu_to_le16(buf_info->len);
        }
  
 +      stats->frags += skb_shinfo(skb)->nr_frags;
 +
        return 0;
  }
  
  static int ionic_tx(struct ionic_queue *q, struct sk_buff *skb)
  {
 +      struct ionic_desc_info *desc_info = &q->info[q->head_idx];
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
        int err;
  
 +      if (unlikely(ionic_tx_map_skb(q, skb, desc_info)))
 +              return -EIO;
 +
        /* set up the initial descriptor */
        if (skb->ip_summed == CHECKSUM_PARTIAL)
 -              err = ionic_tx_calc_csum(q, skb);
 +              err = ionic_tx_calc_csum(q, skb, desc_info);
        else
 -              err = ionic_tx_calc_no_csum(q, skb);
 +              err = ionic_tx_calc_no_csum(q, skb, desc_info);
        if (err)
                return err;
  
        /* add frags */
 -      err = ionic_tx_skb_frags(q, skb);
 +      err = ionic_tx_skb_frags(q, skb, desc_info);
        if (err)
                return err;
  
  
  static int ionic_tx_descs_needed(struct ionic_queue *q, struct sk_buff *skb)
  {
 -      int sg_elems = q->lif->qtype_info[IONIC_QTYPE_TXQ].max_sg_elems;
        struct ionic_tx_stats *stats = q_to_tx_stats(q);
+       int ndescs;
        int err;
  
-       /* If TSO, need roundup(skb->len/mss) descs */
+       /* Each desc is mss long max, so a descriptor for each gso_seg */
        if (skb_is_gso(skb))
-               return (skb->len / skb_shinfo(skb)->gso_size) + 1;
+               ndescs = skb_shinfo(skb)->gso_segs;
+       else
+               ndescs = 1;
  
 -      if (skb_shinfo(skb)->nr_frags <= sg_elems)
 +      /* If non-TSO, just need 1 desc and nr_frags sg elems */
 +      if (skb_shinfo(skb)->nr_frags <= q->max_sg_elems)
-               return 1;
+               return ndescs;
  
        /* Too many frags, so linearize */
        err = skb_linearize(skb);
  
        stats->linearize++;
  
-       /* Need 1 desc and zero sg elems */
-       return 1;
+       return ndescs;
  }
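/*
 * Illustrative arithmetic only: gso_segs is precomputed by the stack as
 * roughly DIV_ROUND_UP(payload, mss), so using it directly replaces the
 * "skb->len / mss + 1" estimate removed above.
 */
static unsigned int example_tso_desc_count(unsigned int payload, unsigned int mss)
{
	return DIV_ROUND_UP(payload, mss);	/* what gso_segs reports */
}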
  
  static int ionic_maybe_stop_tx(struct ionic_queue *q, int ndescs)
@@@ -1586,10 -1586,12 +1586,10 @@@ DECLARE_RTL_COND(rtl_counters_cond
  
  static void rtl8169_do_counters(struct rtl8169_private *tp, u32 counter_cmd)
  {
 -      dma_addr_t paddr = tp->counters_phys_addr;
 -      u32 cmd;
 +      u32 cmd = lower_32_bits(tp->counters_phys_addr);
  
 -      RTL_W32(tp, CounterAddrHigh, (u64)paddr >> 32);
 +      RTL_W32(tp, CounterAddrHigh, upper_32_bits(tp->counters_phys_addr));
        rtl_pci_commit(tp);
 -      cmd = (u64)paddr & DMA_BIT_MASK(32);
        RTL_W32(tp, CounterAddrLow, cmd);
        RTL_W32(tp, CounterAddrLow, cmd | counter_cmd);
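/*
 * A minimal sketch (hypothetical helper): upper_32_bits()/lower_32_bits()
 * replace the open-coded ">> 32" shift and DMA_BIT_MASK(32) masking removed
 * in the hunk above.
 */
static void example_split_dma_addr(dma_addr_t paddr, u32 *hi, u32 *lo)
{
	*hi = upper_32_bits(paddr);	/* bits 63:32, zero for 32-bit addresses */
	*lo = lower_32_bits(paddr);	/* bits 31:0 */
}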
  
@@@ -1901,15 -1903,6 +1901,15 @@@ static int rtl8169_set_eee(struct net_d
        return ret;
  }
  
 +static void rtl8169_get_ringparam(struct net_device *dev,
 +                                struct ethtool_ringparam *data)
 +{
 +      data->rx_max_pending = NUM_RX_DESC;
 +      data->rx_pending = NUM_RX_DESC;
 +      data->tx_max_pending = NUM_TX_DESC;
 +      data->tx_pending = NUM_TX_DESC;
 +}
 +
  static const struct ethtool_ops rtl8169_ethtool_ops = {
        .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
                                     ETHTOOL_COALESCE_MAX_FRAMES,
        .set_eee                = rtl8169_set_eee,
        .get_link_ksettings     = phy_ethtool_get_link_ksettings,
        .set_link_ksettings     = phy_ethtool_set_link_ksettings,
 +      .get_ringparam          = rtl8169_get_ringparam,
  };
  
  static void rtl_enable_eee(struct rtl8169_private *tp)
@@@ -4654,6 -4646,9 +4654,9 @@@ static void rtl8169_down(struct rtl8169
  
        rtl8169_update_counters(tp);
  
+       pci_clear_master(tp->pci_dev);
+       rtl_pci_commit(tp);
        rtl8169_cleanup(tp, true);
  
        rtl_prepare_power_down(tp);
  
  static void rtl8169_up(struct rtl8169_private *tp)
  {
+       pci_set_master(tp->pci_dev);
        phy_resume(tp->phydev);
        rtl8169_init_phy(tp);
        napi_enable(&tp->napi);
@@@ -5315,8 -5311,6 +5319,6 @@@ static int rtl_init_one(struct pci_dev 
  
        rtl_hw_reset(tp);
  
-       pci_set_master(pdev);
        rc = rtl_alloc_irq(tp);
        if (rc < 0) {
                dev_err(&pdev->dev, "Can't allocate interrupt\n");
@@@ -71,12 -71,13 +71,12 @@@ struct ipa_cmd_hw_hdr_init_local 
  
  /* IPA_CMD_REGISTER_WRITE */
  
 -/* For IPA v4.0+, this opcode gets modified with pipeline clear options */
 -
 +/* For IPA v4.0+, the pipeline clear options are encoded in the opcode */
  #define REGISTER_WRITE_OPCODE_SKIP_CLEAR_FMASK                GENMASK(8, 8)
  #define REGISTER_WRITE_OPCODE_CLEAR_OPTION_FMASK      GENMASK(10, 9)
  
  struct ipa_cmd_register_write {
 -      __le16 flags;           /* Unused/reserved for IPA v3.5.1 */
 +      __le16 flags;           /* Unused/reserved prior to IPA v4.0 */
        __le16 offset;
        __le32 value;
        __le32 value_mask;
  };
  
  /* Field masks for ipa_cmd_register_write structure fields */
 -/* The next field is present for IPA v4.0 and above */
 +/* The next field is present for IPA v4.0+ */
  #define REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK                GENMASK(14, 11)
 -/* The next field is present for IPA v3.5.1 only */
 +/* The next field is not present for IPA v4.0+ */
  #define REGISTER_WRITE_FLAGS_SKIP_CLEAR_FMASK         GENMASK(15, 15)
  
 -/* The next field and its values are present for IPA v3.5.1 only */
 +/* The next field and its values are not present for IPA v4.0+ */
  #define REGISTER_WRITE_CLEAR_OPTIONS_FMASK            GENMASK(1, 0)
  
  /* IPA_CMD_IP_PACKET_INIT */
@@@ -122,7 -123,7 +122,7 @@@ struct ipa_cmd_hw_dma_mem_mem 
  
  /* Field masks for ipa_cmd_hw_dma_mem_mem structure fields */
  #define DMA_SHARED_MEM_FLAGS_DIRECTION_FMASK          GENMASK(0, 0)
 -/* The next two fields are present for IPA v3.5.1 only. */
 +/* The next two fields are not present for IPA v4.0+ */
  #define DMA_SHARED_MEM_FLAGS_SKIP_CLEAR_FMASK         GENMASK(1, 1)
  #define DMA_SHARED_MEM_FLAGS_CLEAR_OPTIONS_FMASK      GENMASK(3, 2)
  
@@@ -174,21 -175,23 +174,23 @@@ bool ipa_cmd_table_valid(struct ipa *ip
                            : field_max(IP_FLTRT_FLAGS_NHASH_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             ipa->mem_offset, mem->offset, offset_max);
+               dev_err(dev, "IPv%c %s%s table region offset too large\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
                return false;
        }
  
        if (mem->offset > ipa->mem_size ||
            mem->size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "IPv%c %s%s table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipv6 ? '6' : '4', hashed ? "hashed " : "",
-                             route ? "route" : "filter",
-                             mem->offset, mem->size, ipa->mem_size);
+               dev_err(dev, "IPv%c %s%s table region out of range\n",
+                       ipv6 ? '6' : '4', hashed ? "hashed " : "",
+                       route ? "route" : "filter");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, mem->size, ipa->mem_size);
                return false;
        }
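/*
 * A minimal sketch (hypothetical names) of the overflow-safe bounds-check
 * pattern used above: "a > max || b > max - a" cannot wrap around, unlike a
 * naive "a + b > max".
 */
static bool example_range_fits(u32 base, u32 offset, u32 limit)
{
	return base <= limit && offset <= limit - base;
}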
  
@@@ -204,22 -207,36 +206,36 @@@ static bool ipa_cmd_header_valid(struc
        u32 size_max;
        u32 size;
  
+       /* In ipa_cmd_hdr_init_local_add() we record the offset and size
+        * of the header table memory area.  Make sure the offset and size
+        * fit in the fields that need to hold them, and that the entire
+        * range is within the overall IPA memory range.
+        */
        offset_max = field_max(HDR_INIT_LOCAL_FLAGS_HDR_ADDR_FMASK);
        if (mem->offset > offset_max ||
            ipa->mem_offset > offset_max - mem->offset) {
-               dev_err(dev, "header table region offset too large "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             ipa->mem_offset + mem->offset, offset_max);
+               dev_err(dev, "header table region offset too large\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       ipa->mem_offset, mem->offset, offset_max);
                return false;
        }
  
        size_max = field_max(HDR_INIT_LOCAL_FLAGS_TABLE_SIZE_FMASK);
        size = ipa->mem[IPA_MEM_MODEM_HEADER].size;
        size += ipa->mem[IPA_MEM_AP_HEADER].size;
-       if (mem->offset > ipa->mem_size || size > ipa->mem_size - mem->offset) {
-               dev_err(dev, "header table region out of range "
-                             "(0x%04x + 0x%04x > 0x%04x)\n",
-                             mem->offset, size, ipa->mem_size);
+       if (size > size_max) {
+               dev_err(dev, "header table region size too large\n");
+               dev_err(dev, "    (0x%04x > 0x%08x)\n", size, size_max);
+               return false;
+       }
+       if (size > ipa->mem_size || mem->offset > ipa->mem_size - size) {
+               dev_err(dev, "header table region out of range\n");
+               dev_err(dev, "    (0x%04x + 0x%04x > 0x%04x)\n",
+                       mem->offset, size, ipa->mem_size);
                return false;
        }
  
@@@ -236,12 -253,11 +252,12 @@@ static bool ipa_cmd_register_write_offs
        u32 bit_count;
  
        /* The maximum offset in a register_write immediate command depends
 -       * on the version of IPA.  IPA v3.5.1 supports a 16 bit offset, but
 -       * newer versions allow some additional high-order bits.
 +       * on the version of IPA.  A 16 bit offset is always supported,
 +       * but starting with IPA v4.0 some additional high-order bits are
 +       * allowed.
         */
        bit_count = BITS_PER_BYTE * sizeof(payload->offset);
 -      if (ipa->version != IPA_VERSION_3_5_1)
 +      if (ipa->version >= IPA_VERSION_4_0)
                bit_count += hweight32(REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK);
        BUILD_BUG_ON(bit_count > 32);
        offset_max = ~0U >> (32 - bit_count);
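/*
 * Illustrative arithmetic, assuming a 16-bit offset field plus the 4
 * high-order bits of REGISTER_WRITE_FLAGS_OFFSET_HIGH_FMASK: the computation
 * above yields a 16-bit maximum offset before IPA v4.0 and a 20-bit maximum
 * from IPA v4.0 on.
 */
static u32 example_register_write_offset_max(bool v4_or_later)
{
	u32 bit_count = 16;			/* BITS_PER_BYTE * sizeof(__le16) */

	if (v4_or_later)
		bit_count += 4;			/* hweight32(GENMASK(14, 11)) */

	return ~0U >> (32 - bit_count);		/* 0xffff or 0xfffff */
}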
@@@ -440,11 -456,7 +456,11 @@@ void ipa_cmd_register_write_add(struct 
        /* pipeline_clear_src_grp is not used */
        clear_option = clear_full ? pipeline_clear_full : pipeline_clear_hps;
  
 -      if (ipa->version != IPA_VERSION_3_5_1) {
 +      /* IPA v4.0+ represents the pipeline clear options in the opcode.  It
 +       * also supports a larger offset by encoding additional high-order
 +       * bits in the payload flags field.
 +       */
 +      if (ipa->version >= IPA_VERSION_4_0) {
                u16 offset_high;
                u32 val;
  
@@@ -249,6 -249,7 +249,7 @@@ static const struct qmi_msg_handler ipa
                .decoded_size   = IPA_QMI_DRIVER_INIT_COMPLETE_REQ_SZ,
                .fn             = ipa_server_driver_init_complete,
        },
+       { },
  };
  
  /* Handle an INIT_DRIVER response message from the modem. */
@@@ -269,6 -270,7 +270,7 @@@ static const struct qmi_msg_handler ipa
                .decoded_size   = IPA_QMI_INIT_DRIVER_RSP_SZ,
                .fn             = ipa_client_init_driver,
        },
+       { },
  };
  
  /* Return a pointer to an init modem driver request structure, which contains
@@@ -377,8 -379,8 +379,8 @@@ init_modem_driver_req(struct ipa_qmi *i
  
        /* None of the stats fields are valid (IPA v4.0 and above) */
  
 -      if (ipa->version != IPA_VERSION_3_5_1) {
 -              mem = &ipa->mem[IPA_MEM_STATS_QUOTA];
 +      if (ipa->version >= IPA_VERSION_4_0) {
 +              mem = &ipa->mem[IPA_MEM_STATS_QUOTA_MODEM];
                if (mem->size) {
                        req.hw_stats_quota_base_addr_valid = 1;
                        req.hw_stats_quota_base_addr =
@@@ -271,9 -271,8 +271,9 @@@ static int phylink_parse_mode(struct ph
                pl->cfg_link_an_mode = MLO_AN_FIXED;
        fwnode_handle_put(dn);
  
 -      if (fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
 -          strcmp(managed, "in-band-status") == 0) {
 +      if ((fwnode_property_read_string(fwnode, "managed", &managed) == 0 &&
 +           strcmp(managed, "in-band-status") == 0) ||
 +          pl->config->ovr_an_inband) {
                if (pl->cfg_link_an_mode == MLO_AN_FIXED) {
                        phylink_err(pl,
                                    "can't use both fixed-link and in-band-status\n");
@@@ -477,7 -476,7 +477,7 @@@ static void phylink_major_config(struc
                err = pl->mac_ops->mac_finish(pl->config, pl->cur_link_an_mode,
                                              state->interface);
                if (err < 0)
-                       phylink_err(pl, "mac_prepare failed: %pe\n",
+                       phylink_err(pl, "mac_finish failed: %pe\n",
                                    ERR_PTR(err));
        }
  }
diff --combined include/linux/bpf.h
@@@ -21,6 -21,7 +21,7 @@@
  #include <linux/capability.h>
  #include <linux/sched/mm.h>
  #include <linux/slab.h>
+ #include <linux/percpu-refcount.h>
  
  struct bpf_verifier_env;
  struct bpf_verifier_log;
@@@ -39,7 -40,6 +40,7 @@@ struct bpf_local_storage
  struct bpf_local_storage_map;
  struct kobject;
  struct mem_cgroup;
 +struct bpf_func_state;
  
  extern struct idr btf_idr;
  extern spinlock_t btf_idr_lock;
@@@ -118,9 -118,6 +119,9 @@@ struct bpf_map_ops 
                                           void *owner, u32 size);
        struct bpf_local_storage __rcu ** (*map_owner_storage_ptr)(void *owner);
  
 +      /* Misc helpers.*/
 +      int (*map_redirect)(struct bpf_map *map, u32 ifindex, u64 flags);
 +
        /* map_meta_equal must be implemented for maps that can be
         * used as an inner map.  It is a runtime check to ensure
         * an inner map can be inserted to an outer map.
        bool (*map_meta_equal)(const struct bpf_map *meta0,
                               const struct bpf_map *meta1);
  
 +
 +      int (*map_set_for_each_callback_args)(struct bpf_verifier_env *env,
 +                                            struct bpf_func_state *caller,
 +                                            struct bpf_func_state *callee);
 +      int (*map_for_each_callback)(struct bpf_map *map, void *callback_fn,
 +                                   void *callback_ctx, u64 flags);
 +
        /* BTF name and id of struct allocated by map_alloc */
        const char * const map_btf_name;
        int *map_btf_id;
@@@ -306,8 -296,6 +307,8 @@@ enum bpf_arg_type 
        ARG_CONST_ALLOC_SIZE_OR_ZERO,   /* number of allocated bytes requested */
        ARG_PTR_TO_BTF_ID_SOCK_COMMON,  /* pointer to in-kernel sock_common or bpf-mirrored bpf_sock */
        ARG_PTR_TO_PERCPU_BTF_ID,       /* pointer to in-kernel percpu type */
 +      ARG_PTR_TO_FUNC,        /* pointer to a bpf program function */
 +      ARG_PTR_TO_STACK_OR_NULL,       /* pointer to stack or NULL */
        __BPF_ARG_TYPE_MAX,
  };
  
@@@ -424,8 -412,6 +425,8 @@@ enum bpf_reg_type 
        PTR_TO_RDWR_BUF,         /* reg points to a read/write buffer */
        PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */
        PTR_TO_PERCPU_BTF_ID,    /* reg points to a percpu kernel variable */
 +      PTR_TO_FUNC,             /* reg points to a bpf program function */
 +      PTR_TO_MAP_KEY,          /* reg points to a map element key */
  };
  
  /* The information passed from prog-specific *_is_valid_access
@@@ -521,11 -507,6 +522,11 @@@ enum bpf_cgroup_storage_type 
   */
  #define MAX_BPF_FUNC_ARGS 12
  
 +/* The maximum number of arguments passed through registers
 + * a single function may have.
 + */
 +#define MAX_BPF_FUNC_REG_ARGS 5
 +
  struct btf_func_model {
        u8 ret_size;
        u8 nr_args;
@@@ -576,7 -557,8 +577,8 @@@ struct bpf_tramp_progs 
   *      fentry = a set of program to run before calling original function
   *      fexit = a set of program to run after original function
   */
- int arch_prepare_bpf_trampoline(void *image, void *image_end,
+ struct bpf_tramp_image;
+ int arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
                                const struct btf_func_model *m, u32 flags,
                                struct bpf_tramp_progs *tprogs,
                                void *orig_call);
@@@ -585,6 -567,8 +587,8 @@@ u64 notrace __bpf_prog_enter(struct bpf
  void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start);
  u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog);
  void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start);
+ void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr);
+ void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr);
  
  struct bpf_ksym {
        unsigned long            start;
@@@ -603,6 -587,18 +607,18 @@@ enum bpf_tramp_prog_type 
        BPF_TRAMP_REPLACE, /* more than MAX */
  };
  
+ struct bpf_tramp_image {
+       void *image;
+       struct bpf_ksym ksym;
+       struct percpu_ref pcref;
+       void *ip_after_call;
+       void *ip_epilogue;
+       union {
+               struct rcu_head rcu;
+               struct work_struct work;
+       };
+ };
  struct bpf_trampoline {
        /* hlist for trampoline_table */
        struct hlist_node hlist;
        /* Number of attached programs. A counter per kind. */
        int progs_cnt[BPF_TRAMP_MAX];
        /* Executable image of trampoline */
-       void *image;
+       struct bpf_tramp_image *cur_image;
        u64 selector;
-       struct bpf_ksym ksym;
  };
  
  struct bpf_attach_target_info {
@@@ -711,6 -706,8 +726,8 @@@ void bpf_image_ksym_add(void *data, str
  void bpf_image_ksym_del(struct bpf_ksym *ksym);
  void bpf_ksym_add(struct bpf_ksym *ksym);
  void bpf_ksym_del(struct bpf_ksym *ksym);
+ int bpf_jit_charge_modmem(u32 pages);
+ void bpf_jit_uncharge_modmem(u32 pages);
  #else
  static inline int bpf_trampoline_link_prog(struct bpf_prog *prog,
                                           struct bpf_trampoline *tr)
@@@ -807,7 -804,6 +824,6 @@@ struct bpf_prog_aux 
        bool func_proto_unreliable;
        bool sleepable;
        bool tail_call_reachable;
-       enum bpf_tramp_prog_type trampoline_prog_type;
        struct hlist_node tramp_hlist;
        /* BTF_KIND_FUNC_PROTO for valid attach_btf_id */
        const struct btf_type *attach_func_proto;
@@@ -1113,7 -1109,7 +1129,7 @@@ int bpf_prog_array_copy(struct bpf_prog
                _ret;                                                   \
         })
  
- #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null)        \
+ #define __BPF_PROG_RUN_ARRAY(array, ctx, func, check_non_null, set_cg_storage)        \
        ({                                              \
                struct bpf_prog_array_item *_item;      \
                struct bpf_prog *_prog;                 \
                        goto _out;                      \
                _item = &_array->items[0];              \
                while ((_prog = READ_ONCE(_item->prog))) {              \
-                       bpf_cgroup_storage_set(_item->cgroup_storage);  \
+                       if (set_cg_storage)             \
+                               bpf_cgroup_storage_set(_item->cgroup_storage);  \
                        _ret &= func(_prog, ctx);       \
                        _item++;                        \
                }                                       \
@@@ -1173,10 -1170,10 +1190,10 @@@ _out:                                                        
        })
  
  #define BPF_PROG_RUN_ARRAY(array, ctx, func)          \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, false)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, false, true)
  
  #define BPF_PROG_RUN_ARRAY_CHECK(array, ctx, func)    \
-       __BPF_PROG_RUN_ARRAY(array, ctx, func, true)
+       __BPF_PROG_RUN_ARRAY(array, ctx, func, true, false)
  
  #ifdef CONFIG_BPF_SYSCALL
  DECLARE_PER_CPU(int, bpf_prog_active);
@@@ -1400,10 -1397,6 +1417,10 @@@ void bpf_iter_map_show_fdinfo(const str
  int bpf_iter_map_fill_link_info(const struct bpf_iter_aux_info *aux,
                                struct bpf_link_info *info);
  
 +int map_set_for_each_callback_args(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee);
 +
  int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
  int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
  int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
@@@ -1453,9 -1446,9 +1470,9 @@@ struct btf *bpf_get_btf_vmlinux(void)
  /* Map specifics */
  struct xdp_buff;
  struct sk_buff;
 +struct bpf_dtab_netdev;
 +struct bpf_cpu_map_entry;
  
 -struct bpf_dtab_netdev *__dev_map_lookup_elem(struct bpf_map *map, u32 key);
 -struct bpf_dtab_netdev *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key);
  void __dev_flush(void);
  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@@ -1465,6 -1458,7 +1482,6 @@@ int dev_map_generic_redirect(struct bpf
                             struct bpf_prog *xdp_prog);
  bool dev_map_can_have_prog(struct bpf_map *map);
  
 -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key);
  void __cpu_map_flush(void);
  int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
                    struct net_device *dev_rx);
@@@ -1493,9 -1487,6 +1510,9 @@@ int bpf_prog_test_run_flow_dissector(st
  int bpf_prog_test_run_raw_tp(struct bpf_prog *prog,
                             const union bpf_attr *kattr,
                             union bpf_attr __user *uattr);
 +int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 +                              const union bpf_attr *kattr,
 +                              union bpf_attr __user *uattr);
  bool btf_ctx_access(int off, int size, enum bpf_access_type type,
                    const struct bpf_prog *prog,
                    struct bpf_insn_access_aux *info);
@@@ -1525,7 -1516,6 +1542,7 @@@ struct bpf_prog *bpf_prog_by_id(u32 id)
  struct bpf_link *bpf_link_by_id(u32 id);
  
  const struct bpf_func_proto *bpf_base_func_proto(enum bpf_func_id func_id);
 +void bpf_task_storage_free(struct task_struct *task);
  #else /* !CONFIG_BPF_SYSCALL */
  static inline struct bpf_prog *bpf_prog_get(u32 ufd)
  {
@@@ -1595,6 -1585,17 +1612,6 @@@ static inline int bpf_obj_get_user(cons
        return -EOPNOTSUPP;
  }
  
 -static inline struct net_device  *__dev_map_lookup_elem(struct bpf_map *map,
 -                                                     u32 key)
 -{
 -      return NULL;
 -}
 -
 -static inline struct net_device  *__dev_map_hash_lookup_elem(struct bpf_map *map,
 -                                                           u32 key)
 -{
 -      return NULL;
 -}
  static inline bool dev_map_can_have_prog(struct bpf_map *map)
  {
        return false;
@@@ -1606,7 -1607,6 +1623,7 @@@ static inline void __dev_flush(void
  
  struct xdp_buff;
  struct bpf_dtab_netdev;
 +struct bpf_cpu_map_entry;
  
  static inline
  int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
@@@ -1631,6 -1631,12 +1648,6 @@@ static inline int dev_map_generic_redir
        return 0;
  }
  
 -static inline
 -struct bpf_cpu_map_entry *__cpu_map_lookup_elem(struct bpf_map *map, u32 key)
 -{
 -      return NULL;
 -}
 -
  static inline void __cpu_map_flush(void)
  {
  }
@@@ -1681,13 -1687,6 +1698,13 @@@ static inline int bpf_prog_test_run_flo
        return -ENOTSUPP;
  }
  
 +static inline int bpf_prog_test_run_sk_lookup(struct bpf_prog *prog,
 +                                            const union bpf_attr *kattr,
 +                                            union bpf_attr __user *uattr)
 +{
 +      return -ENOTSUPP;
 +}
 +
  static inline void bpf_map_put(struct bpf_map *map)
  {
  }
@@@ -1702,10 -1701,6 +1719,10 @@@ bpf_base_func_proto(enum bpf_func_id fu
  {
        return NULL;
  }
 +
 +static inline void bpf_task_storage_free(struct task_struct *task)
 +{
 +}
  #endif /* CONFIG_BPF_SYSCALL */
  
  void __bpf_free_used_btfs(struct bpf_prog_aux *aux,
@@@ -1790,24 -1785,22 +1807,24 @@@ static inline void bpf_map_offload_map_
  }
  #endif /* CONFIG_NET && CONFIG_BPF_SYSCALL */
  
 -#if defined(CONFIG_BPF_STREAM_PARSER)
 -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog,
 -                       struct bpf_prog *old, u32 which);
 +#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
  int sock_map_get_from_fd(const union bpf_attr *attr, struct bpf_prog *prog);
  int sock_map_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype);
  int sock_map_update_elem_sys(struct bpf_map *map, void *key, void *value, u64 flags);
  void sock_map_unhash(struct sock *sk);
  void sock_map_close(struct sock *sk, long timeout);
 +
 +void bpf_sk_reuseport_detach(struct sock *sk);
 +int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
 +                                     void *value);
 +int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
 +                                     void *value, u64 map_flags);
  #else
 -static inline int sock_map_prog_update(struct bpf_map *map,
 -                                     struct bpf_prog *prog,
 -                                     struct bpf_prog *old, u32 which)
 +static inline void bpf_sk_reuseport_detach(struct sock *sk)
  {
 -      return -EOPNOTSUPP;
  }
  
 +#ifdef CONFIG_BPF_SYSCALL
  static inline int sock_map_get_from_fd(const union bpf_attr *attr,
                                       struct bpf_prog *prog)
  {
@@@ -1825,7 -1818,20 +1842,7 @@@ static inline int sock_map_update_elem_
  {
        return -EOPNOTSUPP;
  }
 -#endif /* CONFIG_BPF_STREAM_PARSER */
  
 -#if defined(CONFIG_INET) && defined(CONFIG_BPF_SYSCALL)
 -void bpf_sk_reuseport_detach(struct sock *sk);
 -int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map, void *key,
 -                                     void *value);
 -int bpf_fd_reuseport_array_update_elem(struct bpf_map *map, void *key,
 -                                     void *value, u64 map_flags);
 -#else
 -static inline void bpf_sk_reuseport_detach(struct sock *sk)
 -{
 -}
 -
 -#ifdef CONFIG_BPF_SYSCALL
  static inline int bpf_fd_reuseport_array_lookup_elem(struct bpf_map *map,
                                                     void *key, void *value)
  {
@@@ -1897,9 -1903,6 +1914,9 @@@ extern const struct bpf_func_proto bpf_
  extern const struct bpf_func_proto bpf_ktime_get_coarse_ns_proto;
  extern const struct bpf_func_proto bpf_sock_from_file_proto;
  extern const struct bpf_func_proto bpf_get_socket_ptr_cookie_proto;
 +extern const struct bpf_func_proto bpf_task_storage_get_proto;
 +extern const struct bpf_func_proto bpf_task_storage_delete_proto;
 +extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
  
  const struct bpf_func_proto *bpf_tracing_func_proto(
        enum bpf_func_id func_id, const struct bpf_prog *prog);
@@@ -360,6 -360,7 +360,7 @@@ enum 
        NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
        NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
+       NAPI_STATE_SCHED_THREADED,      /* Napi is currently scheduled in threaded mode */
  };
  
  enum {
        NAPIF_STATE_IN_BUSY_POLL        = BIT(NAPI_STATE_IN_BUSY_POLL),
        NAPIF_STATE_PREFER_BUSY_POLL    = BIT(NAPI_STATE_PREFER_BUSY_POLL),
        NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
+       NAPIF_STATE_SCHED_THREADED      = BIT(NAPI_STATE_SCHED_THREADED),
  };
  
  enum gro_result {
@@@ -754,13 -756,6 +756,13 @@@ struct rx_queue_attribute 
                         const char *buf, size_t len);
  };
  
 +/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
 +enum xps_map_type {
 +      XPS_CPUS = 0,
 +      XPS_RXQS,
 +      XPS_MAPS_MAX,
 +};
 +
  #ifdef CONFIG_XPS
  /*
   * This structure holds an XPS map which can be of variable length.  The
@@@ -778,19 -773,9 +780,19 @@@ struct xps_map 
  
  /*
   * This structure holds all XPS maps for device.  Maps are indexed by CPU.
 + *
 + * We keep track of the number of CPUs/RXQs in use when the struct is
 + * allocated, in nr_ids. This helps avoid out-of-bounds accesses into the maps.
 + *
 + * We also keep track of the number of traffic classes in use when the struct
 + * is allocated, in num_tc. This is used when navigating the maps, to ensure
 + * we never cross their upper bound, since dev->num_tc can be updated in the
 + * meantime.
   */
  struct xps_dev_maps {
        struct rcu_head rcu;
 +      unsigned int nr_ids;
 +      s16 num_tc;
        struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
  };
  
@@@ -848,59 -833,6 +850,59 @@@ typedef u16 (*select_queue_fallback_t)(
                                       struct sk_buff *skb,
                                       struct net_device *sb_dev);
  
 +enum net_device_path_type {
 +      DEV_PATH_ETHERNET = 0,
 +      DEV_PATH_VLAN,
 +      DEV_PATH_BRIDGE,
 +      DEV_PATH_PPPOE,
 +      DEV_PATH_DSA,
 +};
 +
 +struct net_device_path {
 +      enum net_device_path_type       type;
 +      const struct net_device         *dev;
 +      union {
 +              struct {
 +                      u16             id;
 +                      __be16          proto;
 +                      u8              h_dest[ETH_ALEN];
 +              } encap;
 +              struct {
 +                      enum {
 +                              DEV_PATH_BR_VLAN_KEEP,
 +                              DEV_PATH_BR_VLAN_TAG,
 +                              DEV_PATH_BR_VLAN_UNTAG,
 +                              DEV_PATH_BR_VLAN_UNTAG_HW,
 +                      }               vlan_mode;
 +                      u16             vlan_id;
 +                      __be16          vlan_proto;
 +              } bridge;
 +              struct {
 +                      int port;
 +                      u16 proto;
 +              } dsa;
 +      };
 +};
 +
 +#define NET_DEVICE_PATH_STACK_MAX     5
 +#define NET_DEVICE_PATH_VLAN_MAX      2
 +
 +struct net_device_path_stack {
 +      int                     num_paths;
 +      struct net_device_path  path[NET_DEVICE_PATH_STACK_MAX];
 +};
 +
 +struct net_device_path_ctx {
 +      const struct net_device *dev;
 +      const u8                *daddr;
 +
 +      int                     num_vlans;
 +      struct {
 +              u16             id;
 +              __be16          proto;
 +      } vlan[NET_DEVICE_PATH_VLAN_MAX];
 +};
 +
  enum tc_setup_type {
        TC_SETUP_QDISC_MQPRIO,
        TC_SETUP_CLSU32,
@@@ -1335,8 -1267,6 +1337,8 @@@ struct netdev_net_notifier 
   * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
   *    If a device is paired with a peer device, return the peer instance.
   *    The caller must be under RCU read context.
 + * int (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx, struct net_device_path *path);
 + *     Get the forwarding path to reach the real device from the HW destination address
   */
  struct net_device_ops {
        int                     (*ndo_init)(struct net_device *dev);
        int                     (*ndo_tunnel_ctl)(struct net_device *dev,
                                                  struct ip_tunnel_parm *p, int cmd);
        struct net_device *     (*ndo_get_peer_dev)(struct net_device *dev);
 +      int                     (*ndo_fill_forward_path)(struct net_device_path_ctx *ctx,
 +                                                         struct net_device_path *path);
  };
  
  /**
   * @IFF_FAILOVER_SLAVE: device is lower dev of a failover master device
   * @IFF_L3MDEV_RX_HANDLER: only invoke the rx handler of L3 master device
   * @IFF_LIVE_RENAME_OK: rename is allowed while device is up and running
 + * @IFF_TX_SKB_NO_LINEAR: device/driver is capable of xmitting frames with
 + *    skb_headlen(skb) == 0 (data starts from frag0)
   */
  enum netdev_priv_flags {
        IFF_802_1Q_VLAN                 = 1<<0,
        IFF_FAILOVER_SLAVE              = 1<<28,
        IFF_L3MDEV_RX_HANDLER           = 1<<29,
        IFF_LIVE_RENAME_OK              = 1<<30,
 +      IFF_TX_SKB_NO_LINEAR            = 1<<31,
  };
  
  #define IFF_802_1Q_VLAN                       IFF_802_1Q_VLAN
  #define IFF_L3MDEV_SLAVE              IFF_L3MDEV_SLAVE
  #define IFF_TEAM                      IFF_TEAM
  #define IFF_RXFH_CONFIGURED           IFF_RXFH_CONFIGURED
 +#define IFF_PHONY_HEADROOM            IFF_PHONY_HEADROOM
  #define IFF_MACSEC                    IFF_MACSEC
  #define IFF_NO_RX_HANDLER             IFF_NO_RX_HANDLER
  #define IFF_FAILOVER                  IFF_FAILOVER
  #define IFF_FAILOVER_SLAVE            IFF_FAILOVER_SLAVE
  #define IFF_L3MDEV_RX_HANDLER         IFF_L3MDEV_RX_HANDLER
  #define IFF_LIVE_RENAME_OK            IFF_LIVE_RENAME_OK
 +#define IFF_TX_SKB_NO_LINEAR          IFF_TX_SKB_NO_LINEAR
  
  /* Specifies the type of the struct net_device::ml_priv pointer */
  enum netdev_ml_priv_type {
   *    @tx_queue_len:          Max frames per queue allowed
   *    @tx_global_lock:        XXX: need comments on this one
   *    @xdp_bulkq:             XDP device bulk queue
 - *    @xps_cpus_map:          all CPUs map for XPS device
 - *    @xps_rxqs_map:          all RXQs map for XPS device
 + *    @xps_maps:              all CPUs/RXQs maps for XPS device
   *
   *    @xps_maps:      XXX: need comments on this one
   *    @miniq_egress:          clsact qdisc specific data for
   *
   *    @proto_down_reason:     reason a netdev interface is held down
   *    @pcpu_refcnt:           Number of references to this device
 + *    @dev_refcnt:            Number of references to this device
   *    @todo_list:             Delayed register/unregister
   *    @link_watch_list:       XXX: need comments on this one
   *
@@@ -2134,7 -2057,8 +2136,7 @@@ struct net_device 
        struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
  
  #ifdef CONFIG_XPS
 -      struct xps_dev_maps __rcu *xps_cpus_map;
 -      struct xps_dev_maps __rcu *xps_rxqs_map;
 +      struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
  #endif
  #ifdef CONFIG_NET_CLS_ACT
        struct mini_Qdisc __rcu *miniq_egress;
        u32                     proto_down_reason;
  
        struct list_head        todo_list;
 +
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        int __percpu            *pcpu_refcnt;
 +#else
 +      refcount_t              dev_refcnt;
 +#endif
  
        struct list_head        link_watch_list;
  
@@@ -2927,8 -2846,6 +2929,8 @@@ void dev_remove_offload(struct packet_o
  
  int dev_get_iflink(const struct net_device *dev);
  int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb);
 +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 +                        struct net_device_path_stack *stack);
  struct net_device *__dev_get_by_flags(struct net *net, unsigned short flags,
                                      unsigned short mask);
  struct net_device *dev_get_by_name(struct net *net, const char *name);
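/*
 * A minimal sketch of a hypothetical caller: dev_fill_forward_path() fills a
 * net_device_path_stack, which can then be walked from the outermost device
 * down to the lowest (real) transmitting device.  Assumes a zero return
 * means a complete path was resolved.
 */
static const struct net_device *example_resolve_real_dev(const struct net_device *dev,
							 const u8 *daddr)
{
	struct net_device_path_stack stack;
	int i;

	if (dev_fill_forward_path(dev, daddr, &stack))
		return NULL;

	for (i = 0; i < stack.num_paths; i++)
		dev = stack.path[i].dev;	/* last entry is the lowest device */

	return dev;
}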
@@@ -3506,24 -3423,6 +3508,24 @@@ netif_xmit_frozen_or_drv_stopped(const 
        return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN;
  }
  
 +/**
 + *    netdev_queue_set_dql_min_limit - set dql minimum limit
 + *    @dev_queue: pointer to transmit queue
 + *    @min_limit: dql minimum limit
 + *
 + * Forces xmit_more() to return true until the minimum threshold
 + * defined by @min_limit is reached (or until the tx queue is
 + * empty). Warning: to be used with care; misuse will impact
 + * latency.
 + */
 +static inline void netdev_queue_set_dql_min_limit(struct netdev_queue *dev_queue,
 +                                                unsigned int min_limit)
 +{
 +#ifdef CONFIG_BQL
 +      dev_queue->dql.min_limit = min_limit;
 +#endif
 +}
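/*
 * A minimal sketch of a hypothetical driver using the new helper: raising
 * the DQL floor on every TX queue at open time so xmit_more() keeps
 * batching until min_bytes are queued.  The byte threshold is made up.
 */
static void example_enable_tx_batching(struct net_device *dev,
				       unsigned int min_bytes)
{
	unsigned int i;

	for (i = 0; i < dev->real_num_tx_queues; i++)
		netdev_queue_set_dql_min_limit(netdev_get_tx_queue(dev, i),
					       min_bytes);
}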
 +
  /**
   *    netdev_txq_bql_enqueue_prefetchw - prefetch bql data for write
   *    @dev_queue: pointer to transmit queue
@@@ -3789,7 -3688,7 +3791,7 @@@ static inline void netif_wake_subqueue(
  int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
                        u16 index);
  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 -                        u16 index, bool is_rxqs_map);
 +                        u16 index, enum xps_map_type type);
  
  /**
   *    netif_attr_test_mask - Test a CPU or Rx queue set in a mask
@@@ -3884,7 -3783,7 +3886,7 @@@ static inline int netif_set_xps_queue(s
  
  static inline int __netif_set_xps_queue(struct net_device *dev,
                                        const unsigned long *mask,
 -                                      u16 index, bool is_rxqs_map)
 +                                      u16 index, enum xps_map_type type)
  {
        return 0;
  }
@@@ -4127,11 -4026,7 +4129,11 @@@ void netdev_run_todo(void)
   */
  static inline void dev_put(struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_dec(*dev->pcpu_refcnt);
 +#else
 +      refcount_dec(&dev->dev_refcnt);
 +#endif
  }
  
  /**
   */
  static inline void dev_hold(struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        this_cpu_inc(*dev->pcpu_refcnt);
 +#else
 +      refcount_inc(&dev->dev_refcnt);
 +#endif
  }
  
  /* Carrier loss detection, dial on demand. The functions netif_carrier_on
@@@ -4281,7 -4172,7 +4283,7 @@@ static inline bool netif_oper_up(const 
   *
   * Check if device has not been removed from system.
   */
 -static inline bool netif_device_present(struct net_device *dev)
 +static inline bool netif_device_present(const struct net_device *dev)
  {
        return test_bit(__LINK_STATE_PRESENT, &dev->state);
  }
@@@ -4720,7 -4611,6 +4722,7 @@@ void dev_get_tstats64(struct net_devic
  
  extern int            netdev_max_backlog;
  extern int            netdev_tstamp_prequeue;
 +extern int            netdev_unregister_timeout_secs;
  extern int            weight_p;
  extern int            dev_weight_rx_bias;
  extern int            dev_weight_tx_bias;
@@@ -5397,9 -5287,6 +5399,9 @@@ do {                                                            
  #define PTYPE_HASH_SIZE       (16)
  #define PTYPE_HASH_MASK       (PTYPE_HASH_SIZE - 1)
  
 +extern struct list_head ptype_all __read_mostly;
 +extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 +
  extern struct net_device *blackhole_netdev;
  
  #endif        /* _LINUX_NETDEVICE_H */
diff --combined include/linux/skbuff.h
@@@ -285,6 -285,7 +285,7 @@@ struct nf_bridge_info 
  struct tc_skb_ext {
        __u32 chain;
        __u16 mru;
+       bool post_ct;
  };
  #endif
  
@@@ -656,7 -657,6 +657,7 @@@ typedef unsigned char *sk_buff_data_t
   *    @protocol: Packet protocol from driver
   *    @destructor: Destruct function
   *    @tcp_tsorted_anchor: list structure for TCP (tp->tsorted_sent_queue)
 + *    @_sk_redir: socket redirection information for skmsg
   *    @_nfct: Associated connection, if any (with nfctinfo bits)
   *    @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
   *    @skb_iif: ifindex of device we arrived on
@@@ -756,9 -756,6 +757,9 @@@ struct sk_buff 
                        void            (*destructor)(struct sk_buff *skb);
                };
                struct list_head        tcp_tsorted_anchor;
 +#ifdef CONFIG_NET_SOCK_MSG
 +              unsigned long           _sk_redir;
 +#endif
        };
  
  #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
@@@ -1140,7 -1137,7 +1141,7 @@@ static inline bool skb_fclone_busy(cons
  
        return skb->fclone == SKB_FCLONE_ORIG &&
               refcount_read(&fclones->fclone_ref) > 1 &&
 -             fclones->skb2.sk == sk;
 +             READ_ONCE(fclones->skb2.sk) == sk;
  }
  
  /**
@@@ -1292,10 -1289,10 +1293,10 @@@ __skb_set_sw_hash(struct sk_buff *skb, 
  void __skb_get_hash(struct sk_buff *skb);
  u32 __skb_get_hash_symmetric(const struct sk_buff *skb);
  u32 skb_get_poff(const struct sk_buff *skb);
 -u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 +u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen);
  __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 -                          void *data, int hlen_proto);
 +                          const void *data, int hlen_proto);
  
  static inline __be32 skb_flow_get_ports(const struct sk_buff *skb,
                                        int thoff, u8 ip_proto)
@@@ -1314,8 -1311,9 +1315,8 @@@ bool bpf_flow_dissect(struct bpf_prog *
  bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container,
 -                      void *data, __be16 proto, int nhoff, int hlen,
 -                      unsigned int flags);
 +                      void *target_container, const void *data,
 +                      __be16 proto, int nhoff, int hlen, unsigned int flags);
  
  static inline bool skb_flow_dissect(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
@@@ -1337,9 -1335,9 +1338,9 @@@ static inline bool skb_flow_dissect_flo
  static inline bool
  skb_flow_dissect_flow_keys_basic(const struct net *net,
                                 const struct sk_buff *skb,
 -                               struct flow_keys_basic *flow, void *data,
 -                               __be16 proto, int nhoff, int hlen,
 -                               unsigned int flags)
 +                               struct flow_keys_basic *flow,
 +                               const void *data, __be16 proto,
 +                               int nhoff, int hlen, unsigned int flags)
  {
        memset(flow, 0, sizeof(*flow));
        return __skb_flow_dissect(net, skb, &flow_keys_basic_dissector, flow,
@@@ -3677,13 -3675,14 +3678,13 @@@ __wsum skb_checksum(const struct sk_buf
                    __wsum csum);
  
  static inline void * __must_check
 -__skb_header_pointer(const struct sk_buff *skb, int offset,
 -                   int len, void *data, int hlen, void *buffer)
 +__skb_header_pointer(const struct sk_buff *skb, int offset, int len,
 +                   const void *data, int hlen, void *buffer)
  {
 -      if (hlen - offset >= len)
 -              return data + offset;
 +      if (likely(hlen - offset >= len))
 +              return (void *)data + offset;
  
 -      if (!skb ||
 -          skb_copy_bits(skb, offset, buffer, len) < 0)
 +      if (!skb || unlikely(skb_copy_bits(skb, offset, buffer, len) < 0))
                return NULL;
  
        return buffer;
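
The constified __skb_header_pointer() above keeps the usual contract: return a pointer into the linear data when the requested bytes are already there, otherwise copy them into the caller's buffer, or return NULL when the packet is too short. A minimal, hedged sketch of the common calling pattern through the skb_header_pointer() wrapper (the example_ function is invented for illustration):

/* Illustrative only; assumes <linux/skbuff.h> and <linux/tcp.h>. */
static bool example_is_tcp_syn(const struct sk_buff *skb, int thoff)
{
	struct tcphdr _tcph;
	const struct tcphdr *th;

	/* Pointer into skb data if the header is linear, otherwise a copy
	 * placed in _tcph; NULL when the packet is shorter than requested.
	 */
	th = skb_header_pointer(skb, thoff, sizeof(_tcph), &_tcph);
	if (!th)
		return false;

	return th->syn && !th->ack;
}
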
@@@ -1498,16 -1498,13 +1498,16 @@@ struct nft_trans_chain 
  
  struct nft_trans_table {
        bool                            update;
 -      bool                            enable;
 +      u8                              state;
 +      u32                             flags;
  };
  
  #define nft_trans_table_update(trans) \
        (((struct nft_trans_table *)trans->data)->update)
 -#define nft_trans_table_enable(trans) \
 -      (((struct nft_trans_table *)trans->data)->enable)
 +#define nft_trans_table_state(trans)  \
 +      (((struct nft_trans_table *)trans->data)->state)
 +#define nft_trans_table_flags(trans)  \
 +      (((struct nft_trans_table *)trans->data)->flags)
  
  struct nft_trans_elem {
        struct nft_set                  *set;
@@@ -1539,6 -1536,7 +1539,7 @@@ struct nft_trans_flowtable 
        struct nft_flowtable            *flowtable;
        bool                            update;
        struct list_head                hook_list;
+       u32                             flags;
  };
  
  #define nft_trans_flowtable(trans)    \
        (((struct nft_trans_flowtable *)trans->data)->flowtable)
  #define nft_trans_flowtable_hooks(trans)      \
        (((struct nft_trans_flowtable *)trans->data)->hook_list)
+ #define nft_trans_flowtable_flags(trans)      \
+       (((struct nft_trans_flowtable *)trans->data)->flags)
  
  int __init nft_chain_filter_init(void);
  void nft_chain_filter_fini(void);
diff --combined include/net/nexthop.h
@@@ -40,12 -40,6 +40,12 @@@ struct nh_config 
  
        struct nlattr   *nh_grp;
        u16             nh_grp_type;
 +      u16             nh_grp_res_num_buckets;
 +      unsigned long   nh_grp_res_idle_timer;
 +      unsigned long   nh_grp_res_unbalanced_timer;
 +      bool            nh_grp_res_has_num_buckets;
 +      bool            nh_grp_res_has_idle_timer;
 +      bool            nh_grp_res_has_unbalanced_timer;
  
        struct nlattr   *nh_encap;
        u16             nh_encap_type;
@@@ -69,32 -63,6 +69,32 @@@ struct nh_info 
        };
  };
  
 +struct nh_res_bucket {
 +      struct nh_grp_entry __rcu *nh_entry;
 +      atomic_long_t           used_time;
 +      unsigned long           migrated_time;
 +      bool                    occupied;
 +      u8                      nh_flags;
 +};
 +
 +struct nh_res_table {
 +      struct net              *net;
 +      u32                     nhg_id;
 +      struct delayed_work     upkeep_dw;
 +
 +      /* List of NHGEs that have too few buckets ("uw" for underweight).
 +       * Reclaimed buckets will be given to entries in this list.
 +       */
 +      struct list_head        uw_nh_entries;
 +      unsigned long           unbalanced_since;
 +
 +      u32                     idle_timer;
 +      u32                     unbalanced_timer;
 +
 +      u16                     num_nh_buckets;
 +      struct nh_res_bucket    nh_buckets[];
 +};
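
The table above gives each resilient group a fixed set of buckets, so a flow hash keeps landing on the same nexthop even while group members are added or removed. A rough sketch of the data-structure relationship under RCU (the example_ function is invented; the real selection and upkeep logic lives in net/ipv4/nexthop.c and also records bucket activity):

/* Illustrative only: map a flow hash to a nexthop via the bucket table. */
static struct nexthop *example_res_select(struct nh_res_table *res_table,
					  u32 flow_hash)
{
	u16 idx = flow_hash % res_table->num_nh_buckets;
	struct nh_res_bucket *bucket = &res_table->nh_buckets[idx];
	struct nh_grp_entry *nhge;

	/* Caller is assumed to hold the RCU read lock; upkeep_dw migrates
	 * buckets between entries so each entry's share tracks its weight.
	 */
	nhge = rcu_dereference(bucket->nh_entry);
	return nhge ? nhge->nh : NULL;
}
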
 +
  struct nh_grp_entry {
        struct nexthop  *nh;
        u8              weight;
                struct {
                        atomic_t        upper_bound;
                } mpath;
 +              struct {
 +                      /* Member on uw_nh_entries. */
 +                      struct list_head        uw_nh_entry;
 +
 +                      u16                     count_buckets;
 +                      u16                     wants_buckets;
 +              } res;
        };
  
        struct list_head nh_list;
  struct nh_group {
        struct nh_group         *spare; /* spare group for removals */
        u16                     num_nh;
 +      bool                    is_multipath;
        bool                    mpath;
 +      bool                    resilient;
        bool                    fdb_nh;
        bool                    has_v4;
 +
 +      struct nh_res_table __rcu *res_table;
        struct nh_grp_entry     nh_entries[];
  };
  
@@@ -155,15 -112,11 +155,15 @@@ struct nexthop 
  enum nexthop_event_type {
        NEXTHOP_EVENT_DEL,
        NEXTHOP_EVENT_REPLACE,
 +      NEXTHOP_EVENT_RES_TABLE_PRE_REPLACE,
 +      NEXTHOP_EVENT_BUCKET_REPLACE,
  };
  
  enum nh_notifier_info_type {
        NH_NOTIFIER_INFO_TYPE_SINGLE,
        NH_NOTIFIER_INFO_TYPE_GRP,
 +      NH_NOTIFIER_INFO_TYPE_RES_TABLE,
 +      NH_NOTIFIER_INFO_TYPE_RES_BUCKET,
  };
  
  struct nh_notifier_single_info {
@@@ -190,19 -143,6 +190,19 @@@ struct nh_notifier_grp_info 
        struct nh_notifier_grp_entry_info nh_entries[];
  };
  
 +struct nh_notifier_res_bucket_info {
 +      u16 bucket_index;
 +      unsigned int idle_timer_ms;
 +      bool force;
 +      struct nh_notifier_single_info old_nh;
 +      struct nh_notifier_single_info new_nh;
 +};
 +
 +struct nh_notifier_res_table_info {
 +      u16 num_nh_buckets;
 +      struct nh_notifier_single_info nhs[];
 +};
 +
  struct nh_notifier_info {
        struct net *net;
        struct netlink_ext_ack *extack;
        union {
                struct nh_notifier_single_info *nh;
                struct nh_notifier_grp_info *nh_grp;
 +              struct nh_notifier_res_table_info *nh_res_table;
 +              struct nh_notifier_res_bucket_info *nh_res_bucket;
        };
  };
  
@@@ -220,10 -158,6 +220,10 @@@ int register_nexthop_notifier(struct ne
                              struct netlink_ext_ack *extack);
  int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb);
  void nexthop_set_hw_flags(struct net *net, u32 id, bool offload, bool trap);
 +void nexthop_bucket_set_hw_flags(struct net *net, u32 id, u16 bucket_index,
 +                               bool offload, bool trap);
 +void nexthop_res_grp_activity_update(struct net *net, u32 id, u16 num_buckets,
 +                                   unsigned long *activity);
  
  /* caller is holding rcu or rtnl; no reference taken to nexthop */
  struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
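
The new RES_TABLE/RES_BUCKET notifier payloads and nexthop_bucket_set_hw_flags() let a driver mirror individual bucket replacements into hardware. A hedged sketch of a driver-side handler for NEXTHOP_EVENT_BUCKET_REPLACE (example_hw_write_bucket() is invented; info->id is the group id carried in nh_notifier_info, and a real driver may honour or veto the replacement based on hardware activity and the force flag):

static int example_nexthop_event(struct notifier_block *nb,
				 unsigned long event, void *ptr)
{
	struct nh_notifier_info *info = ptr;
	struct nh_notifier_res_bucket_info *binfo;

	if (event != NEXTHOP_EVENT_BUCKET_REPLACE)
		return NOTIFY_DONE;

	binfo = info->nh_res_bucket;

	/* Hypothetical driver hook: program new_nh into the HW bucket.
	 * binfo->force says the core insists even if the bucket is busy.
	 */
	example_hw_write_bucket(binfo->bucket_index, &binfo->new_nh);

	/* Report the bucket as offloaded back to the nexthop core. */
	nexthop_bucket_set_hw_flags(info->net, info->id, binfo->bucket_index,
				    true, false);
	return NOTIFY_OK;
}
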
@@@ -278,7 -212,7 +278,7 @@@ static inline bool nexthop_is_multipath
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              return nh_grp->mpath;
 +              return nh_grp->is_multipath;
        }
        return false;
  }
@@@ -293,7 -227,7 +293,7 @@@ static inline unsigned int nexthop_num_
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              if (nh_grp->mpath)
 +              if (nh_grp->is_multipath)
                        rc = nh_grp->num_nh;
        }
  
@@@ -374,7 -308,7 +374,7 @@@ struct fib_nh_common *nexthop_fib_nhc(s
                struct nh_group *nh_grp;
  
                nh_grp = rcu_dereference_rtnl(nh->nh_grp);
 -              if (nh_grp->mpath) {
 +              if (nh_grp->is_multipath) {
                        nh = nexthop_mpath_select(nh_grp, nhsel);
                        if (!nh)
                                return NULL;
@@@ -476,6 -410,7 +476,7 @@@ static inline struct fib_nh *fib_info_n
  int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
                       struct netlink_ext_ack *extack);
  
+ /* Caller should either hold rcu_read_lock(), or RTNL. */
  static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
  {
        struct nh_info *nhi;
        return NULL;
  }
  
+ /* Variant of nexthop_fib6_nh().
+  * Caller should either hold rcu_read_lock_bh(), or RTNL.
+  */
+ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh)
+ {
+       struct nh_info *nhi;
+
+       if (nh->is_group) {
+               struct nh_group *nh_grp;
+
+               nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp);
+               nh = nexthop_mpath_select(nh_grp, 0);
+               if (!nh)
+                       return NULL;
+       }
+
+       nhi = rcu_dereference_bh_rtnl(nh->nh_info);
+       if (nhi->family == AF_INET6)
+               return &nhi->fib6_nh;
+
+       return NULL;
+ }
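
A small illustrative caller, assuming softirq context where only rcu_read_lock_bh() is held (the example_ function is made up):

static struct net_device *example_nh6_dev_bh(struct nexthop *nh)
{
	struct net_device *dev = NULL;
	struct fib6_nh *fib6_nh;

	rcu_read_lock_bh();
	fib6_nh = nexthop_fib6_nh_bh(nh);
	if (fib6_nh)
		dev = fib6_nh->fib_nh_dev;
	rcu_read_unlock_bh();

	return dev;
}
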
  static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i)
  {
        struct fib6_nh *fib6_nh;
diff --combined include/uapi/linux/bpf.h
@@@ -93,717 -93,7 +93,717 @@@ union bpf_iter_link_info 
        } map;
  };
  
 -/* BPF syscall commands, see bpf(2) man-page for details. */
 +/* BPF syscall commands, see bpf(2) man-page for more details. */
 +/**
 + * DOC: eBPF Syscall Preamble
 + *
 + * The operation to be performed by the **bpf**\ () system call is determined
 + * by the *cmd* argument. Each operation takes an accompanying argument,
 + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see
 + * below). The size argument is the size of the union pointed to by *attr*.
 + */
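
glibc provides no bpf() wrapper, so the cmd/attr/size convention described above is normally driven through syscall(2). A minimal, hedged userspace sketch (the sys_bpf() name is just a local helper, mirroring what tools/lib/bpf does internally):

#include <linux/bpf.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

/* One command, one attr union, and the size of that union. */
static long sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size)
{
	return syscall(__NR_bpf, cmd, attr, size);
}
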
 +/**
 + * DOC: eBPF Syscall Commands
 + *
 + * BPF_MAP_CREATE
 + *    Description
 + *            Create a map and return a file descriptor that refers to the
 + *            map. The close-on-exec file descriptor flag (see **fcntl**\ (2))
 + *            is automatically enabled for the new file descriptor.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_MAP_CREATE** will delete the map (but see NOTES).
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_MAP_LOOKUP_ELEM
 + *    Description
 + *            Look up an element with a given *key* in the map referred to
 + *            by the file descriptor *map_fd*.
 + *
 + *            The *flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_UPDATE_ELEM
 + *    Description
 + *            Create or update an element (key/value pair) in a specified map.
 + *
 + *            The *flags* argument should be specified as one of the
 + *            following:
 + *
 + *            **BPF_ANY**
 + *                    Create a new element or update an existing element.
 + *            **BPF_NOEXIST**
 + *                    Create a new element only if it did not exist.
 + *            **BPF_EXIST**
 + *                    Update an existing element.
 + *            **BPF_F_LOCK**
 + *                    Update a spin_lock-ed map element.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**,
 + *            **E2BIG**, **EEXIST**, or **ENOENT**.
 + *
 + *            **E2BIG**
 + *                    The number of elements in the map reached the
 + *                    *max_entries* limit specified at map creation time.
 + *            **EEXIST**
 + *                    If *flags* specifies **BPF_NOEXIST** and the element
 + *                    with *key* already exists in the map.
 + *            **ENOENT**
 + *                    If *flags* specifies **BPF_EXIST** and the element with
 + *                    *key* does not exist in the map.
 + *
 + * BPF_MAP_DELETE_ELEM
 + *    Description
 + *            Look up and delete an element by key in a specified map.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_GET_NEXT_KEY
 + *    Description
 + *            Look up an element by key in a specified map and return the key
 + *            of the next element. Can be used to iterate over all elements
 + *            in the map.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            The following cases can be used to iterate over all elements of
 + *            the map:
 + *
 + *            * If *key* is not found, the operation returns zero and sets
 + *              the *next_key* pointer to the key of the first element.
 + *            * If *key* is found, the operation returns zero and sets the
 + *              *next_key* pointer to the key of the next element.
 + *            * If *key* is the last element, returns -1 and *errno* is set
 + *              to **ENOENT**.
 + *
 + *            May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or
 + *            **EINVAL** on error.
 + *
 + * BPF_PROG_LOAD
 + *    Description
 + *            Verify and load an eBPF program, returning a new file
 + *            descriptor associated with the program.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES).
 + *
 + *            The close-on-exec file descriptor flag (see **fcntl**\ (2)) is
 + *            automatically enabled for the new file descriptor.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_OBJ_PIN
 + *    Description
 + *            Pin an eBPF program or map referred by the specified *bpf_fd*
 + *            to the provided *pathname* on the filesystem.
 + *
 + *            The *pathname* argument must not contain a dot (".").
 + *
 + *            On success, *pathname* retains a reference to the eBPF object,
 + *            preventing deallocation of the object when the original
 + *            *bpf_fd* is closed. This allows the eBPF object to live beyond
 + *            **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent
 + *            process.
 + *
 + *            Applying **unlink**\ (2) or similar calls to the *pathname*
 + *            unpins the object from the filesystem, removing the reference.
 + *            If no other file descriptors or filesystem nodes refer to the
 + *            same object, it will be deallocated (see NOTES).
 + *
 + *            The filesystem type for the parent directory of *pathname* must
 + *            be **BPF_FS_MAGIC**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_OBJ_GET
 + *    Description
 + *            Open a file descriptor for the eBPF object pinned to the
 + *            specified *pathname*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_PROG_ATTACH
 + *    Description
 + *            Attach an eBPF program to a *target_fd* at the specified
 + *            *attach_type* hook.
 + *
 + *            The *attach_type* specifies the eBPF attachment point to
 + *            attach the program to, and must be one of *bpf_attach_type*
 + *            (see below).
 + *
 + *            The *attach_bpf_fd* must be a valid file descriptor for a
 + *            loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap
 + *            or sock_ops type corresponding to the specified *attach_type*.
 + *
 + *            The *target_fd* must be a valid file descriptor for a kernel
 + *            object which depends on the attach type of *attach_bpf_fd*:
 + *
 + *            **BPF_PROG_TYPE_CGROUP_DEVICE**,
 + *            **BPF_PROG_TYPE_CGROUP_SKB**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
 + *            **BPF_PROG_TYPE_CGROUP_SYSCTL**,
 + *            **BPF_PROG_TYPE_SOCK_OPS**
 + *
 + *                    Control Group v2 hierarchy with the eBPF controller
 + *                    enabled. Requires the kernel to be compiled with
 + *                    **CONFIG_CGROUP_BPF**.
 + *
 + *            **BPF_PROG_TYPE_FLOW_DISSECTOR**
 + *
 + *                    Network namespace (eg /proc/self/ns/net).
 + *
 + *            **BPF_PROG_TYPE_LIRC_MODE2**
 + *
 + *                    LIRC device path (eg /dev/lircN). Requires the kernel
 + *                    to be compiled with **CONFIG_BPF_LIRC_MODE2**.
 + *
 + *            **BPF_PROG_TYPE_SK_SKB**,
 + *            **BPF_PROG_TYPE_SK_MSG**
 + *
 + *                    eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**).
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_DETACH
 + *    Description
 + *            Detach the eBPF program associated with the *target_fd* at the
 + *            hook specified by *attach_type*. The program must have been
 + *            previously attached using **BPF_PROG_ATTACH**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_TEST_RUN
 + *    Description
 + *            Run the eBPF program associated with the *prog_fd* a *repeat*
 + *            number of times against a provided program context *ctx_in* and
 + *            data *data_in*, and return the modified program context
 + *            *ctx_out*, *data_out* (for example, packet data), result of the
 + *            execution *retval*, and *duration* of the test run.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            **ENOSPC**
 + *                    Either *data_size_out* or *ctx_size_out* is too small.
 + *            **ENOTSUPP**
 + *                    This command is not supported by the program type of
 + *                    the program referred to by *prog_fd*.
 + *
 + * BPF_PROG_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF program currently loaded into the kernel.
 + *
 + *            Looks for the eBPF program with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF programs
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_MAP_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF map currently loaded into the kernel.
 + *
 + *            Looks for the eBPF map with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF maps
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_PROG_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF program corresponding to
 + *            *prog_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_MAP_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF map corresponding to
 + *            *map_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_OBJ_GET_INFO_BY_FD
 + *    Description
 + *            Obtain information about the eBPF object corresponding to
 + *            *bpf_fd*.
 + *
 + *            Populates up to *info_len* bytes of *info*, which will be in
 + *            one of the following formats depending on the eBPF object type
 + *            of *bpf_fd*:
 + *
 + *            * **struct bpf_prog_info**
 + *            * **struct bpf_map_info**
 + *            * **struct bpf_btf_info**
 + *            * **struct bpf_link_info**
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_QUERY
 + *    Description
 + *            Obtain information about eBPF programs associated with the
 + *            specified *attach_type* hook.
 + *
 + *            The *target_fd* must be a valid file descriptor for a kernel
 + *            object which depends on the attach type of *attach_bpf_fd*:
 + *
 + *            **BPF_PROG_TYPE_CGROUP_DEVICE**,
 + *            **BPF_PROG_TYPE_CGROUP_SKB**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**,
 + *            **BPF_PROG_TYPE_CGROUP_SOCKOPT**,
 + *            **BPF_PROG_TYPE_CGROUP_SYSCTL**,
 + *            **BPF_PROG_TYPE_SOCK_OPS**
 + *
 + *                    Control Group v2 hierarchy with the eBPF controller
 + *                    enabled. Requires the kernel to be compiled with
 + *                    **CONFIG_CGROUP_BPF**.
 + *
 + *            **BPF_PROG_TYPE_FLOW_DISSECTOR**
 + *
 + *                    Network namespace (eg /proc/self/ns/net).
 + *
 + *            **BPF_PROG_TYPE_LIRC_MODE2**
 + *
 + *                    LIRC device path (eg /dev/lircN). Requires the kernel
 + *                    to be compiled with **CONFIG_BPF_LIRC_MODE2**.
 + *
 + *            **BPF_PROG_QUERY** always fetches the number of programs
 + *            attached and the *attach_flags* which were used to attach those
 + *            programs. Additionally, if *prog_ids* is nonzero and the number
 + *            of attached programs is less than *prog_cnt*, populates
 + *            *prog_ids* with the eBPF program ids of the programs attached
 + *            at *target_fd*.
 + *
 + *            The following flags may alter the result:
 + *
 + *            **BPF_F_QUERY_EFFECTIVE**
 + *                    Only return information regarding programs which are
 + *                    currently effective at the specified *target_fd*.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_RAW_TRACEPOINT_OPEN
 + *    Description
 + *            Attach an eBPF program to a tracepoint *name* to access kernel
 + *            internal arguments of the tracepoint in their raw form.
 + *
 + *            The *prog_fd* must be a valid file descriptor associated with
 + *            a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**.
 + *
 + *            No ABI guarantees are made about the content of tracepoint
 + *            arguments exposed to the corresponding eBPF program.
 + *
 + *            Applying **close**\ (2) to the file descriptor returned by
 + *            **BPF_RAW_TRACEPOINT_OPEN** will detach the eBPF program from the
 + *            tracepoint (but see NOTES).
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_BTF_LOAD
 + *    Description
 + *            Verify and load BPF Type Format (BTF) metadata into the kernel,
 + *            returning a new file descriptor associated with the metadata.
 + *            BTF is described in more detail at
 + *            https://www.kernel.org/doc/html/latest/bpf/btf.html.
 + *
 + *            The *btf* parameter must point to valid memory providing
 + *            *btf_size* bytes of BTF binary metadata.
 + *
 + *            The returned file descriptor can be passed to other **bpf**\ ()
 + *            subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to
 + *            associate the BTF with those objects.
 + *
 + *            Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional
 + *            parameters to specify a *btf_log_buf*, *btf_log_size* and
 + *            *btf_log_level* which allow the kernel to return freeform log
 + *            output regarding the BTF verification process.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_BTF_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the BPF Type Format (BTF)
 + *            corresponding to *btf_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_TASK_FD_QUERY
 + *    Description
 + *            Obtain information about eBPF programs associated with the
 + *            target process identified by *pid* and *fd*.
 + *
 + *            If the *pid* and *fd* are associated with a tracepoint, kprobe
 + *            or uprobe perf event, then the *prog_id* and *fd_type* will
 + *            be populated with the eBPF program id and file descriptor type
 + *            of type **bpf_task_fd_type**. If associated with a kprobe or
 + *            uprobe, the  *probe_offset* and *probe_addr* will also be
 + *            populated. Optionally, if *buf* is provided, then up to
 + *            *buf_len* bytes of *buf* will be populated with the name of
 + *            the tracepoint, kprobe or uprobe.
 + *
 + *            The resulting *prog_id* may be introspected in deeper detail
 + *            using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_LOOKUP_AND_DELETE_ELEM
 + *    Description
 + *            Look up an element with the given *key* in the map referred to
 + *            by the file descriptor *fd*, and if found, delete the element.
 + *
 + *            The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types
 + *            implement this command as a "pop" operation, deleting the top
 + *            element rather than one corresponding to *key*.
 + *            The *key* and *key_len* parameters should be zeroed when
 + *            issuing this operation for these map types.
 + *
 + *            This command is only valid for the following map types:
 + *            * **BPF_MAP_TYPE_QUEUE**
 + *            * **BPF_MAP_TYPE_STACK**
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_FREEZE
 + *    Description
 + *            Freeze the permissions of the specified map.
 + *
 + *            Write permissions may be frozen by passing zero *flags*.
 + *            Upon success, no future syscall invocations may alter the
 + *            map state of *map_fd*. Write operations from eBPF programs
 + *            are still possible for a frozen map.
 + *
 + *            Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_BTF_GET_NEXT_ID
 + *    Description
 + *            Fetch the next BPF Type Format (BTF) object currently loaded
 + *            into the kernel.
 + *
 + *            Looks for the BTF object with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other BTF objects
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_MAP_LOOKUP_BATCH
 + *    Description
 + *            Iterate and fetch multiple elements in a map.
 + *
 + *            Two opaque values are used to manage batch operations,
 + *            *in_batch* and *out_batch*. Initially, *in_batch* must be set
 + *            to NULL to begin the batched operation. After each subsequent
 + *            **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant
 + *            *out_batch* as the *in_batch* for the next operation to
 + *            continue iteration from the current point.
 + *
 + *            The *keys* and *values* are output parameters which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            and value size of the map *map_fd*. The *keys* buffer must be
 + *            of *key_size* * *count*. The *values* buffer must be of
 + *            *value_size* * *count*.
 + *
 + *            The *elem_flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *            On success, *count* elements from the map are copied into the
 + *            user buffer, with the keys copied into *keys* and the values
 + *            copied into the corresponding indices in *values*.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **ENOSPC** to indicate that *keys* or
 + *            *values* is too small to dump an entire bucket during
 + *            iteration of a hash-based map type.
 + *
 + * BPF_MAP_LOOKUP_AND_DELETE_BATCH
 + *    Description
 + *            Iterate and delete all elements in a map.
 + *
 + *            This operation has the same behavior as
 + *            **BPF_MAP_LOOKUP_BATCH** with two exceptions:
 + *
 + *            * Every element that is successfully returned is also deleted
 + *              from the map. This is at least *count* elements. Note that
 + *              *count* is both an input and an output parameter.
 + *            * Upon returning with *errno* set to **EFAULT**, up to
 + *              *count* elements may be deleted without returning the keys
 + *              and values of the deleted elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_MAP_UPDATE_BATCH
 + *    Description
 + *            Update multiple elements in a map by *key*.
 + *
 + *            The *keys* and *values* are input parameters which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            and value size of the map *map_fd*. The *keys* buffer must be
 + *            of *key_size* * *count*. The *values* buffer must be of
 + *            *value_size* * *count*.
 + *
 + *            Each element specified in *keys* is sequentially updated to the
 + *            value in the corresponding index in *values*. The *in_batch*
 + *            and *out_batch* parameters are ignored and should be zeroed.
 + *
 + *            The *elem_flags* argument should be specified as one of the
 + *            following:
 + *
 + *            **BPF_ANY**
 + *                    Create new elements or update existing elements.
 + *            **BPF_NOEXIST**
 + *                    Create new elements only if they do not exist.
 + *            **BPF_EXIST**
 + *                    Update existing elements.
 + *            **BPF_F_LOCK**
 + *                    Update spin_lock-ed map elements. This must be
 + *                    specified if the map value contains a spinlock.
 + *
 + *            On success, *count* elements from the map are updated.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + *            May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or
 + *            **E2BIG**. **E2BIG** indicates that the number of elements in
 + *            the map reached the *max_entries* limit specified at map
 + *            creation time.
 + *
 + *            May set *errno* to one of the following error codes under
 + *            specific circumstances:
 + *
 + *            **EEXIST**
 + *                    If *flags* specifies **BPF_NOEXIST** and the element
 + *                    with *key* already exists in the map.
 + *            **ENOENT**
 + *                    If *flags* specifies **BPF_EXIST** and the element with
 + *                    *key* does not exist in the map.
 + *
 + * BPF_MAP_DELETE_BATCH
 + *    Description
 + *            Delete multiple elements in a map by *key*.
 + *
 + *            The *keys* parameter is an input parameter which must point
 + *            to memory large enough to hold *count* items based on the key
 + *            size of the map *map_fd*, that is, *key_size* * *count*.
 + *
 + *            Each element specified in *keys* is sequentially deleted. The
 + *            *in_batch*, *out_batch*, and *values* parameters are ignored
 + *            and should be zeroed.
 + *
 + *            The *elem_flags* argument may be specified as one of the
 + *            following:
 + *
 + *            **BPF_F_LOCK**
 + *                    Look up the value of a spin-locked map without
 + *                    returning the lock. This must be specified if the
 + *                    elements contain a spinlock.
 + *
 + *            On success, *count* elements from the map are deleted.
 + *
 + *            If an error is returned and *errno* is not **EFAULT**, *count*
 + *            is set to the number of successfully processed elements. If
 + *            *errno* is **EFAULT**, up to *count* elements may have been
 + *            deleted.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_LINK_CREATE
 + *    Description
 + *            Attach an eBPF program to a *target_fd* at the specified
 + *            *attach_type* hook and return a file descriptor handle for
 + *            managing the link.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_UPDATE
 + *    Description
 + *            Update the eBPF program in the specified *link_fd* to
 + *            *new_prog_fd*.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_LINK_GET_FD_BY_ID
 + *    Description
 + *            Open a file descriptor for the eBPF Link corresponding to
 + *            *link_id*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_GET_NEXT_ID
 + *    Description
 + *            Fetch the next eBPF link currently loaded into the kernel.
 + *
 + *            Looks for the eBPF link with an id greater than *start_id*
 + *            and updates *next_id* on success. If no other eBPF links
 + *            remain with ids higher than *start_id*, returns -1 and sets
 + *            *errno* to **ENOENT**.
 + *
 + *    Return
 + *            Returns zero on success. On error, or when no id remains, -1
 + *            is returned and *errno* is set appropriately.
 + *
 + * BPF_ENABLE_STATS
 + *    Description
 + *            Enable eBPF runtime statistics gathering.
 + *
 + *            Runtime statistics gathering for the eBPF runtime is disabled
 + *            by default to minimize the corresponding performance overhead.
 + *            This command enables statistics globally.
 + *
 + *            Multiple programs may independently enable statistics.
 + *            After gathering the desired statistics, eBPF runtime statistics
 + *            may be disabled again by calling **close**\ (2) for the file
 + *            descriptor returned by this function. Statistics will only be
 + *            disabled system-wide when all outstanding file descriptors
 + *            returned by prior calls for this subcommand are closed.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_ITER_CREATE
 + *    Description
 + *            Create an iterator on top of the specified *link_fd* (as
 + *            previously created using **BPF_LINK_CREATE**) and return a
 + *            file descriptor that can be used to trigger the iteration.
 + *
 + *            If the resulting file descriptor is pinned to the filesystem
 + *            using  **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls
 + *            for that path will trigger the iterator to read kernel state
 + *            using the eBPF program attached to *link_fd*.
 + *
 + *    Return
 + *            A new file descriptor (a nonnegative integer), or -1 if an
 + *            error occurred (in which case, *errno* is set appropriately).
 + *
 + * BPF_LINK_DETACH
 + *    Description
 + *            Forcefully detach the specified *link_fd* from its
 + *            corresponding attachment point.
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * BPF_PROG_BIND_MAP
 + *    Description
 + *            Bind a map to the lifetime of an eBPF program.
 + *
 + *            The map identified by *map_fd* is bound to the program
 + *            identified by *prog_fd* and only released when *prog_fd* is
 + *            released. This may be used in cases where metadata should be
 + *            associated with a program which otherwise does not contain any
 + *            references to the map (for example, embedded in the eBPF
 + *            program instructions).
 + *
 + *    Return
 + *            Returns zero on success. On error, -1 is returned and *errno*
 + *            is set appropriately.
 + *
 + * NOTES
 + *    eBPF objects (maps and programs) can be shared between processes.
 + *
 + *    * After **fork**\ (2), the child inherits file descriptors
 + *      referring to the same eBPF objects.
 + *    * File descriptors referring to eBPF objects can be transferred over
 + *      **unix**\ (7) domain sockets.
 + *    * File descriptors referring to eBPF objects can be duplicated in the
 + *      usual way, using **dup**\ (2) and similar calls.
 + *    * File descriptors referring to eBPF objects can be pinned to the
 + *      filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2).
 + *
 + *    An eBPF object is deallocated only after all file descriptors referring
 + *    to the object have been closed and no references remain pinned to the
 + *    filesystem or attached (for example, bound to a program or device).
 + */
  enum bpf_cmd {
        BPF_MAP_CREATE,
        BPF_MAP_LOOKUP_ELEM,
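
Building on the sys_bpf() sketch earlier, a hedged example of the first few commands in action, creating a small hash map and round-tripping one element (error handling trimmed to the minimum; all names are illustrative):

static int example_map_roundtrip(void)
{
	__u32 key = 1, val = 42, out = 0;
	union bpf_attr attr;
	int map_fd;

	memset(&attr, 0, sizeof(attr));
	attr.map_type = BPF_MAP_TYPE_HASH;
	attr.key_size = sizeof(key);
	attr.value_size = sizeof(val);
	attr.max_entries = 16;
	map_fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
	if (map_fd < 0)
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&val;
	attr.flags = BPF_ANY;
	if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)))
		return -1;

	memset(&attr, 0, sizeof(attr));
	attr.map_fd = map_fd;
	attr.key = (__u64)(unsigned long)&key;
	attr.value = (__u64)(unsigned long)&out;
	return sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); /* out == 42 */
}
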
@@@ -1103,15 -393,6 +1103,15 @@@ enum bpf_link_type 
   *                   is struct/union.
   */
  #define BPF_PSEUDO_BTF_ID     3
 +/* insn[0].src_reg:  BPF_PSEUDO_FUNC
 + * insn[0].imm:      insn offset to the func
 + * insn[1].imm:      0
 + * insn[0].off:      0
 + * insn[1].off:      0
 + * ldimm64 rewrite:  address of the function
 + * verifier type:    PTR_TO_FUNC.
 + */
 +#define BPF_PSEUDO_FUNC               4
  
  /* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative
   * offset to another bpf function
@@@ -1439,7 -720,7 +1439,7 @@@ union bpf_attr 
   * parsed and used to produce a manual page. The workflow is the following,
   * and requires the rst2man utility:
   *
 - *     $ ./scripts/bpf_helpers_doc.py \
 + *     $ ./scripts/bpf_doc.py \
   *             --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst
   *     $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7
   *     $ man /tmp/bpf-helpers.7
   *              Use with ENCAP_L3/L4 flags to further specify the tunnel
   *              type; *len* is the length of the inner MAC header.
   *
 + *            * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**:
 + *              Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the
 + *              L2 type as Ethernet.
 + *
   *            A call to this helper is susceptible to change the underlying
   *            packet buffer. Therefore, at load time, all checks on pointers
   *            previously done by the verifier are invalidated and must be
   *
   * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags)
   *    Description
-  *            Check ctx packet size against exceeding MTU of net device (based
+  *            Check packet size against exceeding MTU of net device (based
   *            on *ifindex*).  This helper will likely be used in combination
   *            with helpers that adjust/change the packet size.
   *
   *            against the current net device.  This is practical if this isn't
   *            used prior to redirect.
   *
+  *            On input *mtu_len* must be a valid pointer, else the verifier will
+  *            reject the BPF program.  If the value *mtu_len* is initialized to
+  *            zero then the ctx packet size is used.  When the value *mtu_len* is
+  *            provided as input, it specifies the L3 length that the MTU check
+  *            is done against. Remember XDP and TC length operate at L2, but
+  *            this value is L3, as it correlates to the MTU and IP-header tot_len
+  *            values, which are L3 (similar behavior to bpf_fib_lookup).
+  *
   *            The Linux kernel route table can configure MTUs on a more
   *            specific per route level, which is not provided by this helper.
   *            For route level MTU checks use the **bpf_fib_lookup**\ ()
   *
   *            On return *mtu_len* pointer contains the MTU value of the net
   *            device.  Remember the net device configured MTU is the L3 size,
-  *            which is returned here and XDP and TX length operate at L2.
+  *            which is returned here and XDP and TC length operate at L2.
   *            The helper takes this into account for you, but remember when using
-  *            MTU value in your BPF-code.  On input *mtu_len* must be a valid
-  *            pointer and be initialized (to zero), else verifier will reject
-  *            BPF program.
+  *            the MTU value in your BPF code.
   *
   *    Return
   *            * 0 on success, and populate MTU value in *mtu_len* pointer.
   *            * **BPF_MTU_CHK_RET_FRAG_NEEDED**
   *            * **BPF_MTU_CHK_RET_SEGS_TOOBIG**
   *
 + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags)
 + *    Description
 + *            For each element in **map**, call **callback_fn** function with
 + *            **map**, **callback_ctx** and other map-specific parameters.
 + *            The **callback_fn** should be a static function and
 + *            the **callback_ctx** should be a pointer to the stack.
 + *            The **flags** is used to control certain aspects of the helper.
 + *            Currently, the **flags** must be 0.
 + *
 + *            The following are a list of supported map types and their
 + *            respective expected callback signatures:
 + *
 + *            BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH,
 + *            BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH,
 + *            BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY
 + *
 + *            long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx);
 + *
 + *            For per_cpu maps, the map_value is the value on the cpu where the
 + *            bpf_prog is running.
 + *
 + *            If **callback_fn** returns 0, the helper will continue to the next
 + *            element. If the return value is 1, the helper will skip the rest of
 + *            the elements and return. Other return values are not used now.
 + *
 + *    Return
 + *            The number of traversed map elements for success, **-EINVAL** for
 + *            invalid **flags**.
   */
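
The bpf_for_each_map_elem() description above, together with BPF_PSEUDO_FUNC and the PTR_TO_FUNC/ARG_PTR_TO_FUNC verifier changes elsewhere in this merge, is what allows a BPF program to pass a static callback and a stack context into the kernel. A hedged BPF-side sketch, assuming a libbpf-style build with bpf_helpers.h (map, section, and function names are invented):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u64);
} example_counts SEC(".maps");

struct cb_ctx {
	__u64 total;
};

/* Must be a static function; the verifier matches it against the
 * callback signature documented above (ctx is a pointer to the stack).
 */
static long sum_one(struct bpf_map *map, const void *key, void *value, void *ctx)
{
	struct cb_ctx *c = ctx;

	c->total += *(__u64 *)value;
	return 0;	/* 0 = keep iterating, 1 = stop */
}

SEC("classifier")
int example_sum(struct __sk_buff *skb)
{
	struct cb_ctx c = { .total = 0 };

	bpf_for_each_map_elem(&example_counts, sum_one, &c, 0);
	bpf_printk("total %llu", c.total);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
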
  #define __BPF_FUNC_MAPPER(FN)         \
        FN(unspec),                     \
        FN(ima_inode_hash),             \
        FN(sock_from_file),             \
        FN(check_mtu),                  \
 +      FN(for_each_map_elem),          \
        /* */
  
  /* integer value in 'imm' field of BPF_CALL instruction selects which helper
@@@ -4920,7 -4174,6 +4926,7 @@@ enum 
        BPF_F_ADJ_ROOM_ENCAP_L4_GRE     = (1ULL << 3),
        BPF_F_ADJ_ROOM_ENCAP_L4_UDP     = (1ULL << 4),
        BPF_F_ADJ_ROOM_NO_CSUM_RESET    = (1ULL << 5),
 +      BPF_F_ADJ_ROOM_ENCAP_L2_ETH     = (1ULL << 6),
  };
  
  enum {
@@@ -5958,10 -5211,7 +5964,10 @@@ struct bpf_pidns_info 
  
  /* User accessible data for SK_LOOKUP programs. Add new fields at the end. */
  struct bpf_sk_lookup {
 -      __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
 +      union {
 +              __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */
 +              __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */
 +      };
  
        __u32 family;           /* Protocol family (AF_INET, AF_INET6) */
        __u32 protocol;         /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
@@@ -3,7 -3,6 +3,6 @@@
  #define __UAPI_PSAMPLE_H
  
  enum {
-       /* sampled packet metadata */
        PSAMPLE_ATTR_IIFINDEX,
        PSAMPLE_ATTR_OIFINDEX,
        PSAMPLE_ATTR_ORIGSIZE,
        PSAMPLE_ATTR_GROUP_SEQ,
        PSAMPLE_ATTR_SAMPLE_RATE,
        PSAMPLE_ATTR_DATA,
-       PSAMPLE_ATTR_TUNNEL,
-       /* commands attributes */
        PSAMPLE_ATTR_GROUP_REFCOUNT,
+       PSAMPLE_ATTR_TUNNEL,
  
 +      PSAMPLE_ATTR_PAD,
 +      PSAMPLE_ATTR_OUT_TC,            /* u16 */
 +      PSAMPLE_ATTR_OUT_TC_OCC,        /* u64, bytes */
 +      PSAMPLE_ATTR_LATENCY,           /* u64, nanoseconds */
 +      PSAMPLE_ATTR_TIMESTAMP,         /* u64, nanoseconds */
 +      PSAMPLE_ATTR_PROTO,             /* u16 */
 +
        __PSAMPLE_ATTR_MAX
  };
  
diff --combined init/Kconfig
@@@ -20,10 -20,10 +20,10 @@@ config CC_VERSION_TEX
            When the compiler is updated, Kconfig will be invoked.
  
          - Ensure full rebuild when the compiler is updated
-           include/linux/kconfig.h contains this option in the comment line so
-           fixdep adds include/config/cc/version/text.h into the auto-generated
-           dependency. When the compiler is updated, syncconfig will touch it
-           and then every file will be rebuilt.
+           include/linux/compiler-version.h contains this option in the comment
+           line so fixdep adds include/config/cc/version/text.h into the
+           auto-generated dependency. When the compiler is updated, syncconfig
+           will touch it and then every file will be rebuilt.
  
  config CC_IS_GCC
        def_bool $(success,test "$(cc-name)" = GCC)
@@@ -119,8 -119,7 +119,7 @@@ config INIT_ENV_ARG_LIMI
  
  config COMPILE_TEST
        bool "Compile also drivers which will not load"
-       depends on !UML && !S390
-       default n
+       depends on HAS_IOMEM
        help
          Some drivers can be compiled on a different platform than they are
          intended to be run on. Even though they cannot be loaded there (or even
@@@ -1709,7 -1708,6 +1708,7 @@@ config BPF_SYSCAL
        select BPF
        select IRQ_WORK
        select TASKS_TRACE_RCU
 +      select NET_SOCK_MSG if INET
        default n
        help
          Enable the bpf() system call that allows to manipulate eBPF
@@@ -109,7 -109,7 +109,7 @@@ static void *bpf_fd_inode_storage_looku
        fd = *(int *)key;
        f = fget_raw(fd);
        if (!f)
-               return NULL;
+               return ERR_PTR(-EBADF);
  
        sdata = inode_storage_lookup(f->f_inode, map, true);
        fput(f);
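
With this change a stale or closed descriptor used as the key is reported as EBADF instead of looking like a missing element. A hedged userspace sketch using libbpf's bpf_map_lookup_elem() wrapper (map_fd is assumed to refer to a BPF_MAP_TYPE_INODE_STORAGE map; names are illustrative):

#include <errno.h>
#include <bpf/bpf.h>

static int example_inode_storage_lookup(int map_fd, int target_fd, __u64 *value)
{
	if (!bpf_map_lookup_elem(map_fd, &target_fd, value))
		return 0;
	if (errno == EBADF)	/* target_fd is not an open file descriptor */
		return -EBADF;
	if (errno == ENOENT)	/* no storage attached to this inode yet */
		return -ENOENT;
	return -errno;
}
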
@@@ -237,7 -237,7 +237,7 @@@ static void inode_storage_map_free(stru
  
        smap = (struct bpf_local_storage_map *)map;
        bpf_local_storage_cache_idx_free(&inode_cache, smap->cache_idx);
 -      bpf_local_storage_map_free(smap);
 +      bpf_local_storage_map_free(smap, NULL);
  }
  
  static int inode_storage_map_btf_id;
diff --combined kernel/bpf/verifier.c
@@@ -234,12 -234,6 +234,12 @@@ static bool bpf_pseudo_call(const struc
               insn->src_reg == BPF_PSEUDO_CALL;
  }
  
 +static bool bpf_pseudo_func(const struct bpf_insn *insn)
 +{
 +      return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
 +             insn->src_reg == BPF_PSEUDO_FUNC;
 +}
 +
  struct bpf_call_arg_meta {
        struct bpf_map *map_ptr;
        bool raw_mode;
        u32 btf_id;
        struct btf *ret_btf;
        u32 ret_btf_id;
 +      u32 subprogno;
  };
  
  struct btf *btf_vmlinux;
@@@ -397,24 -390,6 +397,24 @@@ __printf(3, 4) static void verbose_linf
        env->prev_linfo = linfo;
  }
  
 +static void verbose_invalid_scalar(struct bpf_verifier_env *env,
 +                                 struct bpf_reg_state *reg,
 +                                 struct tnum *range, const char *ctx,
 +                                 const char *reg_name)
 +{
 +      char tn_buf[48];
 +
 +      verbose(env, "At %s the register %s ", ctx, reg_name);
 +      if (!tnum_is_unknown(reg->var_off)) {
 +              tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 +              verbose(env, "has value %s", tn_buf);
 +      } else {
 +              verbose(env, "has unknown scalar value");
 +      }
 +      tnum_strn(tn_buf, sizeof(tn_buf), *range);
 +      verbose(env, " should have been in %s\n", tn_buf);
 +}
 +
  static bool type_is_pkt_pointer(enum bpf_reg_type type)
  {
        return type == PTR_TO_PACKET ||
@@@ -434,7 -409,6 +434,7 @@@ static bool reg_type_not_null(enum bpf_
        return type == PTR_TO_SOCKET ||
                type == PTR_TO_TCP_SOCK ||
                type == PTR_TO_MAP_VALUE ||
 +              type == PTR_TO_MAP_KEY ||
                type == PTR_TO_SOCK_COMMON;
  }
  
@@@ -477,8 -451,7 +477,8 @@@ static bool arg_type_may_be_null(enum b
               type == ARG_PTR_TO_MEM_OR_NULL ||
               type == ARG_PTR_TO_CTX_OR_NULL ||
               type == ARG_PTR_TO_SOCKET_OR_NULL ||
 -             type == ARG_PTR_TO_ALLOC_MEM_OR_NULL;
 +             type == ARG_PTR_TO_ALLOC_MEM_OR_NULL ||
 +             type == ARG_PTR_TO_STACK_OR_NULL;
  }
  
  /* Determine whether the function releases some resources allocated by another
@@@ -568,8 -541,6 +568,8 @@@ static const char * const reg_type_str[
        [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null",
        [PTR_TO_RDWR_BUF]       = "rdwr_buf",
        [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null",
 +      [PTR_TO_FUNC]           = "func",
 +      [PTR_TO_MAP_KEY]        = "map_key",
  };
  
  static char slot_type_char[] = {
@@@ -641,7 -612,6 +641,7 @@@ static void print_verifier_state(struc
                        if (type_is_pkt_pointer(t))
                                verbose(env, ",r=%d", reg->range);
                        else if (t == CONST_PTR_TO_MAP ||
 +                               t == PTR_TO_MAP_KEY ||
                                 t == PTR_TO_MAP_VALUE ||
                                 t == PTR_TO_MAP_VALUE_OR_NULL)
                                verbose(env, ",ks=%d,vs=%d",
@@@ -1549,7 -1519,7 +1549,7 @@@ static int add_subprog(struct bpf_verif
        }
        ret = find_subprog(env, off);
        if (ret >= 0)
 -              return 0;
 +              return ret;
        if (env->subprog_cnt >= BPF_MAX_SUBPROGS) {
                verbose(env, "too many subprograms\n");
                return -E2BIG;
        env->subprog_info[env->subprog_cnt++].start = off;
        sort(env->subprog_info, env->subprog_cnt,
             sizeof(env->subprog_info[0]), cmp_subprogs, NULL);
 -      return 0;
 +      return env->subprog_cnt - 1;
  }
  
  static int check_subprogs(struct bpf_verifier_env *env)
  
        /* determine subprog starts. The end is one before the next starts */
        for (i = 0; i < insn_cnt; i++) {
 +              if (bpf_pseudo_func(insn + i)) {
 +                      if (!env->bpf_capable) {
 +                              verbose(env,
 +                                      "function pointers are allowed for CAP_BPF and CAP_SYS_ADMIN\n");
 +                              return -EPERM;
 +                      }
 +                      ret = add_subprog(env, i + insn[i].imm + 1);
 +                      if (ret < 0)
 +                              return ret;
 +                      /* remember subprog */
 +                      insn[i + 1].imm = ret;
 +                      continue;
 +              }
                if (!bpf_pseudo_call(insn + i))
                        continue;
                if (!env->bpf_capable) {
@@@ -2338,8 -2295,6 +2338,8 @@@ static bool is_spillable_regtype(enum b
        case PTR_TO_PERCPU_BTF_ID:
        case PTR_TO_MEM:
        case PTR_TO_MEM_OR_NULL:
 +      case PTR_TO_FUNC:
 +      case PTR_TO_MAP_KEY:
                return true;
        default:
                return false;
@@@ -2944,10 -2899,6 +2944,10 @@@ static int __check_mem_access(struct bp
  
        reg = &cur_regs(env)[regno];
        switch (reg->type) {
 +      case PTR_TO_MAP_KEY:
 +              verbose(env, "invalid access to map key, key_size=%d off=%d size=%d\n",
 +                      mem_size, off, size);
 +              break;
        case PTR_TO_MAP_VALUE:
                verbose(env, "invalid access to map value, value_size=%d off=%d size=%d\n",
                        mem_size, off, size);
@@@ -3353,9 -3304,6 +3353,9 @@@ static int check_ptr_alignment(struct b
        case PTR_TO_FLOW_KEYS:
                pointer_desc = "flow keys ";
                break;
 +      case PTR_TO_MAP_KEY:
 +              pointer_desc = "key ";
 +              break;
        case PTR_TO_MAP_VALUE:
                pointer_desc = "value ";
                break;
@@@ -3457,7 -3405,7 +3457,7 @@@ process_func
  continue_func:
        subprog_end = subprog[idx + 1].start;
        for (; i < subprog_end; i++) {
 -              if (!bpf_pseudo_call(insn + i))
 +              if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i))
                        continue;
                /* remember insn and function to return to */
                ret_insn[frame] = i + 1;
@@@ -3894,19 -3842,7 +3894,19 @@@ static int check_mem_access(struct bpf_
        /* for access checks, reg->off is just part of off */
        off += reg->off;
  
 -      if (reg->type == PTR_TO_MAP_VALUE) {
 +      if (reg->type == PTR_TO_MAP_KEY) {
 +              if (t == BPF_WRITE) {
 +                      verbose(env, "write to change key R%d not allowed\n", regno);
 +                      return -EACCES;
 +              }
 +
 +              err = check_mem_region_access(env, regno, off, size,
 +                                            reg->map_ptr->key_size, false);
 +              if (err)
 +                      return err;
 +              if (value_regno >= 0)
 +                      mark_reg_unknown(env, regs, value_regno);
 +      } else if (reg->type == PTR_TO_MAP_VALUE) {
                if (t == BPF_WRITE && value_regno >= 0 &&
                    is_pointer_value(env, value_regno)) {
                        verbose(env, "R%d leaks addr into map\n", value_regno);
@@@ -4322,9 -4258,6 +4322,9 @@@ static int check_helper_mem_access(stru
        case PTR_TO_PACKET_META:
                return check_packet_access(env, regno, reg->off, access_size,
                                           zero_size_allowed);
 +      case PTR_TO_MAP_KEY:
 +              return check_mem_region_access(env, regno, reg->off, access_size,
 +                                             reg->map_ptr->key_size, false);
        case PTR_TO_MAP_VALUE:
                if (check_map_access_type(env, regno, reg->off, access_size,
                                          meta && meta->raw_mode ? BPF_WRITE :
@@@ -4541,7 -4474,6 +4541,7 @@@ static const struct bpf_reg_types map_k
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
  };
@@@ -4573,7 -4505,6 +4573,7 @@@ static const struct bpf_reg_types mem_t
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
                PTR_TO_MEM,
                PTR_TO_RDONLY_BUF,
@@@ -4586,7 -4517,6 +4586,7 @@@ static const struct bpf_reg_types int_p
                PTR_TO_STACK,
                PTR_TO_PACKET,
                PTR_TO_PACKET_META,
 +              PTR_TO_MAP_KEY,
                PTR_TO_MAP_VALUE,
        },
  };
@@@ -4599,8 -4529,6 +4599,8 @@@ static const struct bpf_reg_types const
  static const struct bpf_reg_types btf_ptr_types = { .types = { PTR_TO_BTF_ID } };
  static const struct bpf_reg_types spin_lock_types = { .types = { PTR_TO_MAP_VALUE } };
  static const struct bpf_reg_types percpu_btf_ptr_types = { .types = { PTR_TO_PERCPU_BTF_ID } };
 +static const struct bpf_reg_types func_ptr_types = { .types = { PTR_TO_FUNC } };
 +static const struct bpf_reg_types stack_ptr_types = { .types = { PTR_TO_STACK } };
  
  static const struct bpf_reg_types *compatible_reg_types[__BPF_ARG_TYPE_MAX] = {
        [ARG_PTR_TO_MAP_KEY]            = &map_key_value_types,
        [ARG_PTR_TO_INT]                = &int_ptr_types,
        [ARG_PTR_TO_LONG]               = &int_ptr_types,
        [ARG_PTR_TO_PERCPU_BTF_ID]      = &percpu_btf_ptr_types,
 +      [ARG_PTR_TO_FUNC]               = &func_ptr_types,
 +      [ARG_PTR_TO_STACK_OR_NULL]      = &stack_ptr_types,
  };
  
  static int check_reg_type(struct bpf_verifier_env *env, u32 regno,
@@@ -4812,8 -4738,6 +4812,8 @@@ skip_type_check
                        verbose(env, "verifier internal error\n");
                        return -EFAULT;
                }
 +      } else if (arg_type == ARG_PTR_TO_FUNC) {
 +              meta->subprogno = reg->subprogno;
        } else if (arg_type_is_mem_ptr(arg_type)) {
                /* The access to this pointer is only checked when we hit the
                 * next is_mem_size argument below.
@@@ -5334,19 -5258,13 +5334,19 @@@ static void clear_caller_saved_regs(str
        }
  }
  
 -static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 -                         int *insn_idx)
 +typedef int (*set_callee_state_fn)(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee,
 +                                 int insn_idx);
 +
 +static int __check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                           int *insn_idx, int subprog,
 +                           set_callee_state_fn set_callee_state_cb)
  {
        struct bpf_verifier_state *state = env->cur_state;
        struct bpf_func_info_aux *func_info_aux;
        struct bpf_func_state *caller, *callee;
 -      int i, err, subprog, target_insn;
 +      int err;
        bool is_global = false;
  
        if (state->curframe + 1 >= MAX_CALL_FRAMES) {
                return -E2BIG;
        }
  
 -      target_insn = *insn_idx + insn->imm;
 -      subprog = find_subprog(env, target_insn + 1);
 -      if (subprog < 0) {
 -              verbose(env, "verifier bug. No program starts at insn %d\n",
 -                      target_insn + 1);
 -              return -EFAULT;
 -      }
 -
        caller = state->frame[state->curframe];
        if (state->frame[state->curframe + 1]) {
                verbose(env, "verifier bug. Frame %d already allocated\n",
        if (err)
                return err;
  
 -      /* copy r1 - r5 args that callee can access.  The copy includes parent
 -       * pointers, which connects us up to the liveness chain
 -       */
 -      for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 -              callee->regs[i] = caller->regs[i];
 +      err = set_callee_state_cb(env, caller, callee, *insn_idx);
 +      if (err)
 +              return err;
  
        clear_caller_saved_regs(env, caller->regs);
  
        state->curframe++;
  
        /* and go analyze first insn of the callee */
 -      *insn_idx = target_insn;
 +      *insn_idx = env->subprog_info[subprog].start - 1;
  
        if (env->log.level & BPF_LOG_LEVEL) {
                verbose(env, "caller:\n");
        return 0;
  }
  
 +int map_set_for_each_callback_args(struct bpf_verifier_env *env,
 +                                 struct bpf_func_state *caller,
 +                                 struct bpf_func_state *callee)
 +{
 +      /* bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn,
 +       *      void *callback_ctx, u64 flags);
 +       * callback_fn(struct bpf_map *map, void *key, void *value,
 +       *      void *callback_ctx);
 +       */
 +      callee->regs[BPF_REG_1] = caller->regs[BPF_REG_1];
 +
 +      callee->regs[BPF_REG_2].type = PTR_TO_MAP_KEY;
 +      __mark_reg_known_zero(&callee->regs[BPF_REG_2]);
 +      callee->regs[BPF_REG_2].map_ptr = caller->regs[BPF_REG_1].map_ptr;
 +
 +      callee->regs[BPF_REG_3].type = PTR_TO_MAP_VALUE;
 +      __mark_reg_known_zero(&callee->regs[BPF_REG_3]);
 +      callee->regs[BPF_REG_3].map_ptr = caller->regs[BPF_REG_1].map_ptr;
 +
 +      /* pointer to stack or null */
 +      callee->regs[BPF_REG_4] = caller->regs[BPF_REG_3];
 +
 +      /* unused */
 +      __mark_reg_not_init(env, &callee->regs[BPF_REG_5]);
 +      return 0;
 +}
 +
 +static int set_callee_state(struct bpf_verifier_env *env,
 +                          struct bpf_func_state *caller,
 +                          struct bpf_func_state *callee, int insn_idx)
 +{
 +      int i;
 +
 +      /* copy r1 - r5 args that callee can access.  The copy includes parent
 +       * pointers, which connects us up to the liveness chain
 +       */
 +      for (i = BPF_REG_1; i <= BPF_REG_5; i++)
 +              callee->regs[i] = caller->regs[i];
 +      return 0;
 +}
 +
 +static int check_func_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                         int *insn_idx)
 +{
 +      int subprog, target_insn;
 +
 +      target_insn = *insn_idx + insn->imm + 1;
 +      subprog = find_subprog(env, target_insn);
 +      if (subprog < 0) {
 +              verbose(env, "verifier bug. No program starts at insn %d\n",
 +                      target_insn);
 +              return -EFAULT;
 +      }
 +
 +      return __check_func_call(env, insn, insn_idx, subprog, set_callee_state);
 +}
 +
 +static int set_map_elem_callback_state(struct bpf_verifier_env *env,
 +                                     struct bpf_func_state *caller,
 +                                     struct bpf_func_state *callee,
 +                                     int insn_idx)
 +{
 +      struct bpf_insn_aux_data *insn_aux = &env->insn_aux_data[insn_idx];
 +      struct bpf_map *map;
 +      int err;
 +
 +      if (bpf_map_ptr_poisoned(insn_aux)) {
 +              verbose(env, "tail_call abusing map_ptr\n");
 +              return -EINVAL;
 +      }
 +
 +      map = BPF_MAP_PTR(insn_aux->map_ptr_state);
 +      if (!map->ops->map_set_for_each_callback_args ||
 +          !map->ops->map_for_each_callback) {
 +              verbose(env, "callback function not allowed for map\n");
 +              return -ENOTSUPP;
 +      }
 +
 +      err = map->ops->map_set_for_each_callback_args(env, caller, callee);
 +      if (err)
 +              return err;
 +
 +      callee->in_callback_fn = true;
 +      return 0;
 +}
 +
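[Editor's note] For orientation, a minimal BPF-C sketch (illustrative only, not part of this patch; the map, context struct, attach point and function names are made up) of the callback shape that set_map_elem_callback_state() and map_set_for_each_callback_args() above prepare: the callback must be a static function (see the BPF_PSEUDO_FUNC handling in check_ld_imm() further down), R2 arrives as PTR_TO_MAP_KEY and is read-only, R3 as PTR_TO_MAP_VALUE, R4 as the caller's callback_ctx, and the return value is constrained to [0, 1] by prepare_func_exit().

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_ARRAY);
		__uint(max_entries, 16);
		__type(key, __u32);
		__type(value, __u64);
	} my_map SEC(".maps");

	struct callback_ctx {
		__u64 sum;
	};

	/* R1 = map, R2 = PTR_TO_MAP_KEY (read-only), R3 = PTR_TO_MAP_VALUE,
	 * R4 = callback_ctx from the caller's stack; must be static.
	 */
	static __u64 sum_elem(struct bpf_map *map, __u32 *key, __u64 *val,
			      struct callback_ctx *data)
	{
		data->sum += *val;
		return 0;	/* 0 = continue, 1 = stop; range enforced in prepare_func_exit() */
	}

	SEC("classifier")	/* attach point is illustrative */
	int sum_map(struct __sk_buff *skb)
	{
		struct callback_ctx data = {};

		bpf_for_each_map_elem(&my_map, sum_elem, &data, 0);
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";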
  static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx)
  {
        struct bpf_verifier_state *state = env->cur_state;
  
        state->curframe--;
        caller = state->frame[state->curframe];
 -      /* return to the caller whatever r0 had in the callee */
 -      caller->regs[BPF_REG_0] = *r0;
 +      if (callee->in_callback_fn) {
 +              /* enforce R0 return value range [0, 1]. */
 +              struct tnum range = tnum_range(0, 1);
 +
 +              if (r0->type != SCALAR_VALUE) {
 +                      verbose(env, "R0 not a scalar value\n");
 +                      return -EACCES;
 +              }
 +              if (!tnum_in(range, r0->var_off)) {
 +                      verbose_invalid_scalar(env, r0, &range, "callback return", "R0");
 +                      return -EINVAL;
 +              }
 +      } else {
 +              /* return to the caller whatever r0 had in the callee */
 +              caller->regs[BPF_REG_0] = *r0;
 +      }
  
        /* Transfer references to the caller */
        err = transfer_reference_state(caller, callee);
@@@ -5608,9 -5436,7 +5608,9 @@@ record_func_map(struct bpf_verifier_en
            func_id != BPF_FUNC_map_delete_elem &&
            func_id != BPF_FUNC_map_push_elem &&
            func_id != BPF_FUNC_map_pop_elem &&
 -          func_id != BPF_FUNC_map_peek_elem)
 +          func_id != BPF_FUNC_map_peek_elem &&
 +          func_id != BPF_FUNC_for_each_map_elem &&
 +          func_id != BPF_FUNC_redirect_map)
                return 0;
  
        if (map == NULL) {
@@@ -5691,18 -5517,15 +5691,18 @@@ static int check_reference_leak(struct 
        return state->acquired_refs ? -EINVAL : 0;
  }
  
 -static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
 +static int check_helper_call(struct bpf_verifier_env *env, struct bpf_insn *insn,
 +                           int *insn_idx_p)
  {
        const struct bpf_func_proto *fn = NULL;
        struct bpf_reg_state *regs;
        struct bpf_call_arg_meta meta;
 +      int insn_idx = *insn_idx_p;
        bool changes_data;
 -      int i, err;
 +      int i, err, func_id;
  
        /* find function prototype */
 +      func_id = insn->imm;
        if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
                verbose(env, "invalid func %s#%d\n", func_id_name(func_id),
                        func_id);
  
        meta.func_id = func_id;
        /* check args */
 -      for (i = 0; i < 5; i++) {
 +      for (i = 0; i < MAX_BPF_FUNC_REG_ARGS; i++) {
                err = check_func_arg(env, i, &meta, fn);
                if (err)
                        return err;
                return -EINVAL;
        }
  
 +      if (func_id == BPF_FUNC_for_each_map_elem) {
 +              err = __check_func_call(env, insn, insn_idx_p, meta.subprogno,
 +                                      set_map_elem_callback_state);
 +              if (err < 0)
 +                      return -EINVAL;
 +      }
 +
        /* reset caller saved regs */
        for (i = 0; i < CALLER_SAVED_REGS; i++) {
                mark_reg_not_init(env, regs, caller_saved[i]);
@@@ -6045,10 -5861,14 +6045,14 @@@ static int retrieve_ptr_limit(const str
  {
        bool mask_to_left = (opcode == BPF_ADD &&  off_is_neg) ||
                            (opcode == BPF_SUB && !off_is_neg);
-       u32 off;
+       u32 off, max;
  
        switch (ptr_reg->type) {
        case PTR_TO_STACK:
+               /* Offset 0 is out-of-bounds, but acceptable start for the
+                * left direction, see BPF_REG_FP.
+                */
+               max = MAX_BPF_STACK + mask_to_left;
                /* Indirect variable offset stack access is prohibited in
                 * unprivileged mode so it's not handled here.
                 */
                if (mask_to_left)
                        *ptr_limit = MAX_BPF_STACK + off;
                else
-                       *ptr_limit = -off;
-               return 0;
+                       *ptr_limit = -off - 1;
+               return *ptr_limit >= max ? -ERANGE : 0;
 +      case PTR_TO_MAP_KEY:
 +              /* Currently, this code is not exercised as the only use
 +               * is bpf_for_each_map_elem() helper which requires
 +               * bpf_capable. The code has been tested manually for
 +               * future use.
 +               */
 +              if (mask_to_left) {
 +                      *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
 +              } else {
 +                      off = ptr_reg->smin_value + ptr_reg->off;
 +                      *ptr_limit = ptr_reg->map_ptr->key_size - off;
 +              }
 +              return 0;
        case PTR_TO_MAP_VALUE:
+               max = ptr_reg->map_ptr->value_size;
                if (mask_to_left) {
                        *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
                } else {
                        off = ptr_reg->smin_value + ptr_reg->off;
-                       *ptr_limit = ptr_reg->map_ptr->value_size - off;
+                       *ptr_limit = ptr_reg->map_ptr->value_size - off - 1;
                }
-               return 0;
+               return *ptr_limit >= max ? -ERANGE : 0;
        default:
                return -EINVAL;
        }
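[Editor's note] A worked example of the tightened limit above (the numbers are illustrative, not from the patch): for a PTR_TO_MAP_VALUE register with value_size = 16, ptr_reg->off = 0 and smin_value = 0 on a BPF_ADD with a non-negative offset, the old code set *ptr_limit = 16 and do_misc_fixups() later loaded alu_limit - 1 = 15 into BPF_REG_AX, while the new code sets *ptr_limit = 15 and loads it unchanged, so the emitted masking sequence is identical. The behavioural change is at the boundary: if the fixed offset already equals value_size, the new computation yields (u32)-1, which is >= max, so retrieve_ptr_limit() returns -ERANGE and sanitize_ptr_alu() now propagates the error instead of emitting a wrapped mask constant.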
@@@ -6101,7 -5909,7 +6106,7 @@@ static int update_alu_sanitation_state(
             aux->alu_limit != alu_limit))
                return -EACCES;
  
 -      /* Corresponding fixup done in fixup_bpf_calls(). */
 +      /* Corresponding fixup done in do_misc_fixups(). */
        aux->alu_state = alu_state;
        aux->alu_limit = alu_limit;
        return 0;
@@@ -6131,6 -5939,7 +6136,7 @@@ static int sanitize_ptr_alu(struct bpf_
        u32 alu_state, alu_limit;
        struct bpf_reg_state tmp;
        bool ret;
+       int err;
  
        if (can_skip_alu_sanitation(env, insn))
                return 0;
        alu_state |= ptr_is_dst_reg ?
                     BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
  
-       if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
-               return 0;
-       if (update_alu_sanitation_state(aux, alu_state, alu_limit))
-               return -EACCES;
+       err = retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg);
+       if (err < 0)
+               return err;
+       err = update_alu_sanitation_state(aux, alu_state, alu_limit);
+       if (err < 0)
+               return err;
  do_sim:
        /* Simulate and find potential out-of-bounds access under
         * speculative execution from truncation as a result of
@@@ -6272,7 -6084,6 +6281,7 @@@ static int adjust_ptr_min_max_vals(stru
                verbose(env, "R%d pointer arithmetic on %s prohibited\n",
                        dst, reg_type_str[ptr_reg->type]);
                return -EACCES;
 +      case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
                        verbose(env, "R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
        case BPF_ADD:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to add from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to add from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                /* We can take a fixed offset as long as it doesn't overflow
        case BPF_SUB:
                ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
                if (ret < 0) {
-                       verbose(env, "R%d tried to sub from different maps or paths\n", dst);
+                       verbose(env, "R%d tried to sub from different maps, paths, or prohibited types\n", dst);
                        return ret;
                }
                if (dst_reg == off_reg) {
@@@ -8452,24 -8263,6 +8461,24 @@@ static int check_ld_imm(struct bpf_veri
                return 0;
        }
  
 +      if (insn->src_reg == BPF_PSEUDO_FUNC) {
 +              struct bpf_prog_aux *aux = env->prog->aux;
 +              u32 subprogno = insn[1].imm;
 +
 +              if (!aux->func_info) {
 +                      verbose(env, "missing btf func_info\n");
 +                      return -EINVAL;
 +              }
 +              if (aux->func_info_aux[subprogno].linkage != BTF_FUNC_STATIC) {
 +                      verbose(env, "callback function not static\n");
 +                      return -EINVAL;
 +              }
 +
 +              dst_reg->type = PTR_TO_FUNC;
 +              dst_reg->subprogno = subprogno;
 +              return 0;
 +      }
 +
        map = env->used_maps[aux->map_index];
        mark_reg_known_zero(env, regs, insn->dst_reg);
        dst_reg->map_ptr = map;
@@@ -8698,7 -8491,17 +8707,7 @@@ static int check_return_code(struct bpf
        }
  
        if (!tnum_in(range, reg->var_off)) {
 -              char tn_buf[48];
 -
 -              verbose(env, "At program exit the register R0 ");
 -              if (!tnum_is_unknown(reg->var_off)) {
 -                      tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
 -                      verbose(env, "has value %s", tn_buf);
 -              } else {
 -                      verbose(env, "has unknown scalar value");
 -              }
 -              tnum_strn(tn_buf, sizeof(tn_buf), range);
 -              verbose(env, " should have been in %s\n", tn_buf);
 +              verbose_invalid_scalar(env, reg, &range, "program exit", "R0");
                return -EINVAL;
        }
  
@@@ -8825,27 -8628,6 +8834,27 @@@ static int push_insn(int t, int w, int 
        return DONE_EXPLORING;
  }
  
 +static int visit_func_call_insn(int t, int insn_cnt,
 +                              struct bpf_insn *insns,
 +                              struct bpf_verifier_env *env,
 +                              bool visit_callee)
 +{
 +      int ret;
 +
 +      ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 +      if (ret)
 +              return ret;
 +
 +      if (t + 1 < insn_cnt)
 +              init_explored_state(env, t + 1);
 +      if (visit_callee) {
 +              init_explored_state(env, t);
 +              ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
 +                              env, false);
 +      }
 +      return ret;
 +}
 +
  /* Visits the instruction at index t and returns one of the following:
   *  < 0 - an error occurred
   *  DONE_EXPLORING - the instruction was fully explored
@@@ -8856,9 -8638,6 +8865,9 @@@ static int visit_insn(int t, int insn_c
        struct bpf_insn *insns = env->prog->insnsi;
        int ret;
  
 +      if (bpf_pseudo_func(insns + t))
 +              return visit_func_call_insn(t, insn_cnt, insns, env, true);
 +
        /* All non-branch instructions have a single fall-through edge. */
        if (BPF_CLASS(insns[t].code) != BPF_JMP &&
            BPF_CLASS(insns[t].code) != BPF_JMP32)
                return DONE_EXPLORING;
  
        case BPF_CALL:
 -              ret = push_insn(t, t + 1, FALLTHROUGH, env, false);
 -              if (ret)
 -                      return ret;
 -
 -              if (t + 1 < insn_cnt)
 -                      init_explored_state(env, t + 1);
 -              if (insns[t].src_reg == BPF_PSEUDO_CALL) {
 -                      init_explored_state(env, t);
 -                      ret = push_insn(t, t + insns[t].imm + 1, BRANCH,
 -                                      env, false);
 -              }
 -              return ret;
 +              return visit_func_call_insn(t, insn_cnt, insns, env,
 +                                          insns[t].src_reg == BPF_PSEUDO_CALL);
  
        case BPF_JA:
                if (BPF_SRC(insns[t].code) != BPF_K)
@@@ -9276,6 -9065,10 +9285,10 @@@ static int check_btf_info(struct bpf_ve
        btf = btf_get_by_fd(attr->prog_btf_fd);
        if (IS_ERR(btf))
                return PTR_ERR(btf);
+       if (btf_is_kernel(btf)) {
+               btf_put(btf);
+               return -EACCES;
+       }
        env->prog->aux->btf = btf;
  
        err = check_btf_func(env, attr, uattr);
@@@ -9479,7 -9272,6 +9492,7 @@@ static bool regsafe(struct bpf_reg_stat
                         */
                        return false;
                }
 +      case PTR_TO_MAP_KEY:
        case PTR_TO_MAP_VALUE:
                /* If the new min/max/var_off satisfy the old ones and
                 * everything else matches, we are OK.
@@@ -10326,9 -10118,10 +10339,9 @@@ static int do_check(struct bpf_verifier
                                if (insn->src_reg == BPF_PSEUDO_CALL)
                                        err = check_func_call(env, insn, &env->insn_idx);
                                else
 -                                      err = check_helper_call(env, insn->imm, env->insn_idx);
 +                                      err = check_helper_call(env, insn, &env->insn_idx);
                                if (err)
                                        return err;
 -
                        } else if (opcode == BPF_JA) {
                                if (BPF_SRC(insn->code) != BPF_K ||
                                    insn->imm != 0 ||
@@@ -10757,12 -10550,6 +10770,12 @@@ static int resolve_pseudo_ldimm64(struc
                                goto next_insn;
                        }
  
 +                      if (insn[0].src_reg == BPF_PSEUDO_FUNC) {
 +                              aux = &env->insn_aux_data[i];
 +                              aux->ptr_type = PTR_TO_FUNC;
 +                              goto next_insn;
 +                      }
 +
                        /* In final convert_pseudo_ld_imm64() step, this is
                         * converted into regular 64-bit imm load insn.
                         */
@@@ -10895,13 -10682,9 +10908,13 @@@ static void convert_pseudo_ld_imm64(str
        int insn_cnt = env->prog->len;
        int i;
  
 -      for (i = 0; i < insn_cnt; i++, insn++)
 -              if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
 -                      insn->src_reg = 0;
 +      for (i = 0; i < insn_cnt; i++, insn++) {
 +              if (insn->code != (BPF_LD | BPF_IMM | BPF_DW))
 +                      continue;
 +              if (insn->src_reg == BPF_PSEUDO_FUNC)
 +                      continue;
 +              insn->src_reg = 0;
 +      }
  }
  
  /* single env->prog->insni[off] instruction was replaced with the range
@@@ -11540,12 -11323,6 +11553,12 @@@ static int jit_subprogs(struct bpf_veri
                return 0;
  
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      env->insn_aux_data[i].call_imm = insn->imm;
 +                      /* subprog is encoded in insn[1].imm */
 +                      continue;
 +              }
 +
                if (!bpf_pseudo_call(insn))
                        continue;
                /* Upon error here we cannot fall back to interpreter but
        for (i = 0; i < env->subprog_cnt; i++) {
                insn = func[i]->insnsi;
                for (j = 0; j < func[i]->len; j++, insn++) {
 +                      if (bpf_pseudo_func(insn)) {
 +                              subprog = insn[1].imm;
 +                              insn[0].imm = (u32)(long)func[subprog]->bpf_func;
 +                              insn[1].imm = ((u64)(long)func[subprog]->bpf_func) >> 32;
 +                              continue;
 +                      }
                        if (!bpf_pseudo_call(insn))
                                continue;
                        subprog = insn->off;
         * later look the same as if they were interpreted only.
         */
        for (i = 0, insn = prog->insnsi; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      insn[0].imm = env->insn_aux_data[i].call_imm;
 +                      insn[1].imm = find_subprog(env, i + insn[0].imm + 1);
 +                      continue;
 +              }
                if (!bpf_pseudo_call(insn))
                        continue;
                insn->off = env->insn_aux_data[i].call_imm;
@@@ -11795,14 -11561,6 +11808,14 @@@ static int fixup_call_args(struct bpf_v
                return -EINVAL;
        }
        for (i = 0; i < prog->len; i++, insn++) {
 +              if (bpf_pseudo_func(insn)) {
 +                      /* When JIT fails, the progs with callback calls
 +                       * have to be rejected, since the interpreter doesn't support them yet.
 +                       */
 +                      verbose(env, "callbacks are not allowed in non-JITed programs\n");
 +                      return -EINVAL;
 +              }
 +
                if (!bpf_pseudo_call(insn))
                        continue;
                depth = get_callee_stack_depth(env, insn, i);
        return err;
  }
  
 -/* fixup insn->imm field of bpf_call instructions
 - * and inline eligible helpers as explicit sequence of BPF instructions
 - *
 - * this function is called after eBPF program passed verification
 +/* Do various post-verification rewrites in a single program pass.
 + * These rewrites simplify JIT and interpreter implementations.
   */
 -static int fixup_bpf_calls(struct bpf_verifier_env *env)
 +static int do_misc_fixups(struct bpf_verifier_env *env)
  {
        struct bpf_prog *prog = env->prog;
        bool expect_blinding = bpf_jit_blinding_enabled(prog);
        int i, ret, cnt, delta = 0;
  
        for (i = 0; i < insn_cnt; i++, insn++) {
 +              /* Make divide-by-zero exceptions impossible. */
                if (insn->code == (BPF_ALU64 | BPF_MOD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_DIV | BPF_X) ||
                    insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
                        continue;
                }
  
 +              /* Implement LD_ABS and LD_IND with a rewrite, if supported by the program type. */
                if (BPF_CLASS(insn->code) == BPF_LD &&
                    (BPF_MODE(insn->code) == BPF_ABS ||
                     BPF_MODE(insn->code) == BPF_IND)) {
                        continue;
                }
  
 +              /* Rewrite pointer arithmetic to mitigate speculation attacks. */
                if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
                    insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
                        const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
                        off_reg = issrc ? insn->src_reg : insn->dst_reg;
                        if (isneg)
                                *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
-                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+                       *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit);
                        *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
                        *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
                     insn->imm == BPF_FUNC_map_delete_elem ||
                     insn->imm == BPF_FUNC_map_push_elem   ||
                     insn->imm == BPF_FUNC_map_pop_elem    ||
 -                   insn->imm == BPF_FUNC_map_peek_elem)) {
 +                   insn->imm == BPF_FUNC_map_peek_elem   ||
 +                   insn->imm == BPF_FUNC_redirect_map)) {
                        aux = &env->insn_aux_data[i + delta];
                        if (bpf_map_ptr_poisoned(aux))
                                goto patch_call_imm;
                                     (int (*)(struct bpf_map *map, void *value))NULL));
                        BUILD_BUG_ON(!__same_type(ops->map_peek_elem,
                                     (int (*)(struct bpf_map *map, void *value))NULL));
 +                      BUILD_BUG_ON(!__same_type(ops->map_redirect,
 +                                   (int (*)(struct bpf_map *map, u32 ifindex, u64 flags))NULL));
 +
  patch_map_ops_generic:
                        switch (insn->imm) {
                        case BPF_FUNC_map_lookup_elem:
                                insn->imm = BPF_CAST_CALL(ops->map_peek_elem) -
                                            __bpf_call_base;
                                continue;
 +                      case BPF_FUNC_redirect_map:
 +                              insn->imm = BPF_CAST_CALL(ops->map_redirect) -
 +                                          __bpf_call_base;
 +                              continue;
                        }
  
                        goto patch_call_imm;
                }
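[Editor's note] For orientation, the kind of call this map-ops inlining serves — a hedged XDP sketch (map layout, names and the fallback-action flag usage are illustrative, not taken from this patch) whose bpf_redirect_map() invocation has its imm rewritten to the map's ->map_redirect op by the block above:

	#include <linux/bpf.h>
	#include <bpf/bpf_helpers.h>

	struct {
		__uint(type, BPF_MAP_TYPE_DEVMAP);
		__uint(max_entries, 1);
		__type(key, __u32);
		__type(value, __u32);	/* target ifindex */
	} tx_port SEC(".maps");

	SEC("xdp")
	int xdp_redirect_slot0(struct xdp_md *ctx)
	{
		/* redirect everything to the device stored at slot 0,
		 * falling back to XDP_PASS if the slot is empty
		 */
		return bpf_redirect_map(&tx_port, 0, XDP_PASS);
	}

	char LICENSE[] SEC("license") = "GPL";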
  
 +              /* Implement bpf_jiffies64 inline. */
                if (prog->jit_requested && BITS_PER_LONG == 64 &&
                    insn->imm == BPF_FUNC_jiffies64) {
                        struct bpf_insn ld_jiffies_addr[2] = {
@@@ -12935,7 -12683,7 +12948,7 @@@ skip_full_check
                ret = convert_ctx_accesses(env);
  
        if (ret == 0)
 -              ret = fixup_bpf_calls(env);
 +              ret = do_misc_fixups(env);
  
        /* do 32-bit optimization after insn patching has done so those patched
         * insns could be handled correctly.
diff --combined kernel/fork.c
@@@ -96,7 -96,6 +96,7 @@@
  #include <linux/kasan.h>
  #include <linux/scs.h>
  #include <linux/io_uring.h>
 +#include <linux/bpf.h>
  
  #include <asm/pgalloc.h>
  #include <linux/uaccess.h>
@@@ -735,7 -734,6 +735,7 @@@ void __put_task_struct(struct task_stru
        cgroup_free(tsk);
        task_numa_free(tsk, true);
        security_task_free(tsk);
 +      bpf_task_storage_free(tsk);
        exit_creds(tsk);
        delayacct_tsk_free(tsk);
        put_signal_struct(tsk->signal);
@@@ -996,6 -994,13 +996,13 @@@ static void mm_init_owner(struct mm_str
  #endif
  }
  
+ static void mm_init_pasid(struct mm_struct *mm)
+ {
+ #ifdef CONFIG_IOMMU_SUPPORT
+       mm->pasid = INIT_PASID;
+ #endif
+ }
  static void mm_init_uprobes_state(struct mm_struct *mm)
  {
  #ifdef CONFIG_UPROBES
@@@ -1026,6 -1031,7 +1033,7 @@@ static struct mm_struct *mm_init(struc
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
+       mm_init_pasid(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
@@@ -2066,9 -2072,6 +2074,9 @@@ static __latent_entropy struct task_str
        p->sequential_io        = 0;
        p->sequential_io_avg    = 0;
  #endif
 +#ifdef CONFIG_BPF_SYSCALL
 +      RCU_INIT_POINTER(p->bpf_storage, NULL);
 +#endif
  
        /* Perform scheduler related setup. Assign this task to a CPU. */
        retval = sched_fork(clone_flags, p);
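[Editor's note] The kernel/fork.c hunks above hook task-local BPF storage into task lifetime: the pointer starts out NULL in the child and is freed in __put_task_struct(). A hedged BPF-C sketch of the consumer side — the map name, value layout and attach point are illustrative, while BPF_MAP_TYPE_TASK_STORAGE and bpf_task_storage_get() are the existing task-storage pieces this series enables for tracing programs:

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>
	#include <bpf/bpf_tracing.h>

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, __u64);
	} exec_count SEC(".maps");

	SEC("tp_btf/sched_process_exec")
	int BPF_PROG(count_exec, struct task_struct *p, pid_t old_pid,
		     struct linux_binprm *bprm)
	{
		__u64 *cnt;

		/* per-task counter, created on first use */
		cnt = bpf_task_storage_get(&exec_count, p, 0,
					   BPF_LOCAL_STORAGE_GET_F_CREATE);
		if (cnt)
			(*cnt)++;
		return 0;
	}

	char LICENSE[] SEC("license") = "GPL";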
diff --combined net/core/dev.c
@@@ -848,52 -848,6 +848,52 @@@ int dev_fill_metadata_dst(struct net_de
  }
  EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
  
 +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack)
 +{
 +      int k = stack->num_paths++;
 +
 +      if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX))
 +              return NULL;
 +
 +      return &stack->path[k];
 +}
 +
 +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr,
 +                        struct net_device_path_stack *stack)
 +{
 +      const struct net_device *last_dev;
 +      struct net_device_path_ctx ctx = {
 +              .dev    = dev,
 +              .daddr  = daddr,
 +      };
 +      struct net_device_path *path;
 +      int ret = 0;
 +
 +      stack->num_paths = 0;
 +      while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) {
 +              last_dev = ctx.dev;
 +              path = dev_fwd_path(stack);
 +              if (!path)
 +                      return -1;
 +
 +              memset(path, 0, sizeof(struct net_device_path));
 +              ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path);
 +              if (ret < 0)
 +                      return -1;
 +
 +              if (WARN_ON_ONCE(last_dev == ctx.dev))
 +                      return -1;
 +      }
 +      path = dev_fwd_path(stack);
 +      if (!path)
 +              return -1;
 +      path->type = DEV_PATH_ETHERNET;
 +      path->dev = ctx.dev;
 +
 +      return ret;
 +}
 +EXPORT_SYMBOL_GPL(dev_fill_forward_path);
 +
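[Editor's note] A hedged sketch of how a caller of the new dev_fill_forward_path() above might walk the resolved path stack; the surrounding function is hypothetical, the struct fields (num_paths, path[].type, path[].dev) are the ones introduced by this series:

	/* Illustrative only; not part of this patch. */
	static void dump_forward_path(const struct net_device *dev, const u8 *daddr)
	{
		struct net_device_path_stack stack;
		int i;

		if (dev_fill_forward_path(dev, daddr, &stack) < 0)
			return;	/* a device in the chain has no ndo_fill_forward_path */

		for (i = 0; i < stack.num_paths; i++)
			pr_debug("hop %d: type %d via %s\n", i,
				 stack.path[i].type,
				 stack.path[i].dev ? stack.path[i].dev->name : "?");
	}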
  /**
   *    __dev_get_by_name       - find a device by its name
   *    @net: the applicable net namespace
@@@ -1230,6 -1184,18 +1230,18 @@@ static int __dev_alloc_name(struct net 
                        return -ENOMEM;
  
                for_each_netdev(net, d) {
+                       struct netdev_name_node *name_node;
+                       list_for_each_entry(name_node, &d->name_node->list, list) {
+                               if (!sscanf(name_node->name, name, &i))
+                                       continue;
+                               if (i < 0 || i >= max_netdevices)
+                                       continue;
+                               /*  avoid cases where sscanf is not exact inverse of printf */
+                               snprintf(buf, IFNAMSIZ, name, i);
+                               if (!strncmp(buf, name_node->name, IFNAMSIZ))
+                                       set_bit(i, inuse);
+                       }
                        if (!sscanf(d->name, name, &i))
                                continue;
                        if (i < 0 || i >= max_netdevices)
@@@ -2497,14 -2463,16 +2509,14 @@@ int netdev_txq_to_tc(struct net_device 
  EXPORT_SYMBOL(netdev_txq_to_tc);
  
  #ifdef CONFIG_XPS
 -struct static_key xps_needed __read_mostly;
 -EXPORT_SYMBOL(xps_needed);
 -struct static_key xps_rxqs_needed __read_mostly;
 -EXPORT_SYMBOL(xps_rxqs_needed);
 +static struct static_key xps_needed __read_mostly;
 +static struct static_key xps_rxqs_needed __read_mostly;
  static DEFINE_MUTEX(xps_map_mutex);
  #define xmap_dereference(P)           \
        rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
  
  static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
 -                           int tci, u16 index)
 +                           struct xps_dev_maps *old_maps, int tci, u16 index)
  {
        struct xps_map *map = NULL;
        int pos;
                        break;
                }
  
 +              if (old_maps)
 +                      RCU_INIT_POINTER(old_maps->attr_map[tci], NULL);
                RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
                kfree_rcu(map, rcu);
                return false;
@@@ -2537,7 -2503,7 +2549,7 @@@ static bool remove_xps_queue_cpu(struc
                                 struct xps_dev_maps *dev_maps,
                                 int cpu, u16 offset, u16 count)
  {
 -      int num_tc = dev->num_tc ? : 1;
 +      int num_tc = dev_maps->num_tc;
        bool active = false;
        int tci;
  
                int i, j;
  
                for (i = count, j = offset; i--; j++) {
 -                      if (!remove_xps_queue(dev_maps, tci, j))
 +                      if (!remove_xps_queue(dev_maps, NULL, tci, j))
                                break;
                }
  
  
  static void reset_xps_maps(struct net_device *dev,
                           struct xps_dev_maps *dev_maps,
 -                         bool is_rxqs_map)
 +                         enum xps_map_type type)
  {
 -      if (is_rxqs_map) {
 -              static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 -              RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
 -      } else {
 -              RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
 -      }
        static_key_slow_dec_cpuslocked(&xps_needed);
 +      if (type == XPS_RXQS)
 +              static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
 +
 +      RCU_INIT_POINTER(dev->xps_maps[type], NULL);
 +
        kfree_rcu(dev_maps, rcu);
  }
  
 -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
 -                         struct xps_dev_maps *dev_maps, unsigned int nr_ids,
 -                         u16 offset, u16 count, bool is_rxqs_map)
 +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type,
 +                         u16 offset, u16 count)
  {
 +      struct xps_dev_maps *dev_maps;
        bool active = false;
        int i, j;
  
 -      for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
 -           j < nr_ids;)
 -              active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
 -                                             count);
 +      dev_maps = xmap_dereference(dev->xps_maps[type]);
 +      if (!dev_maps)
 +              return;
 +
 +      for (j = 0; j < dev_maps->nr_ids; j++)
 +              active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count);
        if (!active)
 -              reset_xps_maps(dev, dev_maps, is_rxqs_map);
 +              reset_xps_maps(dev, dev_maps, type);
  
 -      if (!is_rxqs_map) {
 -              for (i = offset + (count - 1); count--; i--) {
 +      if (type == XPS_CPUS) {
 +              for (i = offset + (count - 1); count--; i--)
                        netdev_queue_numa_node_write(
 -                              netdev_get_tx_queue(dev, i),
 -                              NUMA_NO_NODE);
 -              }
 +                              netdev_get_tx_queue(dev, i), NUMA_NO_NODE);
        }
  }
  
  static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
                                   u16 count)
  {
 -      const unsigned long *possible_mask = NULL;
 -      struct xps_dev_maps *dev_maps;
 -      unsigned int nr_ids;
 -
        if (!static_key_false(&xps_needed))
                return;
  
        cpus_read_lock();
        mutex_lock(&xps_map_mutex);
  
 -      if (static_key_false(&xps_rxqs_needed)) {
 -              dev_maps = xmap_dereference(dev->xps_rxqs_map);
 -              if (dev_maps) {
 -                      nr_ids = dev->num_rx_queues;
 -                      clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
 -                                     offset, count, true);
 -              }
 -      }
 -
 -      dev_maps = xmap_dereference(dev->xps_cpus_map);
 -      if (!dev_maps)
 -              goto out_no_maps;
 +      if (static_key_false(&xps_rxqs_needed))
 +              clean_xps_maps(dev, XPS_RXQS, offset, count);
  
 -      if (num_possible_cpus() > 1)
 -              possible_mask = cpumask_bits(cpu_possible_mask);
 -      nr_ids = nr_cpu_ids;
 -      clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
 -                     false);
 +      clean_xps_maps(dev, XPS_CPUS, offset, count);
  
 -out_no_maps:
        mutex_unlock(&xps_map_mutex);
        cpus_read_unlock();
  }
@@@ -2654,35 -2640,16 +2666,35 @@@ static struct xps_map *expand_xps_map(s
        return new_map;
  }
  
 +/* Copy xps maps at a given index */
 +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps,
 +                            struct xps_dev_maps *new_dev_maps, int index,
 +                            int tc, bool skip_tc)
 +{
 +      int i, tci = index * dev_maps->num_tc;
 +      struct xps_map *map;
 +
 +      /* copy maps belonging to foreign traffic classes */
 +      for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 +              if (i == tc && skip_tc)
 +                      continue;
 +
 +              /* fill in the new device map from the old device map */
 +              map = xmap_dereference(dev_maps->attr_map[tci]);
 +              RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 +      }
 +}
 +
  /* Must be called under cpus_read_lock */
  int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
 -                        u16 index, bool is_rxqs_map)
 +                        u16 index, enum xps_map_type type)
  {
 -      const unsigned long *online_mask = NULL, *possible_mask = NULL;
 -      struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
 +      struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL;
 +      const unsigned long *online_mask = NULL;
 +      bool active = false, copy = false;
        int i, j, tci, numa_node_id = -2;
        int maps_sz, num_tc = 1, tc = 0;
        struct xps_map *map, *new_map;
 -      bool active = false;
        unsigned int nr_ids;
  
        if (dev->num_tc) {
        }
  
        mutex_lock(&xps_map_mutex);
 -      if (is_rxqs_map) {
 +
 +      dev_maps = xmap_dereference(dev->xps_maps[type]);
 +      if (type == XPS_RXQS) {
                maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
 -              dev_maps = xmap_dereference(dev->xps_rxqs_map);
                nr_ids = dev->num_rx_queues;
        } else {
                maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
 -              if (num_possible_cpus() > 1) {
 +              if (num_possible_cpus() > 1)
                        online_mask = cpumask_bits(cpu_online_mask);
 -                      possible_mask = cpumask_bits(cpu_possible_mask);
 -              }
 -              dev_maps = xmap_dereference(dev->xps_cpus_map);
                nr_ids = nr_cpu_ids;
        }
  
        if (maps_sz < L1_CACHE_BYTES)
                maps_sz = L1_CACHE_BYTES;
  
 +      /* The old dev_maps could be larger or smaller than the one we're
 +       * setting up now, as dev->num_tc or nr_ids could have been updated in
 +       * between. We could try to be smart, but let's be safe instead and only
 +       * copy foreign traffic classes if the two map sizes match.
 +       */
 +      if (dev_maps &&
 +          dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids)
 +              copy = true;
 +
        /* allocate memory for queue storage */
        for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
             j < nr_ids;) {
 -              if (!new_dev_maps)
 -                      new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
                if (!new_dev_maps) {
 -                      mutex_unlock(&xps_map_mutex);
 -                      return -ENOMEM;
 +                      new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
 +                      if (!new_dev_maps) {
 +                              mutex_unlock(&xps_map_mutex);
 +                              return -ENOMEM;
 +                      }
 +
 +                      new_dev_maps->nr_ids = nr_ids;
 +                      new_dev_maps->num_tc = num_tc;
                }
  
                tci = j * num_tc + tc;
 -              map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
 -                               NULL;
 +              map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL;
  
 -              map = expand_xps_map(map, j, index, is_rxqs_map);
 +              map = expand_xps_map(map, j, index, type == XPS_RXQS);
                if (!map)
                        goto error;
  
        if (!dev_maps) {
                /* Increment static keys at most once per type */
                static_key_slow_inc_cpuslocked(&xps_needed);
 -              if (is_rxqs_map)
 +              if (type == XPS_RXQS)
                        static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
        }
  
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              /* copy maps belonging to foreign traffic classes */
 -              for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 -              }
 +      for (j = 0; j < nr_ids; j++) {
 +              bool skip_tc = false;
  
 -              /* We need to explicitly update tci as prevous loop
 -               * could break out early if dev_maps is NULL.
 -               */
                tci = j * num_tc + tc;
 -
                if (netif_attr_test_mask(j, mask, nr_ids) &&
                    netif_attr_test_online(j, online_mask, nr_ids)) {
                        /* add tx-queue to CPU/rx-queue maps */
                        int pos = 0;
  
 +                      skip_tc = true;
 +
                        map = xmap_dereference(new_dev_maps->attr_map[tci]);
                        while ((pos < map->len) && (map->queues[pos] != index))
                                pos++;
                        if (pos == map->len)
                                map->queues[map->len++] = index;
  #ifdef CONFIG_NUMA
 -                      if (!is_rxqs_map) {
 +                      if (type == XPS_CPUS) {
                                if (numa_node_id == -2)
                                        numa_node_id = cpu_to_node(j);
                                else if (numa_node_id != cpu_to_node(j))
                                        numa_node_id = -1;
                        }
  #endif
 -              } else if (dev_maps) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
                }
  
 -              /* copy maps belonging to foreign traffic classes */
 -              for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
 -                      /* fill in the new device map from the old device map */
 -                      map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
 -              }
 +              if (copy)
 +                      xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc,
 +                                        skip_tc);
        }
  
 -      if (is_rxqs_map)
 -              rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
 -      else
 -              rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
 +      rcu_assign_pointer(dev->xps_maps[type], new_dev_maps);
  
        /* Cleanup old maps */
        if (!dev_maps)
                goto out_no_old_maps;
  
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              for (i = num_tc, tci = j * num_tc; i--; tci++) {
 -                      new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 +      for (j = 0; j < dev_maps->nr_ids; j++) {
 +              for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) {
                        map = xmap_dereference(dev_maps->attr_map[tci]);
 -                      if (map && map != new_map)
 -                              kfree_rcu(map, rcu);
 +                      if (!map)
 +                              continue;
 +
 +                      if (copy) {
 +                              new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 +                              if (map == new_map)
 +                                      continue;
 +                      }
 +
 +                      RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
 +                      kfree_rcu(map, rcu);
                }
        }
  
 -      kfree_rcu(dev_maps, rcu);
 +      old_dev_maps = dev_maps;
  
  out_no_old_maps:
        dev_maps = new_dev_maps;
        active = true;
  
  out_no_new_maps:
 -      if (!is_rxqs_map) {
 +      if (type == XPS_CPUS)
                /* update Tx queue numa node */
                netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
                                             (numa_node_id >= 0) ?
                                             numa_node_id : NUMA_NO_NODE);
 -      }
  
        if (!dev_maps)
                goto out_no_maps;
  
        /* removes tx-queue from unused CPUs/rx-queues */
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 -              for (i = tc, tci = j * num_tc; i--; tci++)
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 -              if (!netif_attr_test_mask(j, mask, nr_ids) ||
 -                  !netif_attr_test_online(j, online_mask, nr_ids))
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 -              for (i = num_tc - tc, tci++; --i; tci++)
 -                      active |= remove_xps_queue(dev_maps, tci, index);
 +      for (j = 0; j < dev_maps->nr_ids; j++) {
 +              tci = j * dev_maps->num_tc;
 +
 +              for (i = 0; i < dev_maps->num_tc; i++, tci++) {
 +                      if (i == tc &&
 +                          netif_attr_test_mask(j, mask, dev_maps->nr_ids) &&
 +                          netif_attr_test_online(j, online_mask, dev_maps->nr_ids))
 +                              continue;
 +
 +                      active |= remove_xps_queue(dev_maps,
 +                                                 copy ? old_dev_maps : NULL,
 +                                                 tci, index);
 +              }
        }
  
 +      if (old_dev_maps)
 +              kfree_rcu(old_dev_maps, rcu);
 +
        /* free map if not active */
        if (!active)
 -              reset_xps_maps(dev, dev_maps, is_rxqs_map);
 +              reset_xps_maps(dev, dev_maps, type);
  
  out_no_maps:
        mutex_unlock(&xps_map_mutex);
        return 0;
  error:
        /* remove any maps that we added */
 -      for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
 -           j < nr_ids;) {
 +      for (j = 0; j < nr_ids; j++) {
                for (i = num_tc, tci = j * num_tc; i--; tci++) {
                        new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
 -                      map = dev_maps ?
 +                      map = copy ?
                              xmap_dereference(dev_maps->attr_map[tci]) :
                              NULL;
                        if (new_map && new_map != map)
@@@ -2882,7 -2845,7 +2894,7 @@@ int netif_set_xps_queue(struct net_devi
        int ret;
  
        cpus_read_lock();
 -      ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
 +      ret =  __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS);
        cpus_read_unlock();
  
        return ret;
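[Editor's note] For reference, the driver-facing entry point is unchanged by the XPS rework; a hedged sketch (queue count and CPU mapping are made up) of a driver pinning each TX queue to one CPU through the wrapper above, which now lands in dev->xps_maps[XPS_CPUS]:

	/* Illustrative only; not part of this patch. */
	static void example_setup_xps(struct net_device *dev)
	{
		int qid;

		for (qid = 0; qid < dev->real_num_tx_queues; qid++)
			netif_set_xps_queue(dev, cpumask_of(qid % num_online_cpus()),
					    qid);
	}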
@@@ -3993,15 -3956,13 +4005,15 @@@ sch_handle_egress(struct sk_buff *skb, 
  static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
                               struct xps_dev_maps *dev_maps, unsigned int tci)
  {
 +      int tc = netdev_get_prio_tc_map(dev, skb->priority);
        struct xps_map *map;
        int queue_index = -1;
  
 -      if (dev->num_tc) {
 -              tci *= dev->num_tc;
 -              tci += netdev_get_prio_tc_map(dev, skb->priority);
 -      }
 +      if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids)
 +              return queue_index;
 +
 +      tci *= dev_maps->num_tc;
 +      tci += tc;
  
        map = rcu_dereference(dev_maps->attr_map[tci]);
        if (map) {
@@@ -4032,18 -3993,18 +4044,18 @@@ static int get_xps_queue(struct net_dev
        if (!static_key_false(&xps_rxqs_needed))
                goto get_cpus_map;
  
 -      dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
 +      dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]);
        if (dev_maps) {
                int tci = sk_rx_queue_get(sk);
  
 -              if (tci >= 0 && tci < dev->num_rx_queues)
 +              if (tci >= 0)
                        queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
                                                          tci);
        }
  
  get_cpus_map:
        if (queue_index < 0) {
 -              dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
 +              dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]);
                if (dev_maps) {
                        unsigned int tci = skb->sender_cpu - 1;
  
@@@ -4345,6 -4306,13 +4357,13 @@@ static inline void ____napi_schedule(st
                 */
                thread = READ_ONCE(napi->thread);
                if (thread) {
+                       /* Avoid doing set_bit() if the thread is in
+                        * INTERRUPTIBLE state, because napi_thread_wait()
+                        * makes sure to proceed with napi polling
+                        * if the thread is explicitly woken from here.
+                        */
+                       if (READ_ONCE(thread->state) != TASK_INTERRUPTIBLE)
+                               set_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
                        wake_up_process(thread);
                        return;
                }
@@@ -5316,7 -5284,6 +5335,7 @@@ skip_classify
                        goto another_round;
                case RX_HANDLER_EXACT:
                        deliver_exact = true;
 +                      break;
                case RX_HANDLER_PASS:
                        break;
                default:
@@@ -5909,13 -5876,15 +5928,13 @@@ void napi_gro_flush(struct napi_struct 
  }
  EXPORT_SYMBOL(napi_gro_flush);
  
 -static struct list_head *gro_list_prepare(struct napi_struct *napi,
 -                                        struct sk_buff *skb)
 +static void gro_list_prepare(const struct list_head *head,
 +                           const struct sk_buff *skb)
  {
        unsigned int maclen = skb->dev->hard_header_len;
        u32 hash = skb_get_hash_raw(skb);
 -      struct list_head *head;
        struct sk_buff *p;
  
 -      head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
        list_for_each_entry(p, head, list) {
                unsigned long diffs;
  
                                       maclen);
                NAPI_GRO_CB(p)->same_flow = !diffs;
        }
 -
 -      return head;
  }
  
  static void skb_gro_reset_offset(struct sk_buff *skb)
@@@ -6003,11 -5974,11 +6022,11 @@@ static void gro_flush_oldest(struct nap
  
  static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
  {
 -      u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 +      u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
 +      struct gro_list *gro_list = &napi->gro_hash[bucket];
        struct list_head *head = &offload_base;
        struct packet_offload *ptype;
        __be16 type = skb->protocol;
 -      struct list_head *gro_head;
        struct sk_buff *pp = NULL;
        enum gro_result ret;
        int same_flow;
        if (netif_elide_gro(skb->dev))
                goto normal;
  
 -      gro_head = gro_list_prepare(napi, skb);
 +      gro_list_prepare(&gro_list->list, skb);
  
        rcu_read_lock();
        list_for_each_entry_rcu(ptype, head, list) {
  
                pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
                                        ipv6_gro_receive, inet_gro_receive,
 -                                      gro_head, skb);
 +                                      &gro_list->list, skb);
                break;
        }
        rcu_read_unlock();
        if (pp) {
                skb_list_del_init(pp);
                napi_gro_complete(napi, pp);
 -              napi->gro_hash[hash].count--;
 +              gro_list->count--;
        }
  
        if (same_flow)
        if (NAPI_GRO_CB(skb)->flush)
                goto normal;
  
 -      if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
 -              gro_flush_oldest(napi, gro_head);
 -      } else {
 -              napi->gro_hash[hash].count++;
 -      }
 +      if (unlikely(gro_list->count >= MAX_GRO_SKBS))
 +              gro_flush_oldest(napi, &gro_list->list);
 +      else
 +              gro_list->count++;
 +
        NAPI_GRO_CB(skb)->count = 1;
        NAPI_GRO_CB(skb)->age = jiffies;
        NAPI_GRO_CB(skb)->last = skb;
        skb_shinfo(skb)->gso_size = skb_gro_len(skb);
 -      list_add(&skb->list, gro_head);
 +      list_add(&skb->list, &gro_list->list);
        ret = GRO_HELD;
  
  pull:
        if (grow > 0)
                gro_pull_from_frag0(skb, grow);
  ok:
 -      if (napi->gro_hash[hash].count) {
 -              if (!test_bit(hash, &napi->gro_bitmask))
 -                      __set_bit(hash, &napi->gro_bitmask);
 -      } else if (test_bit(hash, &napi->gro_bitmask)) {
 -              __clear_bit(hash, &napi->gro_bitmask);
 +      if (gro_list->count) {
 +              if (!test_bit(bucket, &napi->gro_bitmask))
 +                      __set_bit(bucket, &napi->gro_bitmask);
 +      } else if (test_bit(bucket, &napi->gro_bitmask)) {
 +              __clear_bit(bucket, &napi->gro_bitmask);
        }
  
        return ret;
@@@ -6534,6 -6505,7 +6553,7 @@@ bool napi_complete_done(struct napi_str
                WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
  
                new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
+                             NAPIF_STATE_SCHED_THREADED |
                              NAPIF_STATE_PREFER_BUSY_POLL);
  
                /* If STATE_MISSED was set, leave STATE_SCHED set,
@@@ -6817,7 -6789,6 +6837,7 @@@ int dev_set_threaded(struct net_device 
  
        return err;
  }
 +EXPORT_SYMBOL(dev_set_threaded);
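[Editor's note] With dev_set_threaded() now exported, a driver can opt its NAPI instances into kthread polling; a hedged sketch of a driver doing so at open time (the surrounding ndo_open-style function is hypothetical):

	/* Illustrative only; not part of this patch. */
	static int example_open(struct net_device *dev)
	{
		int err;

		/* run this device's NAPI polling in kthreads instead of softirq */
		err = dev_set_threaded(dev, true);
		if (err)
			netdev_warn(dev, "could not enable threaded NAPI: %d\n", err);

		return 0;
	}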
  
  void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
                    int (*poll)(struct napi_struct *, int), int weight)
@@@ -7017,16 -6988,25 +7037,25 @@@ static int napi_poll(struct napi_struc
  
  static int napi_thread_wait(struct napi_struct *napi)
  {
+       bool woken = false;
        set_current_state(TASK_INTERRUPTIBLE);
  
        while (!kthread_should_stop() && !napi_disable_pending(napi)) {
-               if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+               /* Testing SCHED_THREADED bit here to make sure the current
+                * kthread owns this napi and could poll on this napi.
+                * Testing SCHED bit is not enough because SCHED bit might be
+                * set by some other busy poll thread or by napi_disable().
+                */
+               if (test_bit(NAPI_STATE_SCHED_THREADED, &napi->state) || woken) {
                        WARN_ON(!list_empty(&napi->poll_list));
                        __set_current_state(TASK_RUNNING);
                        return 0;
                }
  
                schedule();
+               /* woken being true indicates this thread owns this napi. */
+               woken = true;
                set_current_state(TASK_INTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
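
The loop above only starts polling once this kthread actually owns the NAPI instance: either NAPI_STATE_SCHED_THREADED is set, or the thread has already been woken once for it. The test restated as a helper (hypothetical name, a sketch only):

        static bool napi_kthread_may_poll(const struct napi_struct *napi, bool woken)
        {
                /* A bare SCHED bit may belong to a busy-poll thread or to
                 * napi_disable(); SCHED_THREADED, or a prior wakeup of this
                 * thread, is what transfers ownership here.
                 */
                return woken || test_bit(NAPI_STATE_SCHED_THREADED, &napi->state);
        }
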
@@@ -10356,20 -10336,14 +10385,20 @@@ EXPORT_SYMBOL(register_netdev)
  
  int netdev_refcnt_read(const struct net_device *dev)
  {
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        int i, refcnt = 0;
  
        for_each_possible_cpu(i)
                refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
        return refcnt;
 +#else
 +      return refcount_read(&dev->dev_refcnt);
 +#endif
  }
  EXPORT_SYMBOL(netdev_refcnt_read);
  
 +int netdev_unregister_timeout_secs __read_mostly = 10;
 +
  #define WAIT_REFS_MIN_MSECS 1
  #define WAIT_REFS_MAX_MSECS 250
  /**
@@@ -10394,7 -10368,7 +10423,7 @@@ static void netdev_wait_allrefs(struct 
        rebroadcast_time = warning_time = jiffies;
        refcnt = netdev_refcnt_read(dev);
  
 -      while (refcnt != 0) {
 +      while (refcnt != 1) {
                if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
                        rtnl_lock();
  
  
                refcnt = netdev_refcnt_read(dev);
  
 -              if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
 +              if (refcnt &&
 +                  time_after(jiffies, warning_time +
 +                             netdev_unregister_timeout_secs * HZ)) {
                        pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
                                 dev->name, refcnt);
                        warning_time = jiffies;
@@@ -10509,7 -10481,7 +10538,7 @@@ void netdev_run_todo(void
                netdev_wait_allrefs(dev);
  
                /* paranoia */
 -              BUG_ON(netdev_refcnt_read(dev));
 +              BUG_ON(netdev_refcnt_read(dev) != 1);
                BUG_ON(!list_empty(&dev->ptype_all));
                BUG_ON(!list_empty(&dev->ptype_specific));
                WARN_ON(rcu_access_pointer(dev->ip_ptr));
@@@ -10726,14 -10698,9 +10755,14 @@@ struct net_device *alloc_netdev_mqs(in
        dev = PTR_ALIGN(p, NETDEV_ALIGN);
        dev->padded = (char *)dev - (char *)p;
  
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        dev->pcpu_refcnt = alloc_percpu(int);
        if (!dev->pcpu_refcnt)
                goto free_dev;
 +      dev_hold(dev);
 +#else
 +      refcount_set(&dev->dev_refcnt, 1);
 +#endif
  
        if (dev_addr_init(dev))
                goto free_pcpu;
@@@ -10797,10 -10764,8 +10826,10 @@@ free_all
        return NULL;
  
  free_pcpu:
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
  free_dev:
 +#endif
        netdev_freemem(dev);
        return NULL;
  }
@@@ -10842,10 -10807,8 +10871,10 @@@ void free_netdev(struct net_device *dev
        list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
                netif_napi_del(p);
  
 +#ifdef CONFIG_PCPU_DEV_REFCNT
        free_percpu(dev->pcpu_refcnt);
        dev->pcpu_refcnt = NULL;
 +#endif
        free_percpu(dev->xdp_bulkq);
        dev->xdp_bulkq = NULL;
  
@@@ -11412,7 -11375,7 +11441,7 @@@ static void __net_exit default_device_e
                        continue;
  
                /* Leave virtual devices for the generic cleanup */
-               if (dev->rtnl_link_ops)
+               if (dev->rtnl_link_ops && !dev->rtnl_link_ops->netns_refund)
                        continue;
  
                /* Push remaining network devices to init_net */
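
With CONFIG_PCPU_DEV_REFCNT the device keeps its per-cpu reference counter and now takes one reference on itself at allocation time; without it a plain refcount_t starts at 1. Either way, 1 rather than 0 is the "no external users" baseline that netdev_wait_allrefs() and netdev_run_todo() check above. A consolidated sketch of the two read paths (mirrors the hunks above, names illustrative):

        static int netdev_refcnt_read_sketch(const struct net_device *dev)
        {
        #ifdef CONFIG_PCPU_DEV_REFCNT
                int cpu, refcnt = 0;

                for_each_possible_cpu(cpu)
                        refcnt += *per_cpu_ptr(dev->pcpu_refcnt, cpu);
                return refcnt;          /* includes the device's self-hold */
        #else
                return refcount_read(&dev->dev_refcnt);
        #endif
        }
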
diff --combined net/core/drop_monitor.c
@@@ -1053,6 -1053,20 +1053,20 @@@ static int net_dm_hw_monitor_start(stru
        return 0;
  
  err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *hw_data = &per_cpu(dm_hw_cpu_data, cpu);
+               struct sk_buff *skb;
+               del_timer_sync(&hw_data->send_timer);
+               cancel_work_sync(&hw_data->dm_alert_work);
+               while ((skb = __skb_dequeue(&hw_data->drop_queue))) {
+                       struct devlink_trap_metadata *hw_metadata;
+                       hw_metadata = NET_DM_SKB_CB(skb)->hw_metadata;
+                       net_dm_hw_metadata_free(hw_metadata);
+                       consume_skb(skb);
+               }
+       }
        module_put(THIS_MODULE);
        return rc;
  }
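
Both error paths in this file now unwind the per-CPU state they had already armed before dropping the module reference. The common teardown step as a hedged sketch (hypothetical helper; the hardware path additionally frees the devlink trap metadata attached to each queued skb):

        static void net_dm_cpu_teardown_sketch(struct per_cpu_dm_data *data)
        {
                struct sk_buff *skb;

                del_timer_sync(&data->send_timer);
                cancel_work_sync(&data->dm_alert_work);
                while ((skb = __skb_dequeue(&data->drop_queue)))
                        consume_skb(skb);
        }
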
@@@ -1134,6 -1148,15 +1148,15 @@@ static int net_dm_trace_on_set(struct n
  err_unregister_trace:
        unregister_trace_kfree_skb(ops->kfree_skb_probe, NULL);
  err_module_put:
+       for_each_possible_cpu(cpu) {
+               struct per_cpu_dm_data *data = &per_cpu(dm_cpu_data, cpu);
+               struct sk_buff *skb;
+               del_timer_sync(&data->send_timer);
+               cancel_work_sync(&data->dm_alert_work);
+               while ((skb = __skb_dequeue(&data->drop_queue)))
+                       consume_skb(skb);
+       }
        module_put(THIS_MODULE);
        return rc;
  }
@@@ -1731,7 -1754,7 +1754,7 @@@ static void exit_net_drop_monitor(void
  
        /*
         * Because of the module_get/put we do in the trace state change path
 -       * we are guarnateed not to have any current users when we get here
 +       * we are guaranteed not to have any current users when we get here
         */
  
        for_each_possible_cpu(cpu) {
diff --combined net/core/filter.c
@@@ -1863,7 -1863,10 +1863,7 @@@ static const struct bpf_func_proto bpf_
  static inline int sk_skb_try_make_writable(struct sk_buff *skb,
                                           unsigned int write_len)
  {
 -      int err = __bpf_try_make_writable(skb, write_len);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return err;
 +      return __bpf_try_make_writable(skb, write_len);
  }
  
  BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len)
@@@ -3409,7 -3412,6 +3409,7 @@@ static u32 bpf_skb_net_base_len(const s
                                         BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \
                                         BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \
 +                                       BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \
                                         BPF_F_ADJ_ROOM_ENCAP_L2( \
                                          BPF_ADJ_ROOM_ENCAP_L2_MASK))
  
@@@ -3446,10 -3448,6 +3446,10 @@@ static int bpf_skb_net_grow(struct sk_b
                    flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP)
                        return -EINVAL;
  
 +              if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH &&
 +                  inner_mac_len < ETH_HLEN)
 +                      return -EINVAL;
 +
                if (skb->encapsulation)
                        return -EALREADY;
  
                skb->inner_mac_header = inner_net - inner_mac_len;
                skb->inner_network_header = inner_net;
                skb->inner_transport_header = inner_trans;
 -              skb_set_inner_protocol(skb, skb->protocol);
 +
 +              if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH)
 +                      skb_set_inner_protocol(skb, htons(ETH_P_TEB));
 +              else
 +                      skb_set_inner_protocol(skb, skb->protocol);
  
                skb->encapsulation = 1;
                skb_set_network_header(skb, mac_len);
@@@ -3583,6 -3577,7 +3583,6 @@@ BPF_CALL_4(sk_skb_adjust_room, struct s
                        return -ENOMEM;
                __skb_pull(skb, len_diff_abs);
        }
 -      bpf_compute_data_end_sk_skb(skb);
        if (tls_sw_has_ctx_rx(skb->sk)) {
                struct strp_msg *rxm = strp_msg(skb);
  
@@@ -3747,7 -3742,10 +3747,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len,
           u64, flags)
  {
 -      int ret = __bpf_skb_change_tail(skb, new_len, flags);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return ret;
 +      return __bpf_skb_change_tail(skb, new_len, flags);
  }
  
  static const struct bpf_func_proto sk_skb_change_tail_proto = {
@@@ -3810,7 -3808,10 +3810,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room,
           u64, flags)
  {
 -      int ret = __bpf_skb_change_head(skb, head_room, flags);
 -
 -      bpf_compute_data_end_sk_skb(skb);
 -      return ret;
 +      return __bpf_skb_change_head(skb, head_room, flags);
  }
  
  static const struct bpf_func_proto sk_skb_change_head_proto = {
@@@ -3918,6 -3919,23 +3918,6 @@@ static const struct bpf_func_proto bpf_
        .arg2_type      = ARG_ANYTHING,
  };
  
 -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 -                          struct bpf_map *map, struct xdp_buff *xdp)
 -{
 -      switch (map->map_type) {
 -      case BPF_MAP_TYPE_DEVMAP:
 -      case BPF_MAP_TYPE_DEVMAP_HASH:
 -              return dev_map_enqueue(fwd, xdp, dev_rx);
 -      case BPF_MAP_TYPE_CPUMAP:
 -              return cpu_map_enqueue(fwd, xdp, dev_rx);
 -      case BPF_MAP_TYPE_XSKMAP:
 -              return __xsk_map_redirect(fwd, xdp);
 -      default:
 -              return -EBADRQC;
 -      }
 -      return 0;
 -}
 -
  void xdp_do_flush(void)
  {
        __dev_flush();
  }
  EXPORT_SYMBOL_GPL(xdp_do_flush);
  
 -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index)
 -{
 -      switch (map->map_type) {
 -      case BPF_MAP_TYPE_DEVMAP:
 -              return __dev_map_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_DEVMAP_HASH:
 -              return __dev_map_hash_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_CPUMAP:
 -              return __cpu_map_lookup_elem(map, index);
 -      case BPF_MAP_TYPE_XSKMAP:
 -              return __xsk_map_lookup_elem(map, index);
 -      default:
 -              return NULL;
 -      }
 -}
 -
 -void bpf_clear_redirect_map(struct bpf_map *map)
 -{
 -      struct bpf_redirect_info *ri;
 -      int cpu;
 -
 -      for_each_possible_cpu(cpu) {
 -              ri = per_cpu_ptr(&bpf_redirect_info, cpu);
 -              /* Avoid polluting remote cacheline due to writes if
 -               * not needed. Once we pass this test, we need the
 -               * cmpxchg() to make sure it hasn't been changed in
 -               * the meantime by remote CPU.
 -               */
 -              if (unlikely(READ_ONCE(ri->map) == map))
 -                      cmpxchg(&ri->map, map, NULL);
 -      }
 -}
 -
  int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp,
                    struct bpf_prog *xdp_prog)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      struct bpf_map *map = READ_ONCE(ri->map);
 -      u32 index = ri->tgt_index;
 +      enum bpf_map_type map_type = ri->map_type;
        void *fwd = ri->tgt_value;
 +      u32 map_id = ri->map_id;
        int err;
  
 -      ri->tgt_index = 0;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 +      ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
 -      if (unlikely(!map)) {
 -              fwd = dev_get_by_index_rcu(dev_net(dev), index);
 -              if (unlikely(!fwd)) {
 -                      err = -EINVAL;
 -                      goto err;
 +      switch (map_type) {
 +      case BPF_MAP_TYPE_DEVMAP:
 +              fallthrough;
 +      case BPF_MAP_TYPE_DEVMAP_HASH:
 +              err = dev_map_enqueue(fwd, xdp, dev);
 +              break;
 +      case BPF_MAP_TYPE_CPUMAP:
 +              err = cpu_map_enqueue(fwd, xdp, dev);
 +              break;
 +      case BPF_MAP_TYPE_XSKMAP:
 +              err = __xsk_map_redirect(fwd, xdp);
 +              break;
 +      case BPF_MAP_TYPE_UNSPEC:
 +              if (map_id == INT_MAX) {
 +                      fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
 +                      if (unlikely(!fwd)) {
 +                              err = -EINVAL;
 +                              break;
 +                      }
 +                      err = dev_xdp_enqueue(fwd, xdp, dev);
 +                      break;
                }
 -
 -              err = dev_xdp_enqueue(fwd, xdp, dev);
 -      } else {
 -              err = __bpf_tx_xdp_map(dev, fwd, map, xdp);
 +              fallthrough;
 +      default:
 +              err = -EBADRQC;
        }
  
        if (unlikely(err))
                goto err;
  
 -      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
 +      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
  err:
 -      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
 +      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
  }
  EXPORT_SYMBOL_GPL(xdp_do_redirect);
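
After this rework the per-CPU bpf_redirect_info carries a map type and map id instead of a map pointer, and an ifindex-only redirect is encoded as BPF_MAP_TYPE_UNSPEC with map_id == INT_MAX. Nothing changes on the BPF side; a minimal usage sketch, assuming the usual libbpf headers and a placeholder egress ifindex:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        #define IFINDEX_OUT 2   /* placeholder egress ifindex */

        SEC("xdp")
        int xdp_redirect_ifindex(struct xdp_md *ctx)
        {
                /* Taken through the BPF_MAP_TYPE_UNSPEC/INT_MAX path above. */
                return bpf_redirect(IFINDEX_OUT, 0);
        }

        char LICENSE[] SEC("license") = "GPL";
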
@@@ -3980,36 -4017,41 +3980,36 @@@ static int xdp_do_generic_redirect_map(
                                       struct sk_buff *skb,
                                       struct xdp_buff *xdp,
                                       struct bpf_prog *xdp_prog,
 -                                     struct bpf_map *map)
 +                                     void *fwd,
 +                                     enum bpf_map_type map_type, u32 map_id)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      u32 index = ri->tgt_index;
 -      void *fwd = ri->tgt_value;
 -      int err = 0;
 -
 -      ri->tgt_index = 0;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 -
 -      if (map->map_type == BPF_MAP_TYPE_DEVMAP ||
 -          map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
 -              struct bpf_dtab_netdev *dst = fwd;
 +      int err;
  
 -              err = dev_map_generic_redirect(dst, skb, xdp_prog);
 +      switch (map_type) {
 +      case BPF_MAP_TYPE_DEVMAP:
 +              fallthrough;
 +      case BPF_MAP_TYPE_DEVMAP_HASH:
 +              err = dev_map_generic_redirect(fwd, skb, xdp_prog);
                if (unlikely(err))
                        goto err;
 -      } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) {
 -              struct xdp_sock *xs = fwd;
 -
 -              err = xsk_generic_rcv(xs, xdp);
 +              break;
 +      case BPF_MAP_TYPE_XSKMAP:
 +              err = xsk_generic_rcv(fwd, xdp);
                if (err)
                        goto err;
                consume_skb(skb);
 -      } else {
 +              break;
 +      default:
                /* TODO: Handle BPF_MAP_TYPE_CPUMAP */
                err = -EBADRQC;
                goto err;
        }
  
 -      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index);
 +      _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index);
        return 0;
  err:
 -      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err);
 +      _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err);
        return err;
  }
  
@@@ -4017,34 -4059,31 +4017,34 @@@ int xdp_do_generic_redirect(struct net_
                            struct xdp_buff *xdp, struct bpf_prog *xdp_prog)
  {
        struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -      struct bpf_map *map = READ_ONCE(ri->map);
 -      u32 index = ri->tgt_index;
 -      struct net_device *fwd;
 -      int err = 0;
 -
 -      if (map)
 -              return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
 -                                                 map);
 -      ri->tgt_index = 0;
 -      fwd = dev_get_by_index_rcu(dev_net(dev), index);
 -      if (unlikely(!fwd)) {
 -              err = -EINVAL;
 -              goto err;
 -      }
 +      enum bpf_map_type map_type = ri->map_type;
 +      void *fwd = ri->tgt_value;
 +      u32 map_id = ri->map_id;
 +      int err;
  
 -      err = xdp_ok_fwd_dev(fwd, skb->len);
 -      if (unlikely(err))
 -              goto err;
 +      ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
 -      skb->dev = fwd;
 -      _trace_xdp_redirect(dev, xdp_prog, index);
 -      generic_xdp_tx(skb, xdp_prog);
 -      return 0;
 +      if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) {
 +              fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index);
 +              if (unlikely(!fwd)) {
 +                      err = -EINVAL;
 +                      goto err;
 +              }
 +
 +              err = xdp_ok_fwd_dev(fwd, skb->len);
 +              if (unlikely(err))
 +                      goto err;
 +
 +              skb->dev = fwd;
 +              _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index);
 +              generic_xdp_tx(skb, xdp_prog);
 +              return 0;
 +      }
 +
 +      return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id);
  err:
 -      _trace_xdp_redirect_err(dev, xdp_prog, index, err);
 +      _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err);
        return err;
  }
  
@@@ -4055,12 -4094,10 +4055,12 @@@ BPF_CALL_2(bpf_xdp_redirect, u32, ifind
        if (unlikely(flags))
                return XDP_ABORTED;
  
 -      ri->flags = flags;
 +      /* NB! Map type UNSPEC and map_id == INT_MAX (never generated
 +       * by map_idr) is used for ifindex based XDP redirect.
 +       */
        ri->tgt_index = ifindex;
 -      ri->tgt_value = NULL;
 -      WRITE_ONCE(ri->map, NULL);
 +      ri->map_id = INT_MAX;
 +      ri->map_type = BPF_MAP_TYPE_UNSPEC;
  
        return XDP_REDIRECT;
  }
@@@ -4076,7 -4113,28 +4076,7 @@@ static const struct bpf_func_proto bpf_
  BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
           u64, flags)
  {
 -      struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
 -
 -      /* Lower bits of the flags are used as return code on lookup failure */
 -      if (unlikely(flags > XDP_TX))
 -              return XDP_ABORTED;
 -
 -      ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
 -      if (unlikely(!ri->tgt_value)) {
 -              /* If the lookup fails we want to clear out the state in the
 -               * redirect_info struct completely, so that if an eBPF program
 -               * performs multiple lookups, the last one always takes
 -               * precedence.
 -               */
 -              WRITE_ONCE(ri->map, NULL);
 -              return flags;
 -      }
 -
 -      ri->flags = flags;
 -      ri->tgt_index = ifindex;
 -      WRITE_ONCE(ri->map, map);
 -
 -      return XDP_REDIRECT;
 +      return map->ops->map_redirect(map, ifindex, flags);
  }
  
  static const struct bpf_func_proto bpf_xdp_redirect_map_proto = {
@@@ -5600,7 -5658,7 +5600,7 @@@ BPF_CALL_5(bpf_skb_check_mtu, struct sk
        if (unlikely(flags & ~(BPF_MTU_CHK_SEGS)))
                return -EINVAL;
  
-       if (unlikely(flags & BPF_MTU_CHK_SEGS && len_diff))
+       if (unlikely(flags & BPF_MTU_CHK_SEGS && (len_diff || *mtu_len)))
                return -EINVAL;
  
        dev = __dev_via_ifindex(dev, ifindex);
        mtu = READ_ONCE(dev->mtu);
  
        dev_len = mtu + dev->hard_header_len;
-       skb_len = skb->len + len_diff; /* minus result pass check */
+       /* If set use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       skb_len = *mtu_len ? *mtu_len + dev->hard_header_len : skb->len;
+       skb_len += len_diff; /* minus result pass check */
        if (skb_len <= dev_len) {
                ret = BPF_MTU_CHK_RET_SUCCESS;
                goto out;
@@@ -5655,6 -5717,10 +5659,10 @@@ BPF_CALL_5(bpf_xdp_check_mtu, struct xd
        /* Add L2-header as dev MTU is L3 size */
        dev_len = mtu + dev->hard_header_len;
  
+       /* Use *mtu_len as input, L3 as iph->tot_len (like fib_lookup) */
+       if (*mtu_len)
+               xdp_len = *mtu_len + dev->hard_header_len;
        xdp_len += len_diff; /* minus result pass check */
        if (xdp_len > dev_len)
                ret = BPF_MTU_CHK_RET_FRAG_NEEDED;
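
With these two hunks a non-zero *mtu_len is taken as the L3 length to validate, analogous to iph->tot_len in bpf_fib_lookup, and (for the skb flavour above) may no longer be combined with BPF_MTU_CHK_SEGS. A usage sketch under those assumptions, with a hypothetical candidate length:

        #include <linux/bpf.h>
        #include <bpf/bpf_helpers.h>

        SEC("xdp")
        int check_candidate_len(struct xdp_md *ctx)
        {
                __u32 mtu_len = 1500;   /* hypothetical candidate L3 length */

                /* ifindex 0 means "the device this packet arrived on". */
                if (bpf_check_mtu(ctx, 0, &mtu_len, 0, 0) != 0)
                        return XDP_DROP;        /* candidate length exceeds the MTU */
                return XDP_PASS;
        }

        char LICENSE[] SEC("license") = "GPL";
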
@@@ -9597,40 -9663,22 +9605,40 @@@ static u32 sock_ops_convert_ctx_access(
        return insn - insn_buf;
  }
  
 +/* data_end = skb->data + skb_headlen() */
 +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si,
 +                                                  struct bpf_insn *insn)
 +{
 +      /* si->dst_reg = skb->data */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data),
 +                            si->dst_reg, si->src_reg,
 +                            offsetof(struct sk_buff, data));
 +      /* AX = skb->len */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len),
 +                            BPF_REG_AX, si->src_reg,
 +                            offsetof(struct sk_buff, len));
 +      /* si->dst_reg = skb->data + skb->len */
 +      *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX);
 +      /* AX = skb->data_len */
 +      *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len),
 +                            BPF_REG_AX, si->src_reg,
 +                            offsetof(struct sk_buff, data_len));
 +      /* si->dst_reg = skb->data + skb->len - skb->data_len */
 +      *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX);
 +
 +      return insn;
 +}
 +
  static u32 sk_skb_convert_ctx_access(enum bpf_access_type type,
                                     const struct bpf_insn *si,
                                     struct bpf_insn *insn_buf,
                                     struct bpf_prog *prog, u32 *target_size)
  {
        struct bpf_insn *insn = insn_buf;
 -      int off;
  
        switch (si->off) {
        case offsetof(struct __sk_buff, data_end):
 -              off  = si->off;
 -              off -= offsetof(struct __sk_buff, data_end);
 -              off += offsetof(struct sk_buff, cb);
 -              off += offsetof(struct tcp_skb_cb, bpf.data_end);
 -              *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg,
 -                                    si->src_reg, off);
 +              insn = bpf_convert_data_end_access(si, insn);
                break;
        default:
                return bpf_convert_ctx_access(type, si, insn_buf, prog,
@@@ -10409,7 -10457,6 +10417,7 @@@ static u32 sk_lookup_convert_ctx_access
  }
  
  const struct bpf_prog_ops sk_lookup_prog_ops = {
 +      .test_run = bpf_prog_test_run_sk_lookup,
  };
  
  const struct bpf_verifier_ops sk_lookup_verifier_ops = {
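
For sk_skb programs, data_end now points at the end of the skb's linear area instead of a value cached in the cb[] block, which is why the bpf_compute_data_end_sk_skb() calls disappear throughout this file. The instruction sequence built by bpf_convert_data_end_access() is equivalent to this C (a sketch, not emitted code):

        static void *sk_skb_data_end_sketch(const struct sk_buff *skb)
        {
                /* skb->len - skb->data_len == skb_headlen(skb) */
                return skb->data + skb->len - skb->data_len;
        }
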
diff --combined net/core/flow_dissector.c
@@@ -114,7 -114,7 +114,7 @@@ int flow_dissector_bpf_prog_attach_chec
   * is the protocol port offset returned from proto_ports_offset
   */
  __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
 -                          void *data, int hlen)
 +                          const void *data, int hlen)
  {
        int poff = proto_ports_offset(ip_proto);
  
@@@ -161,7 -161,7 +161,7 @@@ static bool icmp_has_id(u8 type
   */
  void skb_flow_get_icmp_tci(const struct sk_buff *skb,
                           struct flow_dissector_key_icmp *key_icmp,
 -                         void *data, int thoff, int hlen)
 +                         const void *data, int thoff, int hlen)
  {
        struct icmphdr *ih, _ih;
  
         * avoid confusion with packets without such field
         */
        if (icmp_has_id(ih->type))
-               key_icmp->id = ih->un.echo.id ? : 1;
+               key_icmp->id = ih->un.echo.id ? ntohs(ih->un.echo.id) : 1;
        else
                key_icmp->id = 0;
  }
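
ih->un.echo.id is big-endian on the wire, so the fixed line converts it before storing it into the host-order dissector key; a genuine id of zero is reported as 1 so it is not confused with types that carry no id at all. Written out, the fragment is equivalent to:

        u16 id = ntohs(ih->un.echo.id);         /* wire -> host order */

        key_icmp->id = id ? id : 1;             /* keep 0 reserved for "no id field" */
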
@@@ -187,8 -187,8 +187,8 @@@ EXPORT_SYMBOL(skb_flow_get_icmp_tci)
   */
  static void __skb_flow_dissect_icmp(const struct sk_buff *skb,
                                    struct flow_dissector *flow_dissector,
 -                                  void *target_container,
 -                                  void *data, int thoff, int hlen)
 +                                  void *target_container, const void *data,
 +                                  int thoff, int hlen)
  {
        struct flow_dissector_key_icmp *key_icmp;
  
@@@ -409,8 -409,8 +409,8 @@@ EXPORT_SYMBOL(skb_flow_dissect_hash)
  static enum flow_dissect_ret
  __skb_flow_dissect_mpls(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, int nhoff, int hlen,
 -                      int lse_index, bool *entropy_label)
 +                      void *target_container, const void *data, int nhoff,
 +                      int hlen, int lse_index, bool *entropy_label)
  {
        struct mpls_label *hdr, _hdr;
        u32 entry, label, bos;
  static enum flow_dissect_ret
  __skb_flow_dissect_arp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data, int nhoff, int hlen)
 +                     void *target_container, const void *data,
 +                     int nhoff, int hlen)
  {
        struct flow_dissector_key_arp *key_arp;
        struct {
@@@ -524,7 -523,7 +524,7 @@@ static enum flow_dissect_re
  __skb_flow_dissect_gre(const struct sk_buff *skb,
                       struct flow_dissector_key_control *key_control,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data,
 +                     void *target_container, const void *data,
                       __be16 *p_proto, int *p_nhoff, int *p_hlen,
                       unsigned int flags)
  {
  static enum flow_dissect_ret
  __skb_flow_dissect_batadv(const struct sk_buff *skb,
                          struct flow_dissector_key_control *key_control,
 -                        void *data, __be16 *p_proto, int *p_nhoff, int hlen,
 -                        unsigned int flags)
 +                        const void *data, __be16 *p_proto, int *p_nhoff,
 +                        int hlen, unsigned int flags)
  {
        struct {
                struct batadv_unicast_packet batadv_unicast;
  static void
  __skb_flow_dissect_tcp(const struct sk_buff *skb,
                       struct flow_dissector *flow_dissector,
 -                     void *target_container, void *data, int thoff, int hlen)
 +                     void *target_container, const void *data,
 +                     int thoff, int hlen)
  {
        struct flow_dissector_key_tcp *key_tcp;
        struct tcphdr *th, _th;
  static void
  __skb_flow_dissect_ports(const struct sk_buff *skb,
                         struct flow_dissector *flow_dissector,
 -                       void *target_container, void *data, int nhoff,
 -                       u8 ip_proto, int hlen)
 +                       void *target_container, const void *data,
 +                       int nhoff, u8 ip_proto, int hlen)
  {
        enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX;
        struct flow_dissector_key_ports *key_ports;
  static void
  __skb_flow_dissect_ipv4(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, const struct iphdr *iph)
 +                      void *target_container, const void *data,
 +                      const struct iphdr *iph)
  {
        struct flow_dissector_key_ip *key_ip;
  
  static void
  __skb_flow_dissect_ipv6(const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container, void *data, const struct ipv6hdr *iph)
 +                      void *target_container, const void *data,
 +                      const struct ipv6hdr *iph)
  {
        struct flow_dissector_key_ip *key_ip;
  
@@@ -912,8 -908,9 +912,8 @@@ bool bpf_flow_dissect(struct bpf_prog *
  bool __skb_flow_dissect(const struct net *net,
                        const struct sk_buff *skb,
                        struct flow_dissector *flow_dissector,
 -                      void *target_container,
 -                      void *data, __be16 proto, int nhoff, int hlen,
 -                      unsigned int flags)
 +                      void *target_container, const void *data,
 +                      __be16 proto, int nhoff, int hlen, unsigned int flags)
  {
        struct flow_dissector_key_control *key_control;
        struct flow_dissector_key_basic *key_basic;
@@@ -1645,7 -1642,7 +1645,7 @@@ __u32 skb_get_hash_perturb(const struc
  }
  EXPORT_SYMBOL(skb_get_hash_perturb);
  
 -u32 __skb_get_poff(const struct sk_buff *skb, void *data,
 +u32 __skb_get_poff(const struct sk_buff *skb, const void *data,
                   const struct flow_keys_basic *keys, int hlen)
  {
        u32 poff = keys->control.thoff;
diff --combined net/ipv4/route.c
@@@ -21,7 -21,7 +21,7 @@@
   *            Alan Cox        :       Added BSD route gw semantics
   *            Alan Cox        :       Super /proc >4K
   *            Alan Cox        :       MTU in route table
 - *            Alan Cox        :       MSS actually. Also added the window
 + *            Alan Cox        :       MSS actually. Also added the window
   *                                    clamper.
   *            Sam Lantinga    :       Fixed route matching in rt_del()
   *            Alan Cox        :       Routing cache support.
@@@ -41,7 -41,7 +41,7 @@@
   *            Olaf Erb        :       irtt wasn't being copied right.
   *            Bjorn Ekwall    :       Kerneld route support.
   *            Alan Cox        :       Multicast fixed (I hope)
 - *            Pavel Krauz     :       Limited broadcast fixed
 + *            Pavel Krauz     :       Limited broadcast fixed
   *            Mike McLagan    :       Routing by source
   *    Alexey Kuznetsov        :       End of old history. Split to fib.c and
   *                                    route.c and rewritten from scratch.
@@@ -54,8 -54,8 +54,8 @@@
   *    Robert Olsson           :       Added rt_cache statistics
   *    Arnaldo C. Melo         :       Convert proc stuff to seq_file
   *    Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 - *    Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 - *    Ilia Sotnikov           :       Removed TOS from hash calculations
 + *    Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 + *    Ilia Sotnikov           :       Removed TOS from hash calculations
   */
  
  #define pr_fmt(fmt) "IPv4: " fmt
@@@ -66,7 -66,6 +66,7 @@@
  #include <linux/types.h>
  #include <linux/kernel.h>
  #include <linux/mm.h>
 +#include <linux/memblock.h>
  #include <linux/string.h>
  #include <linux/socket.h>
  #include <linux/sockios.h>
@@@ -235,6 -234,19 +235,6 @@@ static const struct seq_operations rt_c
        .show   = rt_cache_seq_show,
  };
  
 -static int rt_cache_seq_open(struct inode *inode, struct file *file)
 -{
 -      return seq_open(file, &rt_cache_seq_ops);
 -}
 -
 -static const struct proc_ops rt_cache_proc_ops = {
 -      .proc_open      = rt_cache_seq_open,
 -      .proc_read      = seq_read,
 -      .proc_lseek     = seq_lseek,
 -      .proc_release   = seq_release,
 -};
 -
 -
  static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
  {
        int cpu;
@@@ -312,6 -324,19 +312,6 @@@ static const struct seq_operations rt_c
        .show   = rt_cpu_seq_show,
  };
  
 -
 -static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 -{
 -      return seq_open(file, &rt_cpu_seq_ops);
 -}
 -
 -static const struct proc_ops rt_cpu_proc_ops = {
 -      .proc_open      = rt_cpu_seq_open,
 -      .proc_read      = seq_read,
 -      .proc_lseek     = seq_lseek,
 -      .proc_release   = seq_release,
 -};
 -
  #ifdef CONFIG_IP_ROUTE_CLASSID
  static int rt_acct_proc_show(struct seq_file *m, void *v)
  {
@@@ -342,13 -367,13 +342,13 @@@ static int __net_init ip_rt_do_proc_ini
  {
        struct proc_dir_entry *pde;
  
 -      pde = proc_create("rt_cache", 0444, net->proc_net,
 -                        &rt_cache_proc_ops);
 +      pde = proc_create_seq("rt_cache", 0444, net->proc_net,
 +                            &rt_cache_seq_ops);
        if (!pde)
                goto err1;
  
 -      pde = proc_create("rt_cache", 0444,
 -                        net->proc_net_stat, &rt_cpu_proc_ops);
 +      pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
 +                            &rt_cpu_seq_ops);
        if (!pde)
                goto err2;
  
@@@ -453,10 -478,8 +453,10 @@@ static void ipv4_confirm_neigh(const st
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
  }
  
 -#define IP_IDENTS_SZ 2048u
 -
 +/* Hash tables of size 2048..262144 depending on RAM size.
 + * Each bucket uses 8 bytes.
 + */
 +static u32 ip_idents_mask __read_mostly;
  static atomic_t *ip_idents __read_mostly;
  static u32 *ip_tstamps __read_mostly;
  
   */
  u32 ip_idents_reserve(u32 hash, int segs)
  {
 -      u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 -      atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 -      u32 old = READ_ONCE(*p_tstamp);
 -      u32 now = (u32)jiffies;
 +      u32 bucket, old, now = (u32)jiffies;
 +      atomic_t *p_id;
 +      u32 *p_tstamp;
        u32 delta = 0;
  
 +      bucket = hash & ip_idents_mask;
 +      p_tstamp = ip_tstamps + bucket;
 +      p_id = ip_idents + bucket;
 +      old = READ_ONCE(*p_tstamp);
 +
        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);
  
@@@ -703,7 -722,6 +703,7 @@@ static void update_or_create_fnhe(struc
  
                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
 +
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
@@@ -1240,12 -1258,12 +1240,12 @@@ static int ip_rt_bug(struct net *net, s
  }
  
  /*
 -   We do not cache source address of outgoing interface,
 -   because it is used only by IP RR, TS and SRR options,
 -   so that it out of fast path.
 -
 -   BTW remember: "addr" is allowed to be not aligned
 -   in IP options!
 + * We do not cache source address of outgoing interface,
 + * because it is used only by IP RR, TS and SRR options,
 + * so that it out of fast path.
 + *
 + * BTW remember: "addr" is allowed to be not aligned
 + * in IP options!
   */
  
  void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
@@@ -2090,7 -2108,7 +2090,7 @@@ static int ip_route_input_slow(struct s
                goto out;
  
        /* Check for the most weird martians, which can be not detected
 -         by fib_lookup.
 +       * by fib_lookup.
         */
  
        tun_info = skb_tunnel_info(skb);
@@@ -2228,7 -2246,7 +2228,7 @@@ local_input
        if (res->type == RTN_UNREACHABLE) {
                rth->dst.input= ip_error;
                rth->dst.error= -err;
 -              rth->rt_flags   &= ~RTCF_LOCAL;
 +              rth->rt_flags   &= ~RTCF_LOCAL;
        }
  
        if (do_cache) {
@@@ -2299,15 -2317,15 +2299,15 @@@ int ip_route_input_rcu(struct sk_buff *
                       u8 tos, struct net_device *dev, struct fib_result *res)
  {
        /* Multicast recognition logic is moved from route cache to here.
 -         The problem was that too many Ethernet cards have broken/missing
 -         hardware multicast filters :-( As result the host on multicasting
 -         network acquires a lot of useless route cache entries, sort of
 -         SDR messages from all the world. Now we try to get rid of them.
 -         Really, provided software IP multicast filter is organized
 -         reasonably (at least, hashed), it does not result in a slowdown
 -         comparing with route cache reject entries.
 -         Note, that multicast routers are not affected, because
 -         route cache entry is created eventually.
 +       * The problem was that too many Ethernet cards have broken/missing
 +       * hardware multicast filters :-( As result the host on multicasting
 +       * network acquires a lot of useless route cache entries, sort of
 +       * SDR messages from all the world. Now we try to get rid of them.
 +       * Really, provided software IP multicast filter is organized
 +       * reasonably (at least, hashed), it does not result in a slowdown
 +       * comparing with route cache reject entries.
 +       * Note, that multicast routers are not affected, because
 +       * route cache entry is created eventually.
         */
        if (ipv4_is_multicast(daddr)) {
                struct in_device *in_dev = __in_dev_get_rcu(dev);
@@@ -2519,11 -2537,11 +2519,11 @@@ struct rtable *ip_route_output_key_hash
                rth = ERR_PTR(-ENETUNREACH);
  
                /* I removed check for oif == dev_out->oif here.
 -                 It was wrong for two reasons:
 -                 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
 -                    is assigned to multiple interfaces.
 -                 2. Moreover, we are allowed to send packets with saddr
 -                    of another iface. --ANK
 +               * It was wrong for two reasons:
 +               * 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
 +               *    is assigned to multiple interfaces.
 +               * 2. Moreover, we are allowed to send packets with saddr
 +               *    of another iface. --ANK
                 */
  
                if (fl4->flowi4_oif == 0 &&
                                goto out;
  
                        /* Special hack: user can direct multicasts
 -                         and limited broadcast via necessary interface
 -                         without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 -                         This hack is not just for fun, it allows
 -                         vic,vat and friends to work.
 -                         They bind socket to loopback, set ttl to zero
 -                         and expect that it will work.
 -                         From the viewpoint of routing cache they are broken,
 -                         because we are not allowed to build multicast path
 -                         with loopback source addr (look, routing cache
 -                         cannot know, that ttl is zero, so that packet
 -                         will not leave this host and route is valid).
 -                         Luckily, this hack is good workaround.
 +                       * and limited broadcast via necessary interface
 +                       * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
 +                       * This hack is not just for fun, it allows
 +                       * vic,vat and friends to work.
 +                       * They bind socket to loopback, set ttl to zero
 +                       * and expect that it will work.
 +                       * From the viewpoint of routing cache they are broken,
 +                       * because we are not allowed to build multicast path
 +                       * with loopback source addr (look, routing cache
 +                       * cannot know, that ttl is zero, so that packet
 +                       * will not leave this host and route is valid).
 +                       * Luckily, this hack is good workaround.
                         */
  
                        fl4->flowi4_oif = dev_out->ifindex;
                    (ipv4_is_multicast(fl4->daddr) ||
                    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
                        /* Apparently, routing tables are wrong. Assume,
 -                         that the destination is on link.
 -
 -                         WHY? DW.
 -                         Because we are allowed to send to iface
 -                         even if it has NO routes and NO assigned
 -                         addresses. When oif is specified, routing
 -                         tables are looked up with only one purpose:
 -                         to catch if destination is gatewayed, rather than
 -                         direct. Moreover, if MSG_DONTROUTE is set,
 -                         we send packet, ignoring both routing tables
 -                         and ifaddr state. --ANK
 -
 -
 -                         We could make it even if oif is unknown,
 -                         likely IPv6, but we do not.
 +                       * that the destination is on link.
 +                       *
 +                       * WHY? DW.
 +                       * Because we are allowed to send to iface
 +                       * even if it has NO routes and NO assigned
 +                       * addresses. When oif is specified, routing
 +                       * tables are looked up with only one purpose:
 +                       * to catch if destination is gatewayed, rather than
 +                       * direct. Moreover, if MSG_DONTROUTE is set,
 +                       * we send packet, ignoring both routing tables
 +                       * and ifaddr state. --ANK
 +                       *
 +                       *
 +                       * We could make it even if oif is unknown,
 +                       * likely IPv6, but we do not.
                         */
  
                        if (fl4->saddr == 0)
        return rth;
  }
  
- static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
- {
-       return NULL;
- }
- static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
- {
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-       return mtu ? : dst->dev->mtu;
- }
- static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                         struct sk_buff *skb, u32 mtu,
-                                         bool confirm_neigh)
- {
- }
- static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                      struct sk_buff *skb)
- {
- }
- static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
-                                         unsigned long old)
- {
-       return NULL;
- }
  static struct dst_ops ipv4_dst_blackhole_ops = {
-       .family                 =       AF_INET,
-       .check                  =       ipv4_blackhole_dst_check,
-       .mtu                    =       ipv4_blackhole_mtu,
-       .default_advmss         =       ipv4_default_advmss,
-       .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
-       .redirect               =       ipv4_rt_blackhole_redirect,
-       .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
-       .neigh_lookup           =       ipv4_neigh_lookup,
+       .family                 = AF_INET,
+       .default_advmss         = ipv4_default_advmss,
+       .neigh_lookup           = ipv4_neigh_lookup,
+       .check                  = dst_blackhole_check,
+       .cow_metrics            = dst_blackhole_cow_metrics,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
  };
  
  struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
@@@ -3564,25 -3553,18 +3535,25 @@@ struct ip_rt_acct __percpu *ip_rt_acct 
  
  int __init ip_rt_init(void)
  {
 +      void *idents_hash;
        int cpu;
  
 -      ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
 -                                GFP_KERNEL);
 -      if (!ip_idents)
 -              panic("IP: failed to allocate ip_idents\n");
 +      /* For modern hosts, this will use 2 MB of memory */
 +      idents_hash = alloc_large_system_hash("IP idents",
 +                                            sizeof(*ip_idents) + sizeof(*ip_tstamps),
 +                                            0,
 +                                            16, /* one bucket per 64 KB */
 +                                            HASH_ZERO,
 +                                            NULL,
 +                                            &ip_idents_mask,
 +                                            2048,
 +                                            256*1024);
 +
 +      ip_idents = idents_hash;
  
 -      prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
 +      prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
  
 -      ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
 -      if (!ip_tstamps)
 -              panic("IP: failed to allocate ip_tstamps\n");
 +      ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
  
        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
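
ip_rt_init() above sizes the ident/timestamp table from available memory via alloc_large_system_hash() and keeps both arrays in a single allocation; ip_idents_mask (table size minus one, a power of two) replaces the old modulo by IP_IDENTS_SZ. A sketch of the resulting layout and bucket lookup (names illustrative):

        /* idents_hash:  [ atomic_t ip_idents[mask + 1] | u32 ip_tstamps[mask + 1] ] */
        static inline atomic_t *ip_ident_bucket_sketch(void *idents_hash, u32 hash, u32 mask)
        {
                atomic_t *ids = idents_hash;

                return ids + (hash & mask);
        }
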
diff --combined net/ipv6/route.c
@@@ -260,34 -260,16 +260,16 @@@ static struct dst_ops ip6_dst_ops_templ
        .confirm_neigh          =       ip6_confirm_neigh,
  };
  
- static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
- {
-       unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
-       return mtu ? : dst->dev->mtu;
- }
- static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
-                                        struct sk_buff *skb, u32 mtu,
-                                        bool confirm_neigh)
- {
- }
- static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
-                                     struct sk_buff *skb)
- {
- }
  static struct dst_ops ip6_dst_blackhole_ops = {
-       .family                 =       AF_INET6,
-       .destroy                =       ip6_dst_destroy,
-       .check                  =       ip6_dst_check,
-       .mtu                    =       ip6_blackhole_mtu,
-       .default_advmss         =       ip6_default_advmss,
-       .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
-       .redirect               =       ip6_rt_blackhole_redirect,
-       .cow_metrics            =       dst_cow_metrics_generic,
-       .neigh_lookup           =       ip6_dst_neigh_lookup,
+       .family                 = AF_INET6,
+       .default_advmss         = ip6_default_advmss,
+       .neigh_lookup           = ip6_dst_neigh_lookup,
+       .check                  = ip6_dst_check,
+       .destroy                = ip6_dst_destroy,
+       .cow_metrics            = dst_cow_metrics_generic,
+       .update_pmtu            = dst_blackhole_update_pmtu,
+       .redirect               = dst_blackhole_redirect,
+       .mtu                    = dst_blackhole_mtu,
  };
  
  static const u32 ip6_template_metrics[RTAX_MAX] = {
@@@ -2378,7 -2360,7 +2360,7 @@@ u32 rt6_multipath_hash(const struct ne
  
                        memset(&hash_keys, 0, sizeof(hash_keys));
  
 -                        if (!flkeys) {
 +                      if (!flkeys) {
                                skb_flow_dissect_flow_keys(skb, &keys, flag);
                                flkeys = &keys;
                        }
@@@ -2518,20 -2500,20 +2500,20 @@@ struct dst_entry *ip6_route_output_flag
                                         struct flowi6 *fl6,
                                         int flags)
  {
 -        struct dst_entry *dst;
 -        struct rt6_info *rt6;
 +      struct dst_entry *dst;
 +      struct rt6_info *rt6;
  
 -        rcu_read_lock();
 -        dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 -        rt6 = (struct rt6_info *)dst;
 -        /* For dst cached in uncached_list, refcnt is already taken. */
 -        if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
 -                dst = &net->ipv6.ip6_null_entry->dst;
 -                dst_hold(dst);
 -        }
 -        rcu_read_unlock();
 +      rcu_read_lock();
 +      dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
 +      rt6 = (struct rt6_info *)dst;
 +      /* For dst cached in uncached_list, refcnt is already taken. */
 +      if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
 +              dst = &net->ipv6.ip6_null_entry->dst;
 +              dst_hold(dst);
 +      }
 +      rcu_read_unlock();
  
 -        return dst;
 +      return dst;
  }
  EXPORT_SYMBOL_GPL(ip6_route_output_flags);
  
diff --combined net/mptcp/options.c
@@@ -26,7 -26,6 +26,7 @@@ static void mptcp_parse_option(const st
        int expected_opsize;
        u8 version;
        u8 flags;
 +      u8 i;
  
        switch (subtype) {
        case MPTCPOPT_MP_CAPABLE:
                break;
  
        case MPTCPOPT_RM_ADDR:
 -              if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE)
 +              if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
 +                  opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
                        break;
  
                ptr++;
  
                mp_opt->rm_addr = 1;
 -              mp_opt->rm_id = *ptr++;
 -              pr_debug("RM_ADDR: id=%d", mp_opt->rm_id);
 +              mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
 +              for (i = 0; i < mp_opt->rm_list.nr; i++)
 +                      mp_opt->rm_list.ids[i] = *ptr++;
 +              pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
                break;
  
        case MPTCPOPT_MP_PRIO:
@@@ -571,15 -567,15 +571,15 @@@ static bool mptcp_established_options_d
  }
  
  static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                 struct in_addr *addr)
+                                 struct in_addr *addr, u16 port)
  {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[7];
  
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s_addr, 4);
-       msg[5] = 0;
-       msg[6] = 0;
+       msg[5] = port >> 8;
+       msg[6] = port & 0xFF;
  
        mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
  
  
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
  static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
-                                  struct in6_addr *addr)
+                                  struct in6_addr *addr, u16 port)
  {
        u8 hmac[SHA256_DIGEST_SIZE];
        u8 msg[19];
  
        msg[0] = addr_id;
        memcpy(&msg[1], &addr->s6_addr, 16);
-       msg[17] = 0;
-       msg[18] = 0;
+       msg[17] = port >> 8;
+       msg[18] = port & 0xFF;
  
        mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
  
@@@ -650,7 -646,8 +650,8 @@@ static bool mptcp_established_options_a
                        opts->ahmac = add_addr_generate_hmac(msk->local_key,
                                                             msk->remote_key,
                                                             opts->addr_id,
-                                                            &opts->addr);
+                                                            &opts->addr,
+                                                            opts->port);
                }
        }
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
                        opts->ahmac = add_addr6_generate_hmac(msk->local_key,
                                                              msk->remote_key,
                                                              opts->addr_id,
-                                                             &opts->addr6);
+                                                             &opts->addr6,
+                                                             opts->port);
                }
        }
  #endif
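
The ADD_ADDR HMAC now covers the advertised port as well: the two bytes appended after the address are the port in network byte order, matching msg[5]/msg[6] (IPv4) and msg[17]/msg[18] (IPv6) above. The packing as a standalone sketch:

        static void hmac_msg_put_port(u8 *msg, u16 port)
        {
                msg[0] = port >> 8;     /* high byte first, i.e. network order */
                msg[1] = port & 0xff;
        }
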
@@@ -678,25 -676,20 +680,25 @@@ static bool mptcp_established_options_r
  {
        struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
        struct mptcp_sock *msk = mptcp_sk(subflow->conn);
 -      u8 rm_id;
 +      struct mptcp_rm_list rm_list;
 +      int i, len;
  
        if (!mptcp_pm_should_rm_signal(msk) ||
 -          !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_id)))
 +          !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
                return false;
  
 -      if (remaining < TCPOLEN_MPTCP_RM_ADDR_BASE)
 +      len = mptcp_rm_addr_len(&rm_list);
 +      if (len < 0)
 +              return false;
 +      if (remaining < len)
                return false;
  
 -      *size = TCPOLEN_MPTCP_RM_ADDR_BASE;
 +      *size = len;
        opts->suboptions |= OPTION_MPTCP_RM_ADDR;
 -      opts->rm_id = rm_id;
 +      opts->rm_list = rm_list;
  
 -      pr_debug("rm_id=%d", opts->rm_id);
 +      for (i = 0; i < opts->rm_list.nr; i++)
 +              pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
  
        return true;
  }
@@@ -971,12 -964,14 +973,14 @@@ static bool add_addr_hmac_valid(struct 
        if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
                hmac = add_addr_generate_hmac(msk->remote_key,
                                              msk->local_key,
-                                             mp_opt->addr_id, &mp_opt->addr);
+                                             mp_opt->addr_id, &mp_opt->addr,
+                                             mp_opt->port);
  #if IS_ENABLED(CONFIG_MPTCP_IPV6)
        else
                hmac = add_addr6_generate_hmac(msk->remote_key,
                                               msk->local_key,
-                                              mp_opt->addr_id, &mp_opt->addr6);
+                                              mp_opt->addr_id, &mp_opt->addr6,
+                                              mp_opt->port);
  #endif
  
        pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
@@@ -1047,7 -1042,7 +1051,7 @@@ void mptcp_incoming_options(struct soc
        }
  
        if (mp_opt.rm_addr) {
 -              mptcp_pm_rm_addr_received(msk, mp_opt.rm_id);
 +              mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
                mp_opt.rm_addr = 0;
        }
  
@@@ -1226,23 -1221,9 +1230,23 @@@ mp_capable_done
        }
  
        if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
 +              u8 i = 1;
 +
                *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
 -                                    TCPOLEN_MPTCP_RM_ADDR_BASE,
 -                                    0, opts->rm_id);
 +                                    TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
 +                                    0, opts->rm_list.ids[0]);
 +
 +              while (i < opts->rm_list.nr) {
 +                      u8 id1, id2, id3, id4;
 +
 +                      id1 = opts->rm_list.ids[i];
 +                      id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
 +                      id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
 +                      id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
 +                      put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
 +                      ptr += 1;
 +                      i += 4;
 +              }
        }
  
        if (OPTION_MPTCP_PRIO & opts->suboptions) {
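
RM_ADDR can now remove a list of address ids in one option: the first id rides in the option's leading 32-bit word and the remaining ids are packed four per word, padded with TCPOPT_NOP. A worked wire layout for a hypothetical rm_list of three ids {5, 6, 7}, following the writer above:

        /* word 0:  kind | len = TCPOLEN_MPTCP_RM_ADDR_BASE + 3 | subtype/flags | id 5
         * word 1:  id 6 | id 7 | TCPOPT_NOP | TCPOPT_NOP
         *
         * The trailing NOPs fall outside the option length and are parsed as
         * ordinary one-byte TCP NOP options, keeping the header 32-bit aligned.
         */
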
diff --combined net/netfilter/nf_flow_table_core.c
@@@ -79,8 -79,11 +79,8 @@@ static int flow_offload_fill_route(stru
                                   enum flow_offload_tuple_dir dir)
  {
        struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
 -      struct dst_entry *other_dst = route->tuple[!dir].dst;
        struct dst_entry *dst = route->tuple[dir].dst;
 -
 -      if (!dst_hold_safe(route->tuple[dir].dst))
 -              return -1;
 +      int i, j = 0;
  
        switch (flow_tuple->l3proto) {
        case NFPROTO_IPV4:
                break;
        }
  
 -      flow_tuple->iifidx = other_dst->dev->ifindex;
 -      flow_tuple->dst_cache = dst;
 +      flow_tuple->iifidx = route->tuple[dir].in.ifindex;
 +      for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
 +              flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
 +              flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
 +              if (route->tuple[dir].in.ingress_vlans & BIT(i))
 +                      flow_tuple->in_vlan_ingress |= BIT(j);
 +              j++;
 +      }
 +      flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
 +
 +      switch (route->tuple[dir].xmit_type) {
 +      case FLOW_OFFLOAD_XMIT_DIRECT:
 +              memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
 +                     ETH_ALEN);
 +              memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
 +                     ETH_ALEN);
 +              flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
 +              flow_tuple->out.hw_ifidx = route->tuple[dir].out.hw_ifindex;
 +              break;
 +      case FLOW_OFFLOAD_XMIT_XFRM:
 +      case FLOW_OFFLOAD_XMIT_NEIGH:
 +              if (!dst_hold_safe(route->tuple[dir].dst))
 +                      return -1;
 +
 +              flow_tuple->dst_cache = dst;
 +              break;
 +      }
 +      flow_tuple->xmit_type = route->tuple[dir].xmit_type;
  
        return 0;
  }
  
 +static void nft_flow_dst_release(struct flow_offload *flow,
 +                               enum flow_offload_tuple_dir dir)
 +{
 +      if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
 +          flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
 +              dst_release(flow->tuplehash[dir].tuple.dst_cache);
 +}
 +
  int flow_offload_route_init(struct flow_offload *flow,
                            const struct nf_flow_route *route)
  {
        return 0;
  
  err_route_reply:
 -      dst_release(route->tuple[FLOW_OFFLOAD_DIR_ORIGINAL].dst);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
  
        return err;
  }
@@@ -200,8 -169,8 +200,8 @@@ static void flow_offload_fixup_ct(struc
  
  static void flow_offload_route_release(struct flow_offload *flow)
  {
 -      dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_cache);
 -      dst_release(flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_cache);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
 +      nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
  }
  
  void flow_offload_free(struct flow_offload *flow)
@@@ -420,20 -389,29 +420,20 @@@ static void nf_flow_offload_work_gc(str
        queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
  }
  
 -
 -static int nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 -                              __be16 port, __be16 new_port)
 +static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
 +                               __be16 port, __be16 new_port)
  {
        struct tcphdr *tcph;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*tcph)))
 -              return -1;
 -
        tcph = (void *)(skb_network_header(skb) + thoff);
        inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
 -
 -      return 0;
  }
  
 -static int nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
 -                              __be16 port, __be16 new_port)
 +static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
 +                               __be16 port, __be16 new_port)
  {
        struct udphdr *udph;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*udph)))
 -              return -1;
 -
        udph = (void *)(skb_network_header(skb) + thoff);
        if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
                inet_proto_csum_replace2(&udph->check, skb, port,
                if (!udph->check)
                        udph->check = CSUM_MANGLED_0;
        }
 -
 -      return 0;
  }
  
 -static int nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
 -                          u8 protocol, __be16 port, __be16 new_port)
 +static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
 +                           u8 protocol, __be16 port, __be16 new_port)
  {
        switch (protocol) {
        case IPPROTO_TCP:
 -              if (nf_flow_nat_port_tcp(skb, thoff, port, new_port) < 0)
 -                      return NF_DROP;
 +              nf_flow_nat_port_tcp(skb, thoff, port, new_port);
                break;
        case IPPROTO_UDP:
 -              if (nf_flow_nat_port_udp(skb, thoff, port, new_port) < 0)
 -                      return NF_DROP;
 +              nf_flow_nat_port_udp(skb, thoff, port, new_port);
                break;
        }
 -
 -      return 0;
  }
  
 -int nf_flow_snat_port(const struct flow_offload *flow,
 -                    struct sk_buff *skb, unsigned int thoff,
 -                    u8 protocol, enum flow_offload_tuple_dir dir)
 +void nf_flow_snat_port(const struct flow_offload *flow,
 +                     struct sk_buff *skb, unsigned int thoff,
 +                     u8 protocol, enum flow_offload_tuple_dir dir)
  {
        struct flow_ports *hdr;
        __be16 port, new_port;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
 -              return -1;
 -
        hdr = (void *)(skb_network_header(skb) + thoff);
  
        switch (dir) {
                new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
                hdr->dest = new_port;
                break;
 -      default:
 -              return -1;
        }
  
 -      return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
 +      nf_flow_nat_port(skb, thoff, protocol, port, new_port);
  }
  EXPORT_SYMBOL_GPL(nf_flow_snat_port);
  
 -int nf_flow_dnat_port(const struct flow_offload *flow,
 -                    struct sk_buff *skb, unsigned int thoff,
 -                    u8 protocol, enum flow_offload_tuple_dir dir)
 +void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
 +                     unsigned int thoff, u8 protocol,
 +                     enum flow_offload_tuple_dir dir)
  {
        struct flow_ports *hdr;
        __be16 port, new_port;
  
 -      if (skb_try_make_writable(skb, thoff + sizeof(*hdr)))
 -              return -1;
 -
        hdr = (void *)(skb_network_header(skb) + thoff);
  
        switch (dir) {
                new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
                hdr->source = new_port;
                break;
 -      default:
 -              return -1;
        }
  
 -      return nf_flow_nat_port(skb, thoff, protocol, port, new_port);
 +      nf_flow_nat_port(skb, thoff, protocol, port, new_port);
  }
  EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
  
@@@ -512,7 -506,7 +512,7 @@@ int nf_flow_table_init(struct nf_flowta
  {
        int err;
  
-       INIT_DEFERRABLE_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
+       INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
        flow_block_init(&flowtable->flow_block);
        init_rwsem(&flowtable->flow_block_lock);
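
flow_offload_fill_route() above copies the input encap entries in reverse
order and remaps the per-index ingress-VLAN bitmask to the new positions.
A small stand-alone sketch of that reverse copy, assuming plain user-space
types rather than the kernel structs:

#include <stdint.h>
#include <stdio.h>

#define BIT(n)	(1U << (n))

struct encap { uint16_t id; uint16_t proto; };

/* Copy in[num-1..0] to out[0..num-1] and move each set bit of in_bits to the
 * position its entry ends up at, returning the remapped mask. */
static uint32_t reverse_encaps(const struct encap *in, int num,
			       uint32_t in_bits, struct encap *out)
{
	uint32_t out_bits = 0;
	int i, j = 0;

	for (i = num - 1; i >= 0; i--) {
		out[j] = in[i];
		if (in_bits & BIT(i))
			out_bits |= BIT(j);
		j++;
	}
	return out_bits;
}

int main(void)
{
	struct encap in[2] = { { 100, 0x8100 }, { 200, 0x88a8 } };
	struct encap out[2];
	uint32_t bits = reverse_encaps(in, 2, BIT(0), out);

	printf("out[0].id=%u bits=%#x\n", (unsigned)out[0].id, (unsigned)bits);
	/* out[0].id=200 bits=0x2: the entry that was first is now last */
	return 0;
}
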
  
diff --combined net/netfilter/nf_tables_api.c
@@@ -900,12 -900,6 +900,12 @@@ static void nf_tables_table_disable(str
        nft_table_disable(net, table, 0);
  }
  
 +enum {
 +      NFT_TABLE_STATE_UNCHANGED       = 0,
 +      NFT_TABLE_STATE_DORMANT,
 +      NFT_TABLE_STATE_WAKEUP
 +};
 +
  static int nf_tables_updtable(struct nft_ctx *ctx)
  {
        struct nft_trans *trans;
  
        if ((flags & NFT_TABLE_F_DORMANT) &&
            !(ctx->table->flags & NFT_TABLE_F_DORMANT)) {
 -              nft_trans_table_enable(trans) = false;
 +              nft_trans_table_state(trans) = NFT_TABLE_STATE_DORMANT;
        } else if (!(flags & NFT_TABLE_F_DORMANT) &&
                   ctx->table->flags & NFT_TABLE_F_DORMANT) {
 -              ctx->table->flags &= ~NFT_TABLE_F_DORMANT;
                ret = nf_tables_table_enable(ctx->net, ctx->table);
                if (ret >= 0)
 -                      nft_trans_table_enable(trans) = true;
 -              else
 -                      ctx->table->flags |= NFT_TABLE_F_DORMANT;
 +                      nft_trans_table_state(trans) = NFT_TABLE_STATE_WAKEUP;
        }
        if (ret < 0)
                goto err;
  
 +      nft_trans_table_flags(trans) = flags;
        nft_trans_table_update(trans) = true;
        list_add_tail(&trans->list, &ctx->net->nft.commit_list);
        return 0;
@@@ -6787,6 -6783,9 +6787,9 @@@ static int nft_register_flowtable_net_h
  
        list_for_each_entry(hook, hook_list, list) {
                list_for_each_entry(ft, &table->flowtables, list) {
+                       if (!nft_is_active_next(net, ft))
+                               continue;
                        list_for_each_entry(hook2, &ft->hook_list, list) {
                                if (hook->ops.dev == hook2->ops.dev &&
                                    hook->ops.pf == hook2->ops.pf) {
@@@ -6846,6 -6845,7 +6849,7 @@@ static int nft_flowtable_update(struct 
        struct nft_hook *hook, *next;
        struct nft_trans *trans;
        bool unregister = false;
+       u32 flags;
        int err;
  
        err = nft_flowtable_parse_hook(ctx, nla[NFTA_FLOWTABLE_HOOK],
                }
        }
  
+       if (nla[NFTA_FLOWTABLE_FLAGS]) {
+               flags = ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
+               if (flags & ~NFT_FLOWTABLE_MASK)
+                       return -EOPNOTSUPP;
+               if ((flowtable->data.flags & NFT_FLOWTABLE_HW_OFFLOAD) ^
+                   (flags & NFT_FLOWTABLE_HW_OFFLOAD))
+                       return -EOPNOTSUPP;
+       } else {
+               flags = flowtable->data.flags;
+       }
        err = nft_register_flowtable_net_hooks(ctx->net, ctx->table,
                                               &flowtable_hook.list, flowtable);
        if (err < 0)
                goto err_flowtable_update_hook;
        }
  
+       nft_trans_flowtable_flags(trans) = flags;
        nft_trans_flowtable(trans) = flowtable;
        nft_trans_flowtable_update(trans) = true;
        INIT_LIST_HEAD(&nft_trans_flowtable_hooks(trans));
@@@ -6967,8 -6979,10 +6983,10 @@@ static int nf_tables_newflowtable(struc
        if (nla[NFTA_FLOWTABLE_FLAGS]) {
                flowtable->data.flags =
                        ntohl(nla_get_be32(nla[NFTA_FLOWTABLE_FLAGS]));
-               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK)
+               if (flowtable->data.flags & ~NFT_FLOWTABLE_MASK) {
+                       err = -EOPNOTSUPP;
                        goto err3;
+               }
        }
  
        write_pnet(&flowtable->data.net, net);
@@@ -8072,10 -8086,11 +8090,10 @@@ static int nf_tables_commit(struct net 
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
 -                              if (!nft_trans_table_enable(trans)) {
 -                                      nf_tables_table_disable(net,
 -                                                              trans->ctx.table);
 -                                      trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 -                              }
 +                              if (nft_trans_table_state(trans) == NFT_TABLE_STATE_DORMANT)
 +                                      nf_tables_table_disable(net, trans->ctx.table);
 +
 +                              trans->ctx.table->flags = nft_trans_table_flags(trans);
                        } else {
                                nft_clear(net, trans->ctx.table);
                        }
                        break;
                case NFT_MSG_NEWFLOWTABLE:
                        if (nft_trans_flowtable_update(trans)) {
+                               nft_trans_flowtable(trans)->data.flags =
+                                       nft_trans_flowtable_flags(trans);
                                nf_tables_flowtable_notify(&trans->ctx,
                                                           nft_trans_flowtable(trans),
                                                           &nft_trans_flowtable_hooks(trans),
@@@ -8286,9 -8303,11 +8306,9 @@@ static int __nf_tables_abort(struct ne
                switch (trans->msg_type) {
                case NFT_MSG_NEWTABLE:
                        if (nft_trans_table_update(trans)) {
 -                              if (nft_trans_table_enable(trans)) {
 -                                      nf_tables_table_disable(net,
 -                                                              trans->ctx.table);
 -                                      trans->ctx.table->flags |= NFT_TABLE_F_DORMANT;
 -                              }
 +                              if (nft_trans_table_state(trans) == NFT_TABLE_STATE_WAKEUP)
 +                                      nf_tables_table_disable(net, trans->ctx.table);
 +
                                nft_trans_destroy(trans);
                        } else {
                                list_del_rcu(&trans->ctx.table->list);
@@@ -8558,7 -8577,6 +8578,7 @@@ static int nf_tables_check_loops(const 
                                                        data->verdict.chain);
                                if (err < 0)
                                        return err;
 +                              break;
                        default:
                                break;
                        }
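
The nf_tables hunks above record the intended dormant/wakeup state and the
new flags in the transaction, apply them only at commit time, and undo only
the early hook registration on abort.  A condensed sketch of that
prepare/commit/abort pattern, with illustrative names standing in for the
nft_trans_* accessors:

#include <stdbool.h>
#include <stdio.h>

enum { STATE_UNCHANGED, STATE_DORMANT, STATE_WAKEUP };

#define F_DORMANT	0x1

struct table { unsigned int flags; bool hooks_registered; };
struct trans { int state; unsigned int flags; };

/* prepare: decide the target state and do the one side effect (hook
 * registration) that must happen before commit */
static void prepare(struct table *t, unsigned int new_flags, struct trans *tr)
{
	tr->state = STATE_UNCHANGED;
	if ((new_flags & F_DORMANT) && !(t->flags & F_DORMANT)) {
		tr->state = STATE_DORMANT;
	} else if (!(new_flags & F_DORMANT) && (t->flags & F_DORMANT)) {
		t->hooks_registered = true;	/* enabled up front */
		tr->state = STATE_WAKEUP;
	}
	tr->flags = new_flags;
}

/* commit: apply the recorded state; flags only change here */
static void commit(struct table *t, const struct trans *tr)
{
	if (tr->state == STATE_DORMANT)
		t->hooks_registered = false;
	t->flags = tr->flags;
}

/* abort: roll back only what prepare() already did */
static void abort_tx(struct table *t, const struct trans *tr)
{
	if (tr->state == STATE_WAKEUP)
		t->hooks_registered = false;
}

int main(void)
{
	struct table t = { .flags = F_DORMANT };
	struct trans tr;

	prepare(&t, 0, &tr);		/* wake the table up */
	commit(&t, &tr);
	printf("flags=%#x hooks=%d\n", t.flags, t.hooks_registered);	/* flags=0 hooks=1 */

	prepare(&t, F_DORMANT, &tr);	/* ask to make it dormant again... */
	abort_tx(&t, &tr);		/* ...but abandon the transaction */
	printf("flags=%#x hooks=%d\n", t.flags, t.hooks_registered);	/* unchanged */
	return 0;
}
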
diff --combined net/sched/cls_api.c
@@@ -1629,6 -1629,7 +1629,7 @@@ int tcf_classify_ingress(struct sk_buf
                        return TC_ACT_SHOT;
                ext->chain = last_executed_chain;
                ext->mru = qdisc_skb_cb(skb)->mru;
+               ext->post_ct = qdisc_skb_cb(skb)->post_ct;
        }
  
        return ret;
@@@ -3661,9 -3662,6 +3662,9 @@@ int tc_setup_flow_action(struct flow_ac
                        entry->police.burst = tcf_police_burst(act);
                        entry->police.rate_bytes_ps =
                                tcf_police_rate_bytes_ps(act);
 +                      entry->police.burst_pkt = tcf_police_burst_pkt(act);
 +                      entry->police.rate_pkt_ps =
 +                              tcf_police_rate_pkt_ps(act);
                        entry->police.mtu = tcf_police_tcfp_mtu(act);
                        entry->police.index = act->tcfa_index;
                } else if (is_tcf_ct(act)) {
diff --combined net/sched/cls_flower.c
@@@ -209,16 -209,16 +209,16 @@@ static bool fl_range_port_dst_cmp(struc
                                  struct fl_flow_key *key,
                                  struct fl_flow_key *mkey)
  {
 -      __be16 min_mask, max_mask, min_val, max_val;
 +      u16 min_mask, max_mask, min_val, max_val;
  
 -      min_mask = htons(filter->mask->key.tp_range.tp_min.dst);
 -      max_mask = htons(filter->mask->key.tp_range.tp_max.dst);
 -      min_val = htons(filter->key.tp_range.tp_min.dst);
 -      max_val = htons(filter->key.tp_range.tp_max.dst);
 +      min_mask = ntohs(filter->mask->key.tp_range.tp_min.dst);
 +      max_mask = ntohs(filter->mask->key.tp_range.tp_max.dst);
 +      min_val = ntohs(filter->key.tp_range.tp_min.dst);
 +      max_val = ntohs(filter->key.tp_range.tp_max.dst);
  
        if (min_mask && max_mask) {
 -              if (htons(key->tp_range.tp.dst) < min_val ||
 -                  htons(key->tp_range.tp.dst) > max_val)
 +              if (ntohs(key->tp_range.tp.dst) < min_val ||
 +                  ntohs(key->tp_range.tp.dst) > max_val)
                        return false;
  
                /* skb does not have min and max values */
@@@ -232,16 -232,16 +232,16 @@@ static bool fl_range_port_src_cmp(struc
                                  struct fl_flow_key *key,
                                  struct fl_flow_key *mkey)
  {
 -      __be16 min_mask, max_mask, min_val, max_val;
 +      u16 min_mask, max_mask, min_val, max_val;
  
 -      min_mask = htons(filter->mask->key.tp_range.tp_min.src);
 -      max_mask = htons(filter->mask->key.tp_range.tp_max.src);
 -      min_val = htons(filter->key.tp_range.tp_min.src);
 -      max_val = htons(filter->key.tp_range.tp_max.src);
 +      min_mask = ntohs(filter->mask->key.tp_range.tp_min.src);
 +      max_mask = ntohs(filter->mask->key.tp_range.tp_max.src);
 +      min_val = ntohs(filter->key.tp_range.tp_min.src);
 +      max_val = ntohs(filter->key.tp_range.tp_max.src);
  
        if (min_mask && max_mask) {
 -              if (htons(key->tp_range.tp.src) < min_val ||
 -                  htons(key->tp_range.tp.src) > max_val)
 +              if (ntohs(key->tp_range.tp.src) < min_val ||
 +                  ntohs(key->tp_range.tp.src) > max_val)
                        return false;
  
                /* skb does not have min and max values */
@@@ -783,16 -783,16 +783,16 @@@ static int fl_set_key_port_range(struc
                       TCA_FLOWER_UNSPEC, sizeof(key->tp_range.tp_max.src));
  
        if (mask->tp_range.tp_min.dst && mask->tp_range.tp_max.dst &&
 -          htons(key->tp_range.tp_max.dst) <=
 -          htons(key->tp_range.tp_min.dst)) {
 +          ntohs(key->tp_range.tp_max.dst) <=
 +          ntohs(key->tp_range.tp_min.dst)) {
                NL_SET_ERR_MSG_ATTR(extack,
                                    tb[TCA_FLOWER_KEY_PORT_DST_MIN],
                                    "Invalid destination port range (min must be strictly smaller than max)");
                return -EINVAL;
        }
        if (mask->tp_range.tp_min.src && mask->tp_range.tp_max.src &&
 -          htons(key->tp_range.tp_max.src) <=
 -          htons(key->tp_range.tp_min.src)) {
 +          ntohs(key->tp_range.tp_max.src) <=
 +          ntohs(key->tp_range.tp_min.src)) {
                NL_SET_ERR_MSG_ATTR(extack,
                                    tb[TCA_FLOWER_KEY_PORT_SRC_MIN],
                                    "Invalid source port range (min must be strictly smaller than max)");
@@@ -1044,8 -1044,8 +1044,8 @@@ static int fl_set_key_flags(struct nlat
                return -EINVAL;
        }
  
 -      key = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS]));
 -      mask = be32_to_cpu(nla_get_u32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
 +      key = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS]));
 +      mask = be32_to_cpu(nla_get_be32(tb[TCA_FLOWER_KEY_FLAGS_MASK]));
  
        *flags_key  = 0;
        *flags_mask = 0;
@@@ -1451,7 -1451,7 +1451,7 @@@ static int fl_set_key_ct(struct nlattr 
                               &mask->ct_state, TCA_FLOWER_KEY_CT_STATE_MASK,
                               sizeof(key->ct_state));
  
-               err = fl_validate_ct_state(mask->ct_state,
+               err = fl_validate_ct_state(key->ct_state & mask->ct_state,
                                           tb[TCA_FLOWER_KEY_CT_STATE_MASK],
                                           extack);
                if (err)
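
The port-range hunks in cls_flower above now convert the network-order
__be16 keys with ntohs() before doing numeric comparisons.  A user-space
sketch of the same inclusive range check, with hypothetical helper names:

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* port, lo and hi are in network byte order, as the flower keys are */
static bool port_in_range(uint16_t port, uint16_t lo, uint16_t hi)
{
	uint16_t p = ntohs(port);

	return p >= ntohs(lo) && p <= ntohs(hi);
}

int main(void)
{
	uint16_t lo = htons(1000), hi = htons(2000);

	printf("%d\n", port_in_range(htons(1500), lo, hi));	/* 1 */
	printf("%d\n", port_in_range(htons(80), lo, hi));	/* 0 */
	return 0;
}
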
diff --combined net/tipc/node.c
@@@ -372,49 -372,42 +372,49 @@@ static struct tipc_node *tipc_node_find
  }
  
  static void tipc_node_read_lock(struct tipc_node *n)
 +      __acquires(n->lock)
  {
        read_lock_bh(&n->lock);
  }
  
  static void tipc_node_read_unlock(struct tipc_node *n)
 +      __releases(n->lock)
  {
        read_unlock_bh(&n->lock);
  }
  
  static void tipc_node_write_lock(struct tipc_node *n)
 +      __acquires(n->lock)
  {
        write_lock_bh(&n->lock);
  }
  
  static void tipc_node_write_unlock_fast(struct tipc_node *n)
 +      __releases(n->lock)
  {
        write_unlock_bh(&n->lock);
  }
  
  static void tipc_node_write_unlock(struct tipc_node *n)
 +      __releases(n->lock)
  {
 +      struct tipc_socket_addr sk;
        struct net *net = n->net;
 -      u32 addr = 0;
        u32 flags = n->action_flags;
 -      u32 link_id = 0;
 -      u32 bearer_id;
        struct list_head *publ_list;
 +      struct tipc_uaddr ua;
 +      u32 bearer_id;
  
        if (likely(!flags)) {
                write_unlock_bh(&n->lock);
                return;
        }
  
 -      addr = n->addr;
 -      link_id = n->link_id;
 -      bearer_id = link_id & 0xffff;
 +      tipc_uaddr(&ua, TIPC_SERVICE_RANGE, TIPC_NODE_SCOPE,
 +                 TIPC_LINK_STATE, n->addr, n->addr);
 +      sk.ref = n->link_id;
 +      sk.node = n->addr;
 +      bearer_id = n->link_id & 0xffff;
        publ_list = &n->publ_list;
  
        n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
        write_unlock_bh(&n->lock);
  
        if (flags & TIPC_NOTIFY_NODE_DOWN)
 -              tipc_publ_notify(net, publ_list, addr, n->capabilities);
 +              tipc_publ_notify(net, publ_list, n->addr, n->capabilities);
  
        if (flags & TIPC_NOTIFY_NODE_UP)
 -              tipc_named_node_up(net, addr, n->capabilities);
 +              tipc_named_node_up(net, n->addr, n->capabilities);
  
        if (flags & TIPC_NOTIFY_LINK_UP) {
 -              tipc_mon_peer_up(net, addr, bearer_id);
 -              tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
 -                                   TIPC_NODE_SCOPE, link_id, link_id);
 +              tipc_mon_peer_up(net, n->addr, bearer_id);
 +              tipc_nametbl_publish(net, &ua, &sk, n->link_id);
        }
        if (flags & TIPC_NOTIFY_LINK_DOWN) {
 -              tipc_mon_peer_down(net, addr, bearer_id);
 -              tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
 -                                    addr, link_id);
 +              tipc_mon_peer_down(net, n->addr, bearer_id);
 +              tipc_nametbl_withdraw(net, &ua, &sk, n->link_id);
        }
  }
  
@@@ -2900,17 -2895,22 +2900,22 @@@ int tipc_nl_node_dump_monitor_peer(stru
  
  #ifdef CONFIG_TIPC_CRYPTO
  static int tipc_nl_retrieve_key(struct nlattr **attrs,
-                               struct tipc_aead_key **key)
+                               struct tipc_aead_key **pkey)
  {
        struct nlattr *attr = attrs[TIPC_NLA_NODE_KEY];
+       struct tipc_aead_key *key;
  
        if (!attr)
                return -ENODATA;
  
-       *key = (struct tipc_aead_key *)nla_data(attr);
-       if (nla_len(attr) < tipc_aead_key_size(*key))
+       if (nla_len(attr) < sizeof(*key))
+               return -EINVAL;
+       key = (struct tipc_aead_key *)nla_data(attr);
+       if (key->keylen > TIPC_AEAD_KEYLEN_MAX ||
+           nla_len(attr) < tipc_aead_key_size(key))
                return -EINVAL;
  
+       *pkey = key;
        return 0;
  }
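
tipc_nl_retrieve_key() above first checks that the fixed part of the
attribute fits, then bounds keylen, and only then trusts the full
tipc_aead_key_size().  A generic sketch of that validate-before-dereference
pattern, using hypothetical stand-in types rather than the TIPC ones:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_KEYLEN	64	/* stand-in for TIPC_AEAD_KEYLEN_MAX */

struct aead_key {		/* hypothetical stand-in for struct tipc_aead_key */
	uint32_t keylen;
	uint8_t key[];
};

/* Accept the blob only if the fixed header fits, the inner length is sane,
 * and the variable part fits inside the attribute as well. */
static const struct aead_key *parse_key(const void *data, size_t len)
{
	const struct aead_key *k = data;

	if (len < sizeof(*k))
		return NULL;
	if (k->keylen > MAX_KEYLEN || len < sizeof(*k) + k->keylen)
		return NULL;
	return k;
}

int main(void)
{
	size_t len = sizeof(struct aead_key) + 16;
	struct aead_key *k = calloc(1, len);

	if (!k)
		return 1;
	k->keylen = 16;
	printf("%s\n", parse_key(k, len) ? "ok" : "rejected");	/* ok */
	k->keylen = 1024;	/* claims more key bytes than the buffer holds */
	printf("%s\n", parse_key(k, len) ? "ok" : "rejected");	/* rejected */
	free(k);
	return 0;
}
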
  
diff --combined tools/lib/bpf/Makefile
@@@ -158,7 -158,7 +158,7 @@@ $(BPF_IN_STATIC): force $(BPF_HELPER_DE
        $(Q)$(MAKE) $(build)=libbpf OUTPUT=$(STATIC_OBJDIR)
  
  $(BPF_HELPER_DEFS): $(srctree)/tools/include/uapi/linux/bpf.h
 -      $(QUIET_GEN)$(srctree)/scripts/bpf_helpers_doc.py --header \
 +      $(QUIET_GEN)$(srctree)/scripts/bpf_doc.py --header \
                --file $(srctree)/tools/include/uapi/linux/bpf.h > $(BPF_HELPER_DEFS)
  
  $(OUTPUT)libbpf.so: $(OUTPUT)libbpf.so.$(LIBBPF_VERSION)
@@@ -215,7 -215,7 +215,7 @@@ define do_instal
        if [ ! -d '$(DESTDIR_SQ)$2' ]; then             \
                $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
        fi;                                             \
-       $(INSTALL) $1 $(if $3,-m $3,) '$(DESTDIR_SQ)$2'
+       $(INSTALL) $(if $3,-m $3,) $1 '$(DESTDIR_SQ)$2'
  endef
  
  install_lib: all_cmd
diff --combined tools/lib/bpf/btf_dump.c
@@@ -279,7 -279,6 +279,7 @@@ static int btf_dump_mark_referenced(str
                case BTF_KIND_INT:
                case BTF_KIND_ENUM:
                case BTF_KIND_FWD:
 +              case BTF_KIND_FLOAT:
                        break;
  
                case BTF_KIND_VOLATILE:
@@@ -454,7 -453,6 +454,7 @@@ static int btf_dump_order_type(struct b
  
        switch (btf_kind(t)) {
        case BTF_KIND_INT:
 +      case BTF_KIND_FLOAT:
                tstate->order_state = ORDERED;
                return 0;
  
                return err;
  
        case BTF_KIND_ARRAY:
-               return btf_dump_order_type(d, btf_array(t)->type, through_ptr);
+               return btf_dump_order_type(d, btf_array(t)->type, false);
  
        case BTF_KIND_STRUCT:
        case BTF_KIND_UNION: {
@@@ -1135,7 -1133,6 +1135,7 @@@ skip_mod
                case BTF_KIND_STRUCT:
                case BTF_KIND_UNION:
                case BTF_KIND_TYPEDEF:
 +              case BTF_KIND_FLOAT:
                        goto done;
                default:
                        pr_warn("unexpected type in decl chain, kind:%u, id:[%u]\n",
@@@ -1250,7 -1247,6 +1250,7 @@@ static void btf_dump_emit_type_chain(st
  
                switch (kind) {
                case BTF_KIND_INT:
 +              case BTF_KIND_FLOAT:
                        btf_dump_emit_mods(d, decls);
                        name = btf_name_of(d, t->name_off);
                        btf_dump_printf(d, "%s", name);
diff --combined tools/lib/bpf/libbpf.c
@@@ -178,8 -178,6 +178,8 @@@ enum kern_feature_id 
        FEAT_PROG_BIND_MAP,
        /* Kernel support for module BTFs */
        FEAT_MODULE_BTF,
 +      /* BTF_KIND_FLOAT support */
 +      FEAT_BTF_FLOAT,
        __FEAT_CNT,
  };
  
@@@ -190,7 -188,6 +190,7 @@@ enum reloc_type 
        RELO_CALL,
        RELO_DATA,
        RELO_EXTERN,
 +      RELO_SUBPROG_ADDR,
  };
  
  struct reloc_desc {
@@@ -577,16 -574,6 +577,16 @@@ static bool insn_is_subprog_call(const 
               insn->off == 0;
  }
  
 +static bool is_ldimm64(struct bpf_insn *insn)
 +{
 +      return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 +}
 +
 +static bool insn_is_pseudo_func(struct bpf_insn *insn)
 +{
 +      return is_ldimm64(insn) && insn->src_reg == BPF_PSEUDO_FUNC;
 +}
 +
  static int
  bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog,
                      const char *name, size_t sec_idx, const char *sec_name,
@@@ -1194,7 -1181,8 +1194,8 @@@ static int bpf_object__elf_init(struct 
        if (!elf_rawdata(elf_getscn(obj->efile.elf, obj->efile.shstrndx), NULL)) {
                pr_warn("elf: failed to get section names strings from %s: %s\n",
                        obj->path, elf_errmsg(-1));
-               return -LIBBPF_ERRNO__FORMAT;
+               err = -LIBBPF_ERRNO__FORMAT;
+               goto errout;
        }
  
        /* Old LLVM set e_machine to EM_NONE */
@@@ -1948,7 -1936,6 +1949,7 @@@ static const char *btf_kind_str(const s
        case BTF_KIND_FUNC_PROTO: return "func_proto";
        case BTF_KIND_VAR: return "var";
        case BTF_KIND_DATASEC: return "datasec";
 +      case BTF_KIND_FLOAT: return "float";
        default: return "unknown";
        }
  }
@@@ -2398,17 -2385,15 +2399,17 @@@ static bool btf_needs_sanitization(stru
  {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
 +      bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
  
 -      return !has_func || !has_datasec || !has_func_global;
 +      return !has_func || !has_datasec || !has_func_global || !has_float;
  }
  
  static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf)
  {
        bool has_func_global = kernel_supports(FEAT_BTF_GLOBAL_FUNC);
        bool has_datasec = kernel_supports(FEAT_BTF_DATASEC);
 +      bool has_float = kernel_supports(FEAT_BTF_FLOAT);
        bool has_func = kernel_supports(FEAT_BTF_FUNC);
        struct btf_type *t;
        int i, j, vlen;
                } else if (!has_func_global && btf_is_func(t)) {
                        /* replace BTF_FUNC_GLOBAL with BTF_FUNC_STATIC */
                        t->info = BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0);
 +              } else if (!has_float && btf_is_float(t)) {
 +                      /* replace FLOAT with an equally-sized empty STRUCT;
 +                       * since C compilers do not accept e.g. "float" as a
 +                       * valid struct name, make it anonymous
 +                       */
 +                      t->name_off = 0;
 +                      t->info = BTF_INFO_ENC(BTF_KIND_STRUCT, 0, 0);
                }
        }
  }
@@@ -2997,23 -2975,6 +2998,23 @@@ static bool sym_is_extern(const GElf_Sy
               GELF_ST_TYPE(sym->st_info) == STT_NOTYPE;
  }
  
 +static bool sym_is_subprog(const GElf_Sym *sym, int text_shndx)
 +{
 +      int bind = GELF_ST_BIND(sym->st_info);
 +      int type = GELF_ST_TYPE(sym->st_info);
 +
 +      /* in .text section */
 +      if (sym->st_shndx != text_shndx)
 +              return false;
 +
 +      /* local function */
 +      if (bind == STB_LOCAL && type == STT_SECTION)
 +              return true;
 +
 +      /* global function */
 +      return bind == STB_GLOBAL && type == STT_FUNC;
 +}
 +
  static int find_extern_btf_id(const struct btf *btf, const char *ext_name)
  {
        const struct btf_type *t;
@@@ -3435,7 -3396,7 +3436,7 @@@ static int bpf_program__record_reloc(st
                return 0;
        }
  
 -      if (insn->code != (BPF_LD | BPF_IMM | BPF_DW)) {
 +      if (!is_ldimm64(insn)) {
                pr_warn("prog '%s': invalid relo against '%s' for insns[%d].code 0x%x\n",
                        prog->name, sym_name, insn_idx, insn->code);
                return -LIBBPF_ERRNO__RELOC;
                return -LIBBPF_ERRNO__RELOC;
        }
  
 +      /* loading subprog addresses */
 +      if (sym_is_subprog(sym, obj->efile.text_shndx)) {
 +              /* global_func: sym->st_value = offset in the section, insn->imm = 0.
 +               * local_func: sym->st_value = 0, insn->imm = offset in the section.
 +               */
 +              if ((sym->st_value % BPF_INSN_SZ) || (insn->imm % BPF_INSN_SZ)) {
 +                      pr_warn("prog '%s': bad subprog addr relo against '%s' at offset %zu+%d\n",
 +                              prog->name, sym_name, (size_t)sym->st_value, insn->imm);
 +                      return -LIBBPF_ERRNO__RELOC;
 +              }
 +
 +              reloc_desc->type = RELO_SUBPROG_ADDR;
 +              reloc_desc->insn_idx = insn_idx;
 +              reloc_desc->sym_off = sym->st_value;
 +              return 0;
 +      }
 +
        type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx);
        sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx));
  
@@@ -3939,18 -3883,6 +3940,18 @@@ static int probe_kern_btf_datasec(void
                                             strs, sizeof(strs)));
  }
  
 +static int probe_kern_btf_float(void)
 +{
 +      static const char strs[] = "\0float";
 +      __u32 types[] = {
 +              /* float */
 +              BTF_TYPE_FLOAT_ENC(1, 4),
 +      };
 +
 +      return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types),
 +                                           strs, sizeof(strs)));
 +}
 +
  static int probe_kern_array_mmap(void)
  {
        struct bpf_create_map_attr attr = {
@@@ -4130,9 -4062,6 +4131,9 @@@ static struct kern_feature_desc 
        [FEAT_MODULE_BTF] = {
                "module BTF support", probe_module_btf,
        },
 +      [FEAT_BTF_FLOAT] = {
 +              "BTF_KIND_FLOAT support", probe_kern_btf_float,
 +      },
  };
  
  static bool kernel_supports(enum kern_feature_id feat_id)
@@@ -5638,6 -5567,11 +5639,6 @@@ static void bpf_core_poison_insn(struc
        insn->imm = 195896080; /* => 0xbad2310 => "bad relo" */
  }
  
 -static bool is_ldimm64(struct bpf_insn *insn)
 -{
 -      return insn->code == (BPF_LD | BPF_IMM | BPF_DW);
 -}
 -
  static int insn_bpf_size_to_bytes(struct bpf_insn *insn)
  {
        switch (BPF_SIZE(insn->code)) {
@@@ -6239,10 -6173,6 +6240,10 @@@ bpf_object__relocate_data(struct bpf_ob
                        }
                        relo->processed = true;
                        break;
 +              case RELO_SUBPROG_ADDR:
 +                      insn[0].src_reg = BPF_PSEUDO_FUNC;
 +                      /* will be handled as a follow up pass */
 +                      break;
                case RELO_CALL:
                        /* will be handled as a follow up pass */
                        break;
@@@ -6429,11 -6359,11 +6430,11 @@@ bpf_object__reloc_code(struct bpf_objec
  
        for (insn_idx = 0; insn_idx < prog->sec_insn_cnt; insn_idx++) {
                insn = &main_prog->insns[prog->sub_insn_off + insn_idx];
 -              if (!insn_is_subprog_call(insn))
 +              if (!insn_is_subprog_call(insn) && !insn_is_pseudo_func(insn))
                        continue;
  
                relo = find_prog_insn_relo(prog, insn_idx);
 -              if (relo && relo->type != RELO_CALL) {
 +              if (relo && relo->type != RELO_CALL && relo->type != RELO_SUBPROG_ADDR) {
                        pr_warn("prog '%s': unexpected relo for insn #%zu, type %d\n",
                                prog->name, insn_idx, relo->type);
                        return -LIBBPF_ERRNO__RELOC;
                         * call always has imm = -1, but for static functions
                         * relocation is against STT_SECTION and insn->imm
                         * points to a start of a static function
 +                       *
 +                       * for subprog addr relocation, the relo->sym_off + insn->imm is
 +                       * the byte offset in the corresponding section.
                         */
 -                      sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
 +                      if (relo->type == RELO_CALL)
 +                              sub_insn_idx = relo->sym_off / BPF_INSN_SZ + insn->imm + 1;
 +                      else
 +                              sub_insn_idx = (relo->sym_off + insn->imm) / BPF_INSN_SZ;
 +              } else if (insn_is_pseudo_func(insn)) {
 +                      /*
 +                       * RELO_SUBPROG_ADDR relo is always emitted even if both
 +                       * functions are in the same section, so it shouldn't reach here.
 +                       */
 +                      pr_warn("prog '%s': missing subprog addr relo for insn #%zu\n",
 +                              prog->name, insn_idx);
 +                      return -LIBBPF_ERRNO__RELOC;
                } else {
                        /* if subprogram call is to a static function within
                         * the same ELF section, there won't be any relocation
diff --combined tools/testing/selftests/bpf/progs/btf_dump_test_case_syntax.c
@@@ -174,6 -174,12 +174,12 @@@ struct struct_in_struct 
        };
  };
  
+ struct struct_in_array {};
+ struct struct_in_array_typed {};
+ typedef struct struct_in_array_typed struct_in_array_t[2];
  struct struct_with_embedded_stuff {
        int a;
        struct {
        } r[5];
        struct struct_in_struct s[10];
        int t[11];
+       struct struct_in_array (*u)[2];
+       struct_in_array_t *v;
  };
  
 +struct float_struct {
 +      float f;
 +      const double *d;
 +      volatile long double *ld;
 +};
 +
  struct root_struct {
        enum e1 _1;
        enum e2 _2;
        union_fwd_t *_12;
        union_fwd_ptr_t _13;
        struct struct_with_embedded_stuff _14;
 +      struct float_struct _15;
  };
  
  /* ------ END-EXPECTED-OUTPUT ------ */
diff --combined tools/testing/selftests/net/mptcp/mptcp_join.sh
@@@ -11,6 -11,7 +11,7 @@@ ksft_skip=
  timeout=30
  mptcp_connect=""
  capture=0
+ do_all_tests=1
  
  TEST_COUNT=0
  
@@@ -121,12 -122,6 +122,6 @@@ reset_with_add_addr_timeout(
                -j DROP
  }
  
- for arg in "$@"; do
-       if [ "$arg" = "-c" ]; then
-               capture=1
-       fi
- done
  ip -Version > /dev/null 2>&1
  if [ $? -ne 0 ];then
        echo "SKIP: Could not run test without ip tool"
@@@ -284,19 -279,14 +279,19 @@@ do_transfer(
                let rm_nr_ns1=-addr_nr_ns1
                if [ $rm_nr_ns1 -lt 8 ]; then
                        counter=1
 -                      sleep 1
 -
 -                      while [ $counter -le $rm_nr_ns1 ]
 -                      do
 -                              ip netns exec ${listener_ns} ./pm_nl_ctl del $counter
 +                      dump=(`ip netns exec ${listener_ns} ./pm_nl_ctl dump`)
 +                      if [ ${#dump[@]} -gt 0 ]; then
 +                              id=${dump[1]}
                                sleep 1
 -                              let counter+=1
 -                      done
 +
 +                              while [ $counter -le $rm_nr_ns1 ]
 +                              do
 +                                      ip netns exec ${listener_ns} ./pm_nl_ctl del $id
 +                                      sleep 1
 +                                      let counter+=1
 +                                      let id+=1
 +                              done
 +                      fi
                else
                        sleep 1
                        ip netns exec ${listener_ns} ./pm_nl_ctl flush
                let rm_nr_ns2=-addr_nr_ns2
                if [ $rm_nr_ns2 -lt 8 ]; then
                        counter=1
 -                      sleep 1
 -
 -                      while [ $counter -le $rm_nr_ns2 ]
 -                      do
 -                              ip netns exec ${connector_ns} ./pm_nl_ctl del $counter
 +                      dump=(`ip netns exec ${connector_ns} ./pm_nl_ctl dump`)
 +                      if [ ${#dump[@]} -gt 0 ]; then
 +                              id=${dump[1]}
                                sleep 1
 -                              let counter+=1
 -                      done
 +
 +                              while [ $counter -le $rm_nr_ns2 ]
 +                              do
 +                                      ip netns exec ${connector_ns} ./pm_nl_ctl del $id
 +                                      sleep 1
 +                                      let counter+=1
 +                                      let id+=1
 +                              done
 +                      fi
                else
                        sleep 1
                        ip netns exec ${connector_ns} ./pm_nl_ctl flush
@@@ -620,22 -605,11 +615,22 @@@ chk_rm_nr(
  {
        local rm_addr_nr=$1
        local rm_subflow_nr=$2
 +      local invert=${3:-""}
        local count
        local dump_stats
 +      local addr_ns
 +      local subflow_ns
 +
 +      if [ -z $invert ]; then
 +              addr_ns=$ns1
 +              subflow_ns=$ns2
 +      elif [ $invert = "invert" ]; then
 +              addr_ns=$ns2
 +              subflow_ns=$ns1
 +      fi
  
        printf "%-39s %s" " " "rm "
 -      count=`ip netns exec $ns1 nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
 +      count=`ip netns exec $addr_ns nstat -as | grep MPTcpExtRmAddr | awk '{print $2}'`
        [ -z "$count" ] && count=0
        if [ "$count" != "$rm_addr_nr" ]; then
                echo "[fail] got $count RM_ADDR[s] expected $rm_addr_nr"
        fi
  
        echo -n " - sf    "
 -      count=`ip netns exec $ns2 nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
 +      count=`ip netns exec $subflow_ns nstat -as | grep MPTcpExtRmSubflow | awk '{print $2}'`
        [ -z "$count" ] && count=0
        if [ "$count" != "$rm_subflow_nr" ]; then
                echo "[fail] got $count RM_SUBFLOW[s] expected $rm_subflow_nr"
@@@ -854,7 -828,7 +849,7 @@@ remove_tests(
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address" 1 1 1
        chk_add_nr 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal, remove
        reset
        chk_join_nr "flush subflows and signal" 3 3 3
        chk_add_nr 1 1
        chk_rm_nr 2 2
 +
 +      # subflows flush
 +      reset
 +      ip netns exec $ns1 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns2 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.2.2 flags subflow id 150
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags subflow
 +      ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags subflow
 +      run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
 +      chk_join_nr "flush subflows" 3 3 3
 +      chk_rm_nr 3 3
 +
 +      # addresses flush
 +      reset
 +      ip netns exec $ns1 ./pm_nl_ctl limits 3 3
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.2.1 flags signal id 250
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.3.1 flags signal
 +      ip netns exec $ns1 ./pm_nl_ctl add 10.0.4.1 flags signal
 +      ip netns exec $ns2 ./pm_nl_ctl limits 3 3
 +      run_tests $ns1 $ns2 10.0.1.1 0 -8 -8 slow
 +      chk_join_nr "flush addresses" 3 3 3
 +      chk_add_nr 3 3
 +      chk_rm_nr 3 3 invert
  }
  
  add_tests()
@@@ -989,7 -940,7 +984,7 @@@ ipv6_tests(
        run_tests $ns1 $ns2 dead:beef:1::1 0 -1 0 slow
        chk_join_nr "remove single address IPv6" 1 1 1
        chk_add_nr 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal IPv6, remove
        reset
@@@ -1132,7 -1083,7 +1127,7 @@@ add_addr_ports_tests(
        run_tests $ns1 $ns2 10.0.1.1 0 -1 0 slow
        chk_join_nr "remove single address with port" 1 1 1
        chk_add_nr 1 1 1
 -      chk_rm_nr 0 0
 +      chk_rm_nr 1 1 invert
  
        # subflow and signal with port, remove
        reset
@@@ -1265,7 -1216,8 +1260,8 @@@ usage(
        echo "  -4 v4mapped_tests"
        echo "  -b backup_tests"
        echo "  -p add_addr_ports_tests"
-       echo "  -c syncookies_tests"
+       echo "  -k syncookies_tests"
+       echo "  -c capture pcap files"
        echo "  -h help"
  }
  
@@@ -1279,12 -1231,24 +1275,24 @@@ make_file "$cin" "client" 
  make_file "$sin" "server" 1
  trap cleanup EXIT
  
- if [ -z $1 ]; then
+ for arg in "$@"; do
+       # check for "capture" arg before launching tests
+       if [[ "${arg}" =~ ^"-"[0-9a-zA-Z]*"c"[0-9a-zA-Z]*$ ]]; then
+               capture=1
+       fi
+       # exception for the capture option, the rest means: a part of the tests
+       if [ "${arg}" != "-c" ]; then
+               do_all_tests=0
+       fi
+ done
+ if [ $do_all_tests -eq 1 ]; then
        all_tests
        exit $ret
  fi
  
- while getopts 'fsltra64bpch' opt; do
+ while getopts 'fsltra64bpkch' opt; do
        case $opt in
                f)
                        subflows_tests
                p)
                        add_addr_ports_tests
                        ;;
-               c)
+               k)
                        syncookies_tests
                        ;;
+               c)
+                       ;;
                h | *)
                        usage
                        ;;