Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net
author Jakub Kicinski <kuba@kernel.org>
Tue, 31 Aug 2021 16:06:04 +0000 (09:06 -0700)
committer Jakub Kicinski <kuba@kernel.org>
Tue, 31 Aug 2021 16:06:04 +0000 (09:06 -0700)
Conflicts:
  include/linux/netdevice.h
  net/socket.c

net:
  d0efb16294d1 ("net: don't unconditionally copy_from_user a struct ifreq for socket ioctls")

net-next:
  876f0bf9d0d5 ("net: socket: simplify dev_ifconf handling")
  29c4964822aa ("net: socket: rework compat_ifreq_ioctl()")

Signed-off-by: Jakub Kicinski <kuba@kernel.org>
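
The conflict spans the net-side fix d0efb16294d1, which makes the socket
ioctl path copy a struct ifreq from userspace only for commands that
actually take one, and the net-next rework of dev_ifconf() and
compat_ifreq_ioctl() in the two commits above. For orientation, a minimal
sketch of the gating helper that the fix adds to include/linux/netdevice.h
(as of d0efb16294d1; the merged hunks below are authoritative):

	/* Socket ioctl commands use the SOCK_IOC_TYPE (0x89) command type,
	 * so only those commands may carry a struct ifreq that is safe to
	 * copy_from_user() unconditionally.
	 */
	static inline bool is_socket_ioctl_cmd(unsigned int cmd)
	{
		return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
	}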
drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
drivers/net/ethernet/intel/ice/ice_main.c
drivers/net/ethernet/mellanox/mlx5/core/dev.c
drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
drivers/net/ethernet/mellanox/mlx5/core/lag.c
drivers/net/ethernet/mellanox/mlx5/core/lag_mp.c
drivers/net/ethernet/mellanox/mlx5/core/steering/dr_rule.c
drivers/net/phy/marvell10g.c
include/linux/netdevice.h
net/socket.c

@@@ -119,10 -119,16 +119,10 @@@ static int aq_pci_func_init(struct pci_
  {
        int err;
  
 -      err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
 -      if (!err)
 -              err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
 +      err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
 +      if (err)
 +              err = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32));
        if (err) {
 -              err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
 -              if (!err)
 -                      err = pci_set_consistent_dma_mask(pdev,
 -                                                        DMA_BIT_MASK(32));
 -      }
 -      if (err != 0) {
                err = -ENOSR;
                goto err_exit;
        }
@@@ -411,6 -417,9 +411,9 @@@ static int atl_resume_common(struct dev
        pci_restore_state(pdev);
  
        if (deep) {
+               /* Reinitialize Nic/Vecs objects */
+               aq_nic_deinit(nic, !nic->aq_hw->aq_nic_cfg->wol);
                ret = aq_nic_init(nic);
                if (ret)
                        goto err_exit;
@@@ -5122,6 -5122,7 +5122,7 @@@ static int ice_set_mac_address(struct n
        struct ice_hw *hw = &pf->hw;
        struct sockaddr *addr = pi;
        enum ice_status status;
+       u8 old_mac[ETH_ALEN];
        u8 flags = 0;
        int err = 0;
        u8 *mac;
        }
  
        netif_addr_lock_bh(netdev);
+       ether_addr_copy(old_mac, netdev->dev_addr);
+       /* change the netdev's MAC address */
+       memcpy(netdev->dev_addr, mac, netdev->addr_len);
+       netif_addr_unlock_bh(netdev);
        /* Clean up old MAC filter. Not an error if old filter doesn't exist */
-       status = ice_fltr_remove_mac(vsi, netdev->dev_addr, ICE_FWD_TO_VSI);
+       status = ice_fltr_remove_mac(vsi, old_mac, ICE_FWD_TO_VSI);
        if (status && status != ICE_ERR_DOES_NOT_EXIST) {
                err = -EADDRNOTAVAIL;
                goto err_update_filters;
@@@ -5168,13 -5174,12 +5174,12 @@@ err_update_filters
        if (err) {
                netdev_err(netdev, "can't set MAC %pM. filter update failed\n",
                           mac);
+               netif_addr_lock_bh(netdev);
+               ether_addr_copy(netdev->dev_addr, old_mac);
                netif_addr_unlock_bh(netdev);
                return err;
        }
  
-       /* change the netdev's MAC address */
-       memcpy(netdev->dev_addr, mac, netdev->addr_len);
-       netif_addr_unlock_bh(netdev);
        netdev_dbg(vsi->netdev, "updated MAC address to %pM\n",
                   netdev->dev_addr);
  
@@@ -6570,12 -6575,12 +6575,12 @@@ event_after
  }
  
  /**
 - * ice_do_ioctl - Access the hwtstamp interface
 + * ice_eth_ioctl - Access the hwtstamp interface
   * @netdev: network interface device structure
   * @ifr: interface request data
   * @cmd: ioctl command
   */
 -static int ice_do_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
 +static int ice_eth_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd)
  {
        struct ice_netdev_priv *np = netdev_priv(netdev);
        struct ice_pf *pf = np->vsi->back;
@@@ -7241,7 -7246,7 +7246,7 @@@ static const struct net_device_ops ice_
        .ndo_change_mtu = ice_change_mtu,
        .ndo_get_stats64 = ice_get_stats64,
        .ndo_set_tx_maxrate = ice_set_tx_maxrate,
 -      .ndo_do_ioctl = ice_do_ioctl,
 +      .ndo_eth_ioctl = ice_eth_ioctl,
        .ndo_set_vf_spoofchk = ice_set_vf_spoofchk,
        .ndo_set_vf_mac = ice_set_vf_mac,
        .ndo_get_vf_config = ice_get_vf_cfg,
@@@ -53,7 -53,7 +53,7 @@@ static bool is_eth_rep_supported(struc
        return true;
  }
  
 -static bool is_eth_supported(struct mlx5_core_dev *dev)
 +bool mlx5_eth_supported(struct mlx5_core_dev *dev)
  {
        if (!IS_ENABLED(CONFIG_MLX5_CORE_EN))
                return false;
        return true;
  }
  
 -static bool is_vnet_supported(struct mlx5_core_dev *dev)
 +static bool is_eth_enabled(struct mlx5_core_dev *dev)
 +{
 +      union devlink_param_value val;
 +      int err;
 +
 +      err = devlink_param_driverinit_value_get(priv_to_devlink(dev),
 +                                               DEVLINK_PARAM_GENERIC_ID_ENABLE_ETH,
 +                                               &val);
 +      return err ? false : val.vbool;
 +}
 +
 +bool mlx5_vnet_supported(struct mlx5_core_dev *dev)
  {
        if (!IS_ENABLED(CONFIG_MLX5_VDPA_NET))
                return false;
        return true;
  }
  
 +static bool is_vnet_enabled(struct mlx5_core_dev *dev)
 +{
 +      union devlink_param_value val;
 +      int err;
 +
 +      err = devlink_param_driverinit_value_get(priv_to_devlink(dev),
 +                                               DEVLINK_PARAM_GENERIC_ID_ENABLE_VNET,
 +                                               &val);
 +      return err ? false : val.vbool;
 +}
 +
  static bool is_ib_rep_supported(struct mlx5_core_dev *dev)
  {
        if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND))
@@@ -192,7 -170,7 +192,7 @@@ static bool is_mp_supported(struct mlx5
        return true;
  }
  
 -static bool is_ib_supported(struct mlx5_core_dev *dev)
 +bool mlx5_rdma_supported(struct mlx5_core_dev *dev)
  {
        if (!IS_ENABLED(CONFIG_MLX5_INFINIBAND))
                return false;
        return true;
  }
  
 +static bool is_ib_enabled(struct mlx5_core_dev *dev)
 +{
 +      union devlink_param_value val;
 +      int err;
 +
 +      err = devlink_param_driverinit_value_get(priv_to_devlink(dev),
 +                                               DEVLINK_PARAM_GENERIC_ID_ENABLE_RDMA,
 +                                               &val);
 +      return err ? false : val.vbool;
 +}
 +
  enum {
        MLX5_INTERFACE_PROTOCOL_ETH,
        MLX5_INTERFACE_PROTOCOL_ETH_REP,
  static const struct mlx5_adev_device {
        const char *suffix;
        bool (*is_supported)(struct mlx5_core_dev *dev);
 +      bool (*is_enabled)(struct mlx5_core_dev *dev);
  } mlx5_adev_devices[] = {
        [MLX5_INTERFACE_PROTOCOL_VNET] = { .suffix = "vnet",
 -                                         .is_supported = &is_vnet_supported },
 +                                         .is_supported = &mlx5_vnet_supported,
 +                                         .is_enabled = &is_vnet_enabled },
        [MLX5_INTERFACE_PROTOCOL_IB] = { .suffix = "rdma",
 -                                       .is_supported = &is_ib_supported },
 +                                       .is_supported = &mlx5_rdma_supported,
 +                                       .is_enabled = &is_ib_enabled },
        [MLX5_INTERFACE_PROTOCOL_ETH] = { .suffix = "eth",
 -                                        .is_supported = &is_eth_supported },
 +                                        .is_supported = &mlx5_eth_supported,
 +                                        .is_enabled = &is_eth_enabled },
        [MLX5_INTERFACE_PROTOCOL_ETH_REP] = { .suffix = "eth-rep",
                                           .is_supported = &is_eth_rep_supported },
        [MLX5_INTERFACE_PROTOCOL_IB_REP] = { .suffix = "rdma-rep",
@@@ -345,14 -308,6 +345,14 @@@ int mlx5_attach_device(struct mlx5_core
                if (!priv->adev[i]) {
                        bool is_supported = false;
  
 +                      if (mlx5_adev_devices[i].is_enabled) {
 +                              bool enabled;
 +
 +                              enabled = mlx5_adev_devices[i].is_enabled(dev);
 +                              if (!enabled)
 +                                      continue;
 +                      }
 +
                        if (mlx5_adev_devices[i].is_supported)
                                is_supported = mlx5_adev_devices[i].is_supported(dev);
  
@@@ -405,14 -360,6 +405,14 @@@ void mlx5_detach_device(struct mlx5_cor
                if (!priv->adev[i])
                        continue;
  
 +              if (mlx5_adev_devices[i].is_enabled) {
 +                      bool enabled;
 +
 +                      enabled = mlx5_adev_devices[i].is_enabled(dev);
 +                      if (!enabled)
 +                              goto skip_suspend;
 +              }
 +
                adev = &priv->adev[i]->adev;
                /* Auxiliary driver was unbound manually through sysfs */
                if (!adev->dev.driver)
@@@ -450,7 -397,7 +450,7 @@@ int mlx5_register_device(struct mlx5_co
  void mlx5_unregister_device(struct mlx5_core_dev *dev)
  {
        mutex_lock(&mlx5_intf_mutex);
-       dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV;
+       dev->priv.flags = MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV;
        mlx5_rescan_drivers_locked(dev);
        mutex_unlock(&mlx5_intf_mutex);
  }
@@@ -500,21 -447,12 +500,21 @@@ static void delete_drivers(struct mlx5_
                if (!priv->adev[i])
                        continue;
  
 +              if (mlx5_adev_devices[i].is_enabled) {
 +                      bool enabled;
 +
 +                      enabled = mlx5_adev_devices[i].is_enabled(dev);
 +                      if (!enabled)
 +                              goto del_adev;
 +              }
 +
                if (mlx5_adev_devices[i].is_supported && !delete_all)
                        is_supported = mlx5_adev_devices[i].is_supported(dev);
  
                if (is_supported)
                        continue;
  
 +del_adev:
                del_adev(&priv->adev[i]->adev);
                priv->adev[i] = NULL;
        }
  #include <net/flow_offload.h>
  #include <net/sch_generic.h>
  #include <net/pkt_cls.h>
 -#include <net/tc_act/tc_gact.h>
 -#include <net/tc_act/tc_skbedit.h>
  #include <linux/mlx5/fs.h>
  #include <linux/mlx5/device.h>
  #include <linux/rhashtable.h>
  #include <linux/refcount.h>
  #include <linux/completion.h>
 -#include <net/tc_act/tc_mirred.h>
 -#include <net/tc_act/tc_vlan.h>
 -#include <net/tc_act/tc_tunnel_key.h>
  #include <net/tc_act/tc_pedit.h>
  #include <net/tc_act/tc_csum.h>
 -#include <net/tc_act/tc_mpls.h>
  #include <net/psample.h>
  #include <net/arp.h>
  #include <net/ipv6_stubs.h>
  #include <net/bareudp.h>
  #include <net/bonding.h>
  #include "en.h"
 +#include "en/tc/post_act.h"
  #include "en_rep.h"
  #include "en/rep/tc.h"
  #include "en/rep/neigh.h"
@@@ -61,7 -66,7 +61,7 @@@
  #include "en/mod_hdr.h"
  #include "en/tc_priv.h"
  #include "en/tc_tun_encap.h"
 -#include "esw/sample.h"
 +#include "en/tc/sample.h"
  #include "lib/devcom.h"
  #include "lib/geneve.h"
  #include "lib/fs_chains.h"
@@@ -98,7 -103,7 +98,7 @@@ struct mlx5e_tc_attr_to_reg_mapping mlx
        [MARK_TO_REG] = mark_to_reg_ct,
        [LABELS_TO_REG] = labels_to_reg_ct,
        [FTEID_TO_REG] = fteid_to_reg_ct,
 -      /* For NIC rules we store the retore metadata directly
 +      /* For NIC rules we store the restore metadata directly
         * into reg_b that is passed to SW since we don't
         * jump between steering domains.
         */
@@@ -247,7 -252,7 +247,7 @@@ get_ct_priv(struct mlx5e_priv *priv
  }
  
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
 -static struct mlx5_esw_psample *
 +static struct mlx5e_tc_psample *
  get_sample_priv(struct mlx5e_priv *priv)
  {
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
                uplink_rpriv = mlx5_eswitch_get_uplink_priv(esw, REP_ETH);
                uplink_priv = &uplink_rpriv->uplink_priv;
  
 -              return uplink_priv->esw_psample;
 +              return uplink_priv->tc_psample;
        }
  
        return NULL;
@@@ -335,12 -340,12 +335,12 @@@ struct mlx5e_hairpin 
        struct mlx5_core_dev *func_mdev;
        struct mlx5e_priv *func_priv;
        u32 tdn;
 -      u32 tirn;
 +      struct mlx5e_tir direct_tir;
  
        int num_channels;
        struct mlx5e_rqt indir_rqt;
 -      u32 indir_tirn[MLX5E_NUM_INDIR_TIRS];
 -      struct mlx5e_ttc_table ttc;
 +      struct mlx5e_tir indir_tir[MLX5E_NUM_INDIR_TIRS];
 +      struct mlx5_ttc_table *ttc;
  };
  
  struct mlx5e_hairpin_entry {
@@@ -477,101 -482,126 +477,101 @@@ struct mlx5_core_dev *mlx5e_hairpin_get
  
  static int mlx5e_hairpin_create_transport(struct mlx5e_hairpin *hp)
  {
 -      u32 in[MLX5_ST_SZ_DW(create_tir_in)] = {};
 -      void *tirc;
 +      struct mlx5e_tir_builder *builder;
        int err;
  
 +      builder = mlx5e_tir_builder_alloc(false);
 +      if (!builder)
 +              return -ENOMEM;
 +
        err = mlx5_core_alloc_transport_domain(hp->func_mdev, &hp->tdn);
        if (err)
 -              goto alloc_tdn_err;
 -
 -      tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 -
 -      MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT);
 -      MLX5_SET(tirc, tirc, inline_rqn, hp->pair->rqn[0]);
 -      MLX5_SET(tirc, tirc, transport_domain, hp->tdn);
 +              goto out;
  
 -      err = mlx5_core_create_tir(hp->func_mdev, in, &hp->tirn);
 +      mlx5e_tir_builder_build_inline(builder, hp->tdn, hp->pair->rqn[0]);
 +      err = mlx5e_tir_init(&hp->direct_tir, builder, hp->func_mdev, false);
        if (err)
                goto create_tir_err;
  
 -      return 0;
 +out:
 +      mlx5e_tir_builder_free(builder);
 +      return err;
  
  create_tir_err:
        mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
 -alloc_tdn_err:
 -      return err;
 +
 +      goto out;
  }
  
  static void mlx5e_hairpin_destroy_transport(struct mlx5e_hairpin *hp)
  {
 -      mlx5_core_destroy_tir(hp->func_mdev, hp->tirn);
 +      mlx5e_tir_destroy(&hp->direct_tir);
        mlx5_core_dealloc_transport_domain(hp->func_mdev, hp->tdn);
  }
  
 -static int mlx5e_hairpin_fill_rqt_rqns(struct mlx5e_hairpin *hp, void *rqtc)
 -{
 -      struct mlx5e_priv *priv = hp->func_priv;
 -      int i, ix, sz = MLX5E_INDIR_RQT_SIZE;
 -      u32 *indirection_rqt, rqn;
 -
 -      indirection_rqt = kcalloc(sz, sizeof(*indirection_rqt), GFP_KERNEL);
 -      if (!indirection_rqt)
 -              return -ENOMEM;
 -
 -      mlx5e_build_default_indir_rqt(indirection_rqt, sz,
 -                                    hp->num_channels);
 -
 -      for (i = 0; i < sz; i++) {
 -              ix = i;
 -              if (priv->rss_params.hfunc == ETH_RSS_HASH_XOR)
 -                      ix = mlx5e_bits_invert(i, ilog2(sz));
 -              ix = indirection_rqt[ix];
 -              rqn = hp->pair->rqn[ix];
 -              MLX5_SET(rqtc, rqtc, rq_num[i], rqn);
 -      }
 -
 -      kfree(indirection_rqt);
 -      return 0;
 -}
 -
  static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
  {
 -      int inlen, err, sz = MLX5E_INDIR_RQT_SIZE;
        struct mlx5e_priv *priv = hp->func_priv;
        struct mlx5_core_dev *mdev = priv->mdev;
 -      void *rqtc;
 -      u32 *in;
 +      struct mlx5e_rss_params_indir *indir;
 +      int err;
  
 -      inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + sizeof(u32) * sz;
 -      in = kvzalloc(inlen, GFP_KERNEL);
 -      if (!in)
 +      indir = kvmalloc(sizeof(*indir), GFP_KERNEL);
 +      if (!indir)
                return -ENOMEM;
  
 -      rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
 +      mlx5e_rss_params_indir_init_uniform(indir, hp->num_channels);
 +      err = mlx5e_rqt_init_indir(&hp->indir_rqt, mdev, hp->pair->rqn, hp->num_channels,
 +                                 mlx5e_rx_res_get_current_hash(priv->rx_res).hfunc,
 +                                 indir);
  
 -      MLX5_SET(rqtc, rqtc, rqt_actual_size, sz);
 -      MLX5_SET(rqtc, rqtc, rqt_max_size, sz);
 -
 -      err = mlx5e_hairpin_fill_rqt_rqns(hp, rqtc);
 -      if (err)
 -              goto out;
 -
 -      err = mlx5_core_create_rqt(mdev, in, inlen, &hp->indir_rqt.rqtn);
 -      if (!err)
 -              hp->indir_rqt.enabled = true;
 -
 -out:
 -      kvfree(in);
 +      kvfree(indir);
        return err;
  }
  
  static int mlx5e_hairpin_create_indirect_tirs(struct mlx5e_hairpin *hp)
  {
        struct mlx5e_priv *priv = hp->func_priv;
 -      u32 in[MLX5_ST_SZ_DW(create_tir_in)];
 -      int tt, i, err;
 -      void *tirc;
 +      struct mlx5e_rss_params_hash rss_hash;
 +      enum mlx5_traffic_types tt, max_tt;
 +      struct mlx5e_tir_builder *builder;
 +      int err = 0;
 +
 +      builder = mlx5e_tir_builder_alloc(false);
 +      if (!builder)
 +              return -ENOMEM;
 +
 +      rss_hash = mlx5e_rx_res_get_current_hash(priv->rx_res);
  
        for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++) {
 -              struct mlx5e_tirc_config ttconfig = mlx5e_tirc_get_default_config(tt);
 +              struct mlx5e_rss_params_traffic_type rss_tt;
  
 -              memset(in, 0, MLX5_ST_SZ_BYTES(create_tir_in));
 -              tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
 +              rss_tt = mlx5e_rss_get_default_tt_config(tt);
  
 -              MLX5_SET(tirc, tirc, transport_domain, hp->tdn);
 -              MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
 -              MLX5_SET(tirc, tirc, indirect_table, hp->indir_rqt.rqtn);
 -              mlx5e_build_indir_tir_ctx_hash(&priv->rss_params, &ttconfig, tirc, false);
 +              mlx5e_tir_builder_build_rqt(builder, hp->tdn,
 +                                          mlx5e_rqt_get_rqtn(&hp->indir_rqt),
 +                                          false);
 +              mlx5e_tir_builder_build_rss(builder, &rss_hash, &rss_tt, false);
  
 -              err = mlx5_core_create_tir(hp->func_mdev, in,
 -                                         &hp->indir_tirn[tt]);
 +              err = mlx5e_tir_init(&hp->indir_tir[tt], builder, hp->func_mdev, false);
                if (err) {
                        mlx5_core_warn(hp->func_mdev, "create indirect tirs failed, %d\n", err);
                        goto err_destroy_tirs;
                }
 +
 +              mlx5e_tir_builder_clear(builder);
        }
 -      return 0;
  
 -err_destroy_tirs:
 -      for (i = 0; i < tt; i++)
 -              mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[i]);
 +out:
 +      mlx5e_tir_builder_free(builder);
        return err;
 +
 +err_destroy_tirs:
 +      max_tt = tt;
 +      for (tt = 0; tt < max_tt; tt++)
 +              mlx5e_tir_destroy(&hp->indir_tir[tt]);
 +
 +      goto out;
  }
  
  static void mlx5e_hairpin_destroy_indirect_tirs(struct mlx5e_hairpin *hp)
        int tt;
  
        for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
 -              mlx5_core_destroy_tir(hp->func_mdev, hp->indir_tirn[tt]);
 +              mlx5e_tir_destroy(&hp->indir_tir[tt]);
  }
  
  static void mlx5e_hairpin_set_ttc_params(struct mlx5e_hairpin *hp,
  
        memset(ttc_params, 0, sizeof(*ttc_params));
  
 -      ttc_params->any_tt_tirn = hp->tirn;
 -
 -      for (tt = 0; tt < MLX5E_NUM_INDIR_TIRS; tt++)
 -              ttc_params->indir_tirn[tt] = hp->indir_tirn[tt];
 +      ttc_params->ns = mlx5_get_flow_namespace(hp->func_mdev,
 +                                               MLX5_FLOW_NAMESPACE_KERNEL);
 +      for (tt = 0; tt < MLX5_NUM_TT; tt++) {
 +              ttc_params->dests[tt].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
 +              ttc_params->dests[tt].tir_num =
 +                      tt == MLX5_TT_ANY ?
 +                              mlx5e_tir_get_tirn(&hp->direct_tir) :
 +                              mlx5e_tir_get_tirn(&hp->indir_tir[tt]);
 +      }
  
 -      ft_attr->max_fte = MLX5E_TTC_TABLE_SIZE;
        ft_attr->level = MLX5E_TC_TTC_FT_LEVEL;
        ft_attr->prio = MLX5E_TC_PRIO;
  }
@@@ -619,31 -645,30 +619,31 @@@ static int mlx5e_hairpin_rss_init(struc
                goto err_create_indirect_tirs;
  
        mlx5e_hairpin_set_ttc_params(hp, &ttc_params);
 -      err = mlx5e_create_ttc_table(priv, &ttc_params, &hp->ttc);
 -      if (err)
 +      hp->ttc = mlx5_create_ttc_table(priv->mdev, &ttc_params);
 +      if (IS_ERR(hp->ttc)) {
 +              err = PTR_ERR(hp->ttc);
                goto err_create_ttc_table;
 +      }
  
        netdev_dbg(priv->netdev, "add hairpin: using %d channels rss ttc table id %x\n",
 -                 hp->num_channels, hp->ttc.ft.t->id);
 +                 hp->num_channels,
 +                 mlx5_get_ttc_flow_table(priv->fs.ttc)->id);
  
        return 0;
  
  err_create_ttc_table:
        mlx5e_hairpin_destroy_indirect_tirs(hp);
  err_create_indirect_tirs:
 -      mlx5e_destroy_rqt(priv, &hp->indir_rqt);
 +      mlx5e_rqt_destroy(&hp->indir_rqt);
  
        return err;
  }
  
  static void mlx5e_hairpin_rss_cleanup(struct mlx5e_hairpin *hp)
  {
 -      struct mlx5e_priv *priv = hp->func_priv;
 -
 -      mlx5e_destroy_ttc_table(priv, &hp->ttc);
 +      mlx5_destroy_ttc_table(hp->ttc);
        mlx5e_hairpin_destroy_indirect_tirs(hp);
 -      mlx5e_destroy_rqt(priv, &hp->indir_rqt);
 +      mlx5e_rqt_destroy(&hp->indir_rqt);
  }
  
  static struct mlx5e_hairpin *
@@@ -878,17 -903,16 +878,17 @@@ static int mlx5e_hairpin_flow_add(struc
        }
  
        netdev_dbg(priv->netdev, "add hairpin: tirn %x rqn %x peer %s sqn %x prio %d (log) data %d packets %d\n",
 -                 hp->tirn, hp->pair->rqn[0],
 +                 mlx5e_tir_get_tirn(&hp->direct_tir), hp->pair->rqn[0],
                   dev_name(hp->pair->peer_mdev->device),
                   hp->pair->sqn[0], match_prio, params.log_data_size, params.log_num_packets);
  
  attach_flow:
        if (hpe->hp->num_channels > 1) {
                flow_flag_set(flow, HAIRPIN_RSS);
 -              flow->attr->nic_attr->hairpin_ft = hpe->hp->ttc.ft.t;
 +              flow->attr->nic_attr->hairpin_ft =
 +                      mlx5_get_ttc_flow_table(hpe->hp->ttc);
        } else {
 -              flow->attr->nic_attr->hairpin_tirn = hpe->hp->tirn;
 +              flow->attr->nic_attr->hairpin_tirn = mlx5e_tir_get_tirn(&hpe->hp->direct_tir);
        }
  
        flow->hpe = hpe;
@@@ -1032,17 -1056,15 +1032,17 @@@ err_ft_get
  
  static int
  mlx5e_tc_add_nic_flow(struct mlx5e_priv *priv,
 -                    struct mlx5e_tc_flow_parse_attr *parse_attr,
                      struct mlx5e_tc_flow *flow,
                      struct netlink_ext_ack *extack)
  {
 +      struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
        struct mlx5_core_dev *dev = priv->mdev;
 -      struct mlx5_fc *counter = NULL;
 +      struct mlx5_fc *counter;
        int err;
  
 +      parse_attr = attr->parse_attr;
 +
        if (flow_flag_test(flow, HAIRPIN)) {
                err = mlx5e_hairpin_flow_add(priv, flow, parse_attr, extack);
                if (err)
@@@ -1148,8 -1170,7 +1148,8 @@@ mlx5e_tc_offload_fdb_rules(struct mlx5_
                                               mod_hdr_acts);
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
        } else if (flow_flag_test(flow, SAMPLE)) {
 -              rule = mlx5_esw_sample_offload(get_sample_priv(flow->priv), spec, attr);
 +              rule = mlx5e_tc_sample_offload(get_sample_priv(flow->priv), spec, attr,
 +                                             mlx5e_tc_get_flow_tun_id(flow));
  #endif
        } else {
                rule = mlx5_eswitch_add_offloaded_rule(esw, spec, attr);
@@@ -1188,7 -1209,7 +1188,7 @@@ void mlx5e_tc_unoffload_fdb_rules(struc
  
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
        if (flow_flag_test(flow, SAMPLE)) {
 -              mlx5_esw_sample_unoffload(get_sample_priv(flow->priv), flow->rule[0], attr);
 +              mlx5e_tc_sample_unoffload(get_sample_priv(flow->priv), flow->rule[0], attr);
                return;
        }
  #endif
@@@ -1317,6 -1338,7 +1317,7 @@@ bool mlx5e_tc_is_vf_tunnel(struct net_d
  int mlx5e_tc_query_route_vport(struct net_device *out_dev, struct net_device *route_dev, u16 *vport)
  {
        struct mlx5e_priv *out_priv, *route_priv;
+       struct mlx5_devcom *devcom = NULL;
        struct mlx5_core_dev *route_mdev;
        struct mlx5_eswitch *esw;
        u16 vhca_id;
        route_mdev = route_priv->mdev;
  
        vhca_id = MLX5_CAP_GEN(route_mdev, vhca_id);
+       if (mlx5_lag_is_active(out_priv->mdev)) {
+               /* In the LAG case we may get devices from different eswitch instances.
+                * If we fail to get the vport num, it most likely means we are on the
+                * wrong eswitch.
+                */
+               err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
+               if (err != -ENOENT)
+                       return err;
+               devcom = out_priv->mdev->priv.devcom;
+               esw = mlx5_devcom_get_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
+               if (!esw)
+                       return -ENODEV;
+       }
        err = mlx5_eswitch_vhca_id_to_vport(esw, vhca_id, vport);
+       if (devcom)
+               mlx5_devcom_release_peer_data(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
        return err;
  }
  
@@@ -1363,9 -1402,9 +1381,9 @@@ mlx5e_tc_add_fdb_flow(struct mlx5e_pri
        bool vf_tun = false, encap_valid = true;
        struct net_device *encap_dev = NULL;
        struct mlx5_esw_flow_attr *esw_attr;
 -      struct mlx5_fc *counter = NULL;
        struct mlx5e_rep_priv *rpriv;
        struct mlx5e_priv *out_priv;
 +      struct mlx5_fc *counter;
        u32 max_prio, max_chain;
        int err = 0;
        int out_index;
@@@ -1552,7 -1591,6 +1570,7 @@@ static void mlx5e_tc_del_fdb_flow(struc
                else
                        mlx5e_detach_mod_hdr(priv, flow);
        }
 +      kfree(attr->sample_attr);
        kvfree(attr->parse_attr);
        kvfree(attr->esw_attr->rx_tun_attr);
  
        if (flow_flag_test(flow, L3_TO_L2_DECAP))
                mlx5e_detach_decap(priv, flow);
  
 -      kfree(flow->attr->esw_attr->sample);
        kfree(flow->attr);
  }
  
@@@ -1626,22 -1665,17 +1644,22 @@@ static void mlx5e_tc_del_flow(struct ml
        }
  }
  
 -static int flow_has_tc_fwd_action(struct flow_cls_offload *f)
 +static bool flow_requires_tunnel_mapping(u32 chain, struct flow_cls_offload *f)
  {
        struct flow_rule *rule = flow_cls_offload_flow_rule(f);
        struct flow_action *flow_action = &rule->action;
        const struct flow_action_entry *act;
        int i;
  
 +      if (chain)
 +              return false;
 +
        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                case FLOW_ACTION_GOTO:
                        return true;
 +              case FLOW_ACTION_SAMPLE:
 +                      return true;
                default:
                        continue;
                }
@@@ -1882,7 -1916,7 +1900,7 @@@ static int parse_tunnel_attr(struct mlx
                return -EOPNOTSUPP;
  
        needs_mapping = !!flow->attr->chain;
 -      sets_mapping = !flow->attr->chain && flow_has_tc_fwd_action(f);
 +      sets_mapping = flow_requires_tunnel_mapping(flow->attr->chain, f);
        *match_inner = !needs_mapping;
  
        if ((needs_mapping || sets_mapping) &&
@@@ -2455,7 -2489,7 +2473,7 @@@ static int __parse_cls_flower(struct ml
                        spec->match_criteria_enable |= MLX5_MATCH_MISC_PARAMETERS_3;
                }
        }
 -      /* Currenlty supported only for MPLS over UDP */
 +      /* Currently supported only for MPLS over UDP */
        if (flow_rule_match_key(rule, FLOW_DISSECTOR_KEY_MPLS) &&
            !netif_is_bareudp(filter_dev)) {
                NL_SET_ERR_MSG_MOD(extack,
@@@ -2709,9 -2743,7 +2727,9 @@@ static int offload_pedit_fields(struct 
                if (s_mask && a_mask) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "can't set and add to the same HW field");
 -                      printk(KERN_WARNING "mlx5: can't set and add to the same HW field (%x)\n", f->field);
 +                      netdev_warn(priv->netdev,
 +                                  "mlx5: can't set and add to the same HW field (%x)\n",
 +                                  f->field);
                        return -EOPNOTSUPP;
                }
  
                if (first < next_z && next_z < last) {
                        NL_SET_ERR_MSG_MOD(extack,
                                           "rewrite of few sub-fields isn't supported");
 -                      printk(KERN_WARNING "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n",
 -                             mask);
 +                      netdev_warn(priv->netdev,
 +                                  "mlx5: rewrite of few sub-fields (mask %lx) isn't offloaded\n",
 +                                  mask);
                        return -EOPNOTSUPP;
                }
  
@@@ -3339,10 -3370,10 +3357,10 @@@ static int validate_goto_chain(struct m
  
  static int parse_tc_nic_actions(struct mlx5e_priv *priv,
                                struct flow_action *flow_action,
 -                              struct mlx5e_tc_flow_parse_attr *parse_attr,
                                struct mlx5e_tc_flow *flow,
                                struct netlink_ext_ack *extack)
  {
 +      struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5_flow_attr *attr = flow->attr;
        struct pedit_headers_action hdrs[2] = {};
        const struct flow_action_entry *act;
                return -EOPNOTSUPP;
  
        nic_attr = attr->nic_attr;
 -
        nic_attr->flow_tag = MLX5_FS_DEFAULT_FLOW_TAG;
 +      parse_attr = attr->parse_attr;
  
        flow_action_for_each(i, act, flow_action) {
                switch (act->id) {
                                  MLX5_FLOW_CONTEXT_ACTION_COUNT;
                        break;
                case FLOW_ACTION_DROP:
 -                      action |= MLX5_FLOW_CONTEXT_ACTION_DROP;
 -                      if (MLX5_CAP_FLOWTABLE(priv->mdev,
 -                                             flow_table_properties_nic_receive.flow_counter))
 -                              action |= MLX5_FLOW_CONTEXT_ACTION_COUNT;
 +                      action |= MLX5_FLOW_CONTEXT_ACTION_DROP |
 +                                MLX5_FLOW_CONTEXT_ACTION_COUNT;
                        break;
                case FLOW_ACTION_MANGLE:
                case FLOW_ACTION_ADD:
                                                   "device is not on same HW, can't offload");
                                netdev_warn(priv->netdev, "device %s not on same HW, can't offload\n",
                                            peer_dev->name);
 -                              return -EINVAL;
 +                              return -EOPNOTSUPP;
                        }
                        }
                        break;
                        if (mark & ~MLX5E_TC_FLOW_ID_MASK) {
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "Bad flow mark - only 16 bit is supported");
 -                              return -EINVAL;
 +                              return -EOPNOTSUPP;
                        }
  
                        nic_attr->flow_tag = mark;
@@@ -3717,19 -3750,20 +3735,19 @@@ static int verify_uplink_forwarding(str
  static int parse_tc_fdb_actions(struct mlx5e_priv *priv,
                                struct flow_action *flow_action,
                                struct mlx5e_tc_flow *flow,
 -                              struct netlink_ext_ack *extack,
 -                              struct net_device *filter_dev)
 +                              struct netlink_ext_ack *extack)
  {
        struct pedit_headers_action hdrs[2] = {};
        struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
        struct mlx5e_tc_flow_parse_attr *parse_attr;
        struct mlx5e_rep_priv *rpriv = priv->ppriv;
 +      struct mlx5e_sample_attr sample_attr = {};
        const struct ip_tunnel_info *info = NULL;
        struct mlx5_flow_attr *attr = flow->attr;
        int ifindexes[MLX5_MAX_FLOW_FWD_VPORTS];
        bool ft_flow = mlx5e_is_ft_flow(flow);
        const struct flow_action_entry *act;
        struct mlx5_esw_flow_attr *esw_attr;
 -      struct mlx5_sample_attr sample = {};
        bool encap = false, decap = false;
        u32 action = attr->action;
        int err, i, if_count = 0;
                                                   "mpls pop supported only as first action");
                                return -EOPNOTSUPP;
                        }
 -                      if (!netif_is_bareudp(filter_dev)) {
 +                      if (!netif_is_bareudp(parse_attr->filter_dev)) {
                                NL_SET_ERR_MSG_MOD(extack,
                                                   "mpls pop supported only on bareudp devices");
                                return -EOPNOTSUPP;
                                            "devices %s %s not on same switch HW, can't offload forwarding\n",
                                            priv->netdev->name,
                                            out_dev->name);
 -                              return -EINVAL;
 +                              return -EOPNOTSUPP;
                        }
                        }
                        break;
                                NL_SET_ERR_MSG_MOD(extack, "Sample action with connection tracking is not supported");
                                return -EOPNOTSUPP;
                        }
 -                      sample.rate = act->sample.rate;
 -                      sample.group_num = act->sample.psample_group->group_num;
 +                      sample_attr.rate = act->sample.rate;
 +                      sample_attr.group_num = act->sample.psample_group->group_num;
                        if (act->sample.truncate)
 -                              sample.trunc_size = act->sample.trunc_size;
 +                              sample_attr.trunc_size = act->sample.trunc_size;
                        flow_flag_set(flow, SAMPLE);
                        break;
                default:
         * no errors after parsing.
         */
        if (flow_flag_test(flow, SAMPLE)) {
 -              esw_attr->sample = kzalloc(sizeof(*esw_attr->sample), GFP_KERNEL);
 -              if (!esw_attr->sample)
 +              attr->sample_attr = kzalloc(sizeof(*attr->sample_attr), GFP_KERNEL);
 +              if (!attr->sample_attr)
                        return -ENOMEM;
 -              *esw_attr->sample = sample;
 +              *attr->sample_attr = sample_attr;
        }
  
        return 0;
@@@ -4284,7 -4318,7 +4302,7 @@@ __mlx5e_add_fdb_flow(struct mlx5e_priv 
        if (err)
                goto err_free;
  
 -      err = parse_tc_fdb_actions(priv, &rule->action, flow, extack, filter_dev);
 +      err = parse_tc_fdb_actions(priv, &rule->action, flow, extack);
        if (err)
                goto err_free;
  
@@@ -4430,11 -4464,11 +4448,11 @@@ mlx5e_add_nic_flow(struct mlx5e_priv *p
        if (err)
                goto err_free;
  
 -      err = parse_tc_nic_actions(priv, &rule->action, parse_attr, flow, extack);
 +      err = parse_tc_nic_actions(priv, &rule->action, flow, extack);
        if (err)
                goto err_free;
  
 -      err = mlx5e_tc_add_nic_flow(priv, parse_attr, flow, extack);
 +      err = mlx5e_tc_add_nic_flow(priv, flow, extack);
        if (err)
                goto err_free;
  
@@@ -4689,7 -4723,7 +4707,7 @@@ static int apply_police_params(struct m
                rate_mbps = max_t(u32, rate, 1);
        }
  
 -      err = mlx5_esw_modify_vport_rate(esw, vport_num, rate_mbps);
 +      err = mlx5_esw_qos_modify_vport_rate(esw, vport_num, rate_mbps);
        if (err)
                NL_SET_ERR_MSG_MOD(extack, "failed applying action to hardware");
  
@@@ -4861,7 -4895,6 +4879,7 @@@ int mlx5e_tc_nic_init(struct mlx5e_pri
        struct mlx5_core_dev *dev = priv->mdev;
        struct mapping_ctx *chains_mapping;
        struct mlx5_chains_attr attr = {};
 +      u64 mapping_id;
        int err;
  
        mlx5e_mod_hdr_tbl_init(&tc->mod_hdr);
  
        lockdep_set_class(&tc->ht.mutex, &tc_ht_lock_key);
  
 -      chains_mapping = mapping_create(sizeof(struct mlx5_mapped_obj),
 -                                      MLX5E_TC_TABLE_CHAIN_TAG_MASK, true);
 +      mapping_id = mlx5_query_nic_system_image_guid(dev);
 +
 +      chains_mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_CHAIN,
 +                                             sizeof(struct mlx5_mapped_obj),
 +                                             MLX5E_TC_TABLE_CHAIN_TAG_MASK, true);
 +
        if (IS_ERR(chains_mapping)) {
                err = PTR_ERR(chains_mapping);
                goto err_mapping;
                goto err_chains;
        }
  
 +      tc->post_act = mlx5e_tc_post_act_init(priv, tc->chains, MLX5_FLOW_NAMESPACE_KERNEL);
        tc->ct = mlx5_tc_ct_init(priv, tc->chains, &priv->fs.tc.mod_hdr,
 -                               MLX5_FLOW_NAMESPACE_KERNEL);
 +                               MLX5_FLOW_NAMESPACE_KERNEL, tc->post_act);
  
        tc->netdevice_nb.notifier_call = mlx5e_tc_netdev_event;
        err = register_netdevice_notifier_dev_net(priv->netdev,
  
  err_reg:
        mlx5_tc_ct_clean(tc->ct);
 +      mlx5e_tc_post_act_destroy(tc->post_act);
        mlx5_chains_destroy(tc->chains);
  err_chains:
        mapping_destroy(chains_mapping);
@@@ -4959,7 -4986,6 +4977,7 @@@ void mlx5e_tc_nic_cleanup(struct mlx5e_
        mutex_destroy(&tc->t_lock);
  
        mlx5_tc_ct_clean(tc->ct);
 +      mlx5e_tc_post_act_destroy(tc->post_act);
        mapping_destroy(tc->mapping);
        mlx5_chains_destroy(tc->chains);
  }
@@@ -4972,7 -4998,6 +4990,7 @@@ int mlx5e_tc_esw_init(struct rhashtabl
        struct mapping_ctx *mapping;
        struct mlx5_eswitch *esw;
        struct mlx5e_priv *priv;
 +      u64 mapping_id;
        int err = 0;
  
        uplink_priv = container_of(tc_ht, struct mlx5_rep_uplink_priv, tc_ht);
        priv = netdev_priv(rpriv->netdev);
        esw = priv->mdev->priv.eswitch;
  
 +      uplink_priv->post_act = mlx5e_tc_post_act_init(priv, esw_chains(esw),
 +                                                     MLX5_FLOW_NAMESPACE_FDB);
        uplink_priv->ct_priv = mlx5_tc_ct_init(netdev_priv(priv->netdev),
                                               esw_chains(esw),
                                               &esw->offloads.mod_hdr,
 -                                             MLX5_FLOW_NAMESPACE_FDB);
 +                                             MLX5_FLOW_NAMESPACE_FDB,
 +                                             uplink_priv->post_act);
  
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
 -      uplink_priv->esw_psample = mlx5_esw_sample_init(netdev_priv(priv->netdev));
 +      uplink_priv->tc_psample = mlx5e_tc_sample_init(esw, uplink_priv->post_act);
  #endif
  
 -      mapping = mapping_create(sizeof(struct tunnel_match_key),
 -                               TUNNEL_INFO_BITS_MASK, true);
 +      mapping_id = mlx5_query_nic_system_image_guid(esw->dev);
 +
 +      mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL,
 +                                      sizeof(struct tunnel_match_key),
 +                                      TUNNEL_INFO_BITS_MASK, true);
 +
        if (IS_ERR(mapping)) {
                err = PTR_ERR(mapping);
                goto err_tun_mapping;
        uplink_priv->tunnel_mapping = mapping;
  
        /* 0xFFF is reserved for stack devices slow path table mark */
 -      mapping = mapping_create(sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true);
 +      mapping = mapping_create_for_id(mapping_id, MAPPING_TYPE_TUNNEL_ENC_OPTS,
 +                                      sz_enc_opts, ENC_OPTS_BITS_MASK - 1, true);
        if (IS_ERR(mapping)) {
                err = PTR_ERR(mapping);
                goto err_enc_opts_mapping;
@@@ -5035,12 -5052,11 +5053,12 @@@ err_enc_opts_mapping
        mapping_destroy(uplink_priv->tunnel_mapping);
  err_tun_mapping:
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
 -      mlx5_esw_sample_cleanup(uplink_priv->esw_psample);
 +      mlx5e_tc_sample_cleanup(uplink_priv->tc_psample);
  #endif
        mlx5_tc_ct_clean(uplink_priv->ct_priv);
        netdev_warn(priv->netdev,
                    "Failed to initialize tc (eswitch), err: %d", err);
 +      mlx5e_tc_post_act_destroy(uplink_priv->post_act);
        return err;
  }
  
@@@ -5057,10 -5073,9 +5075,10 @@@ void mlx5e_tc_esw_cleanup(struct rhasht
        mapping_destroy(uplink_priv->tunnel_mapping);
  
  #if IS_ENABLED(CONFIG_MLX5_TC_SAMPLE)
 -      mlx5_esw_sample_cleanup(uplink_priv->esw_psample);
 +      mlx5e_tc_sample_cleanup(uplink_priv->tc_psample);
  #endif
        mlx5_tc_ct_clean(uplink_priv->ct_priv);
 +      mlx5e_tc_post_act_destroy(uplink_priv->post_act);
  }
  
  int mlx5e_tc_num_filters(struct mlx5e_priv *priv, unsigned long flags)
@@@ -32,9 -32,7 +32,9 @@@
  
  #include <linux/netdevice.h>
  #include <linux/mlx5/driver.h>
 +#include <linux/mlx5/eswitch.h>
  #include <linux/mlx5/vport.h>
 +#include "lib/devcom.h"
  #include "mlx5_core.h"
  #include "eswitch.h"
  #include "lag.h"
@@@ -47,7 -45,7 +47,7 @@@
  static DEFINE_SPINLOCK(lag_lock);
  
  static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 remap_port1,
 -                             u8 remap_port2)
 +                             u8 remap_port2, bool shared_fdb)
  {
        u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
        void *lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
@@@ -56,7 -54,6 +56,7 @@@
  
        MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, remap_port1);
        MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, remap_port2);
 +      MLX5_SET(lagc, lag_ctx, fdb_selection_mode, shared_fdb);
  
        return mlx5_cmd_exec_in(dev, create_lag, in);
  }
@@@ -227,59 -224,35 +227,59 @@@ void mlx5_modify_lag(struct mlx5_lag *l
  }
  
  static int mlx5_create_lag(struct mlx5_lag *ldev,
 -                         struct lag_tracker *tracker)
 +                         struct lag_tracker *tracker,
 +                         bool shared_fdb)
  {
        struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
 +      struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
 +      u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
        int err;
  
        mlx5_infer_tx_affinity_mapping(tracker, &ldev->v2p_map[MLX5_LAG_P1],
                                       &ldev->v2p_map[MLX5_LAG_P2]);
  
 -      mlx5_core_info(dev0, "lag map port 1:%d port 2:%d",
 -                     ldev->v2p_map[MLX5_LAG_P1], ldev->v2p_map[MLX5_LAG_P2]);
 +      mlx5_core_info(dev0, "lag map port 1:%d port 2:%d shared_fdb:%d",
 +                     ldev->v2p_map[MLX5_LAG_P1], ldev->v2p_map[MLX5_LAG_P2],
 +                     shared_fdb);
  
        err = mlx5_cmd_create_lag(dev0, ldev->v2p_map[MLX5_LAG_P1],
 -                                ldev->v2p_map[MLX5_LAG_P2]);
 -      if (err)
 +                                ldev->v2p_map[MLX5_LAG_P2], shared_fdb);
 +      if (err) {
                mlx5_core_err(dev0,
                              "Failed to create LAG (%d)\n",
                              err);
 +              return err;
 +      }
 +
 +      if (shared_fdb) {
 +              err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
 +                                                            dev1->priv.eswitch);
 +              if (err)
 +                      mlx5_core_err(dev0, "Can't enable single FDB mode\n");
 +              else
 +                      mlx5_core_info(dev0, "Operation mode is single FDB\n");
 +      }
 +
 +      if (err) {
 +              MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
 +              if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
 +                      mlx5_core_err(dev0,
 +                                    "Failed to deactivate RoCE LAG; driver restart required\n");
 +      }
 +
        return err;
  }
  
  int mlx5_activate_lag(struct mlx5_lag *ldev,
                      struct lag_tracker *tracker,
 -                    u8 flags)
 +                    u8 flags,
 +                    bool shared_fdb)
  {
        bool roce_lag = !!(flags & MLX5_LAG_FLAG_ROCE);
        struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
        int err;
  
 -      err = mlx5_create_lag(ldev, tracker);
 +      err = mlx5_create_lag(ldev, tracker, shared_fdb);
        if (err) {
                if (roce_lag) {
                        mlx5_core_err(dev0,
        }
  
        ldev->flags |= flags;
 +      ldev->shared_fdb = shared_fdb;
        return 0;
  }
  
@@@ -305,13 -277,8 +305,14 @@@ static int mlx5_deactivate_lag(struct m
        int err;
  
        ldev->flags &= ~MLX5_LAG_MODE_FLAGS;
+       mlx5_lag_mp_reset(ldev);
  
 +      if (ldev->shared_fdb) {
 +              mlx5_eswitch_offloads_destroy_single_fdb(ldev->pf[MLX5_LAG_P1].dev->priv.eswitch,
 +                                                       ldev->pf[MLX5_LAG_P2].dev->priv.eswitch);
 +              ldev->shared_fdb = false;
 +      }
 +
        MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
        err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
        if (err) {
@@@ -367,10 -334,6 +368,10 @@@ static void mlx5_lag_remove_devices(str
                if (!ldev->pf[i].dev)
                        continue;
  
 +              if (ldev->pf[i].dev->priv.flags &
 +                  MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
 +                      continue;
 +
                ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
                mlx5_rescan_drivers_locked(ldev->pf[i].dev);
        }
@@@ -380,15 -343,12 +381,15 @@@ static void mlx5_disable_lag(struct mlx
  {
        struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
        struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
 +      bool shared_fdb = ldev->shared_fdb;
        bool roce_lag;
        int err;
  
        roce_lag = __mlx5_lag_is_roce(ldev);
  
 -      if (roce_lag) {
 +      if (shared_fdb) {
 +              mlx5_lag_remove_devices(ldev);
 +      } else if (roce_lag) {
                if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
                        dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
                        mlx5_rescan_drivers_locked(dev0);
        if (err)
                return;
  
 -      if (roce_lag)
 +      if (shared_fdb || roce_lag)
                mlx5_lag_add_devices(ldev);
 +
 +      if (shared_fdb) {
 +              if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
 +                      mlx5_eswitch_reload_reps(dev0->priv.eswitch);
 +              if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
 +                      mlx5_eswitch_reload_reps(dev1->priv.eswitch);
 +      }
 +}
 +
 +static bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
 +{
 +      struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
 +      struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
 +
 +      if (is_mdev_switchdev_mode(dev0) &&
 +          is_mdev_switchdev_mode(dev1) &&
 +          mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
 +          mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
 +          mlx5_devcom_is_paired(dev0->priv.devcom,
 +                                MLX5_DEVCOM_ESW_OFFLOADS) &&
 +          MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
 +          MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
 +          MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
 +              return true;
 +
 +      return false;
  }
  
  static void mlx5_do_bond(struct mlx5_lag *ldev)
        bool do_bond, roce_lag;
        int err;
  
 -      if (!mlx5_lag_is_ready(ldev))
 -              return;
 -
 -      tracker = ldev->tracker;
 +      if (!mlx5_lag_is_ready(ldev)) {
 +              do_bond = false;
 +      } else {
 +              tracker = ldev->tracker;
  
 -      do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
 +              do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
 +      }
  
        if (do_bond && !__mlx5_lag_is_active(ldev)) {
 +              bool shared_fdb = mlx5_shared_fdb_supported(ldev);
 +
                roce_lag = !mlx5_sriov_is_enabled(dev0) &&
                           !mlx5_sriov_is_enabled(dev1);
  
                           dev1->priv.eswitch->mode == MLX5_ESWITCH_NONE;
  #endif
  
 -              if (roce_lag)
 +              if (shared_fdb || roce_lag)
                        mlx5_lag_remove_devices(ldev);
  
                err = mlx5_activate_lag(ldev, &tracker,
                                        roce_lag ? MLX5_LAG_FLAG_ROCE :
 -                                      MLX5_LAG_FLAG_SRIOV);
 +                                                 MLX5_LAG_FLAG_SRIOV,
 +                                      shared_fdb);
                if (err) {
 -                      if (roce_lag)
 +                      if (shared_fdb || roce_lag)
                                mlx5_lag_add_devices(ldev);
  
                        return;
 -              }
 -
 -              if (roce_lag) {
 +              } else if (roce_lag) {
                        dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
                        mlx5_rescan_drivers_locked(dev0);
                        mlx5_nic_vport_enable_roce(dev1);
 +              } else if (shared_fdb) {
 +                      dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 +                      mlx5_rescan_drivers_locked(dev0);
 +
 +                      err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
 +                      if (!err)
 +                              err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);
 +
 +                      if (err) {
 +                              dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
 +                              mlx5_rescan_drivers_locked(dev0);
 +                              mlx5_deactivate_lag(ldev);
 +                              mlx5_lag_add_devices(ldev);
 +                              mlx5_eswitch_reload_reps(dev0->priv.eswitch);
 +                              mlx5_eswitch_reload_reps(dev1->priv.eswitch);
 +                              mlx5_core_err(dev0, "Failed to enable lag\n");
 +                              return;
 +                      }
                }
        } else if (do_bond && __mlx5_lag_is_active(ldev)) {
                mlx5_modify_lag(ldev, &tracker);
@@@ -505,48 -419,21 +506,48 @@@ static void mlx5_queue_bond_work(struc
        queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
  }
  
 +static void mlx5_lag_lock_eswitches(struct mlx5_core_dev *dev0,
 +                                  struct mlx5_core_dev *dev1)
 +{
 +      if (dev0)
 +              mlx5_esw_lock(dev0->priv.eswitch);
 +      if (dev1)
 +              mlx5_esw_lock(dev1->priv.eswitch);
 +}
 +
 +static void mlx5_lag_unlock_eswitches(struct mlx5_core_dev *dev0,
 +                                    struct mlx5_core_dev *dev1)
 +{
 +      if (dev1)
 +              mlx5_esw_unlock(dev1->priv.eswitch);
 +      if (dev0)
 +              mlx5_esw_unlock(dev0->priv.eswitch);
 +}
 +
  static void mlx5_do_bond_work(struct work_struct *work)
  {
        struct delayed_work *delayed_work = to_delayed_work(work);
        struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
                                             bond_work);
 +      struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
 +      struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
        int status;
  
        status = mlx5_dev_list_trylock();
        if (!status) {
 -              /* 1 sec delay. */
                mlx5_queue_bond_work(ldev, HZ);
                return;
        }
  
 +      if (ldev->mode_changes_in_progress) {
 +              mlx5_dev_list_unlock();
 +              mlx5_queue_bond_work(ldev, HZ);
 +              return;
 +      }
 +
 +      mlx5_lag_lock_eswitches(dev0, dev1);
        mlx5_do_bond(ldev);
 +      mlx5_lag_unlock_eswitches(dev0, dev1);
        mlx5_dev_list_unlock();
  }
  
@@@ -744,7 -631,7 +745,7 @@@ static void mlx5_ldev_remove_mdev(struc
  }
  
  /* Must be called with intf_mutex held */
 -static void __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
 +static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
  {
        struct mlx5_lag *ldev = NULL;
        struct mlx5_core_dev *tmp_dev;
        if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
            !MLX5_CAP_GEN(dev, lag_master) ||
            MLX5_CAP_GEN(dev, num_lag_ports) != MLX5_MAX_PORTS)
 -              return;
 +              return 0;
  
        tmp_dev = mlx5_get_next_phys_dev(dev);
        if (tmp_dev)
                ldev = mlx5_lag_dev_alloc(dev);
                if (!ldev) {
                        mlx5_core_err(dev, "Failed to alloc lag dev\n");
 -                      return;
 +                      return 0;
                }
        } else {
 +              if (ldev->mode_changes_in_progress)
 +                      return -EAGAIN;
                mlx5_ldev_get(ldev);
        }
  
        mlx5_ldev_add_mdev(ldev, dev);
  
 -      return;
 +      return 0;
  }
  
  void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
        if (!ldev)
                return;
  
 +recheck:
        mlx5_dev_list_lock();
 +      if (ldev->mode_changes_in_progress) {
 +              mlx5_dev_list_unlock();
 +              msleep(100);
 +              goto recheck;
 +      }
        mlx5_ldev_remove_mdev(ldev, dev);
        mlx5_dev_list_unlock();
        mlx5_ldev_put(ldev);
  
  void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
  {
 +      int err;
 +
 +recheck:
        mlx5_dev_list_lock();
 -      __mlx5_lag_dev_add_mdev(dev);
 +      err = __mlx5_lag_dev_add_mdev(dev);
 +      if (err) {
 +              mlx5_dev_list_unlock();
 +              msleep(100);
 +              goto recheck;
 +      }
        mlx5_dev_list_unlock();
  }
  
@@@ -820,11 -691,11 +821,11 @@@ void mlx5_lag_remove_netdev(struct mlx5
        if (!ldev)
                return;
  
 -      if (__mlx5_lag_is_active(ldev))
 -              mlx5_disable_lag(ldev);
 -
        mlx5_ldev_remove_netdev(ldev, netdev);
        ldev->flags &= ~MLX5_LAG_FLAG_READY;
 +
 +      if (__mlx5_lag_is_active(ldev))
 +              mlx5_queue_bond_work(ldev, 0);
  }
  
  /* Must be called with intf_mutex held */
@@@ -846,7 -717,6 +847,7 @@@ void mlx5_lag_add_netdev(struct mlx5_co
  
        if (i >= MLX5_MAX_PORTS)
                ldev->flags |= MLX5_LAG_FLAG_READY;
 +      mlx5_queue_bond_work(ldev, 0);
  }
  
  bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
@@@ -877,21 -747,6 +878,21 @@@ bool mlx5_lag_is_active(struct mlx5_cor
  }
  EXPORT_SYMBOL(mlx5_lag_is_active);
  
 +bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_lag *ldev;
 +      bool res;
 +
 +      spin_lock(&lag_lock);
 +      ldev = mlx5_lag_dev(dev);
 +      res = ldev && __mlx5_lag_is_active(ldev) &&
 +              dev == ldev->pf[MLX5_LAG_P1].dev;
 +      spin_unlock(&lag_lock);
 +
 +      return res;
 +}
 +EXPORT_SYMBOL(mlx5_lag_is_master);
 +
  bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
  {
        struct mlx5_lag *ldev;
  }
  EXPORT_SYMBOL(mlx5_lag_is_sriov);
  
 -void mlx5_lag_update(struct mlx5_core_dev *dev)
 +bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_lag *ldev;
 +      bool res;
 +
 +      spin_lock(&lag_lock);
 +      ldev = mlx5_lag_dev(dev);
 +      res = ldev && __mlx5_lag_is_sriov(ldev) && ldev->shared_fdb;
 +      spin_unlock(&lag_lock);
 +
 +      return res;
 +}
 +EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);
 +
 +void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
  {
 +      struct mlx5_core_dev *dev0;
 +      struct mlx5_core_dev *dev1;
        struct mlx5_lag *ldev;
  
        mlx5_dev_list_lock();
 +
        ldev = mlx5_lag_dev(dev);
 -      if (!ldev)
 -              goto unlock;
 +      dev0 = ldev->pf[MLX5_LAG_P1].dev;
 +      dev1 = ldev->pf[MLX5_LAG_P2].dev;
  
 -      mlx5_do_bond(ldev);
 +      ldev->mode_changes_in_progress++;
 +      if (__mlx5_lag_is_active(ldev)) {
 +              mlx5_lag_lock_eswitches(dev0, dev1);
 +              mlx5_disable_lag(ldev);
 +              mlx5_lag_unlock_eswitches(dev0, dev1);
 +      }
 +      mlx5_dev_list_unlock();
 +}
  
 -unlock:
 +void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_lag *ldev;
 +
 +      mlx5_dev_list_lock();
 +      ldev = mlx5_lag_dev(dev);
 +      ldev->mode_changes_in_progress--;
        mlx5_dev_list_unlock();
 +      mlx5_queue_bond_work(ldev, 0);
  }
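
These two helpers are intended to bracket an operation that must not race with bond creation: while mode_changes_in_progress is non-zero, __mlx5_lag_dev_add_mdev() returns -EAGAIN and mlx5_lag_remove_mdev() spins, as seen above. A hedged usage sketch (treating an eswitch mode change as an assumed example of such an operation):

	mlx5_lag_disable_change(dev);	/* tear down LAG, block re-creation */
	/* ... change eswitch mode or other LAG-incompatible state ... */
	mlx5_lag_enable_change(dev);	/* drop the block, re-queue bond work */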
  
  struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
@@@ -1004,26 -828,6 +1005,26 @@@ unlock
  }
  EXPORT_SYMBOL(mlx5_lag_get_slave_port);
  
 +struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_core_dev *peer_dev = NULL;
 +      struct mlx5_lag *ldev;
 +
 +      spin_lock(&lag_lock);
 +      ldev = mlx5_lag_dev(dev);
 +      if (!ldev)
 +              goto unlock;
 +
 +      peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
 +                         ldev->pf[MLX5_LAG_P2].dev :
 +                         ldev->pf[MLX5_LAG_P1].dev;
 +
 +unlock:
 +      spin_unlock(&lag_lock);
 +      return peer_dev;
 +}
 +EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);
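
A minimal, hypothetical caller (use_peer() is illustrative): the helper returns the other PF of the pair, or NULL when no LAG device exists:

	struct mlx5_core_dev *peer = mlx5_lag_get_peer_mdev(dev);

	if (peer)
		use_peer(peer);	/* hypothetical consumer of the peer PF */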
 +
  int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
                                 u64 *values,
                                 int num_counters,
@@@ -161,7 -161,7 +161,7 @@@ static void mlx5_lag_fib_route_event(st
                struct lag_tracker tracker;
  
                tracker = ldev->tracker;
 -              mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH);
 +              mlx5_activate_lag(ldev, &tracker, MLX5_LAG_FLAG_MULTIPATH, false);
        }
  
        mlx5_lag_set_port_affinity(ldev, MLX5_LAG_NORMAL_AFFINITY);
@@@ -302,6 -302,14 +302,14 @@@ static int mlx5_lag_fib_event(struct no
        return NOTIFY_DONE;
  }
  
+ void mlx5_lag_mp_reset(struct mlx5_lag *ldev)
+ {
+       /* Clear mfi, as it might become stale when a route delete event
+        * has been missed, see mlx5_lag_fib_route_event().
+        */
+       ldev->lag_mp.mfi = NULL;
+ }
+ 
  int mlx5_lag_mp_init(struct mlx5_lag *ldev)
  {
        struct lag_mp *mp = &ldev->lag_mp;
@@@ -81,7 -81,6 +81,7 @@@ dr_rule_create_collision_entry(struct m
        }
  
        ste->ste_chain_location = orig_ste->ste_chain_location;
 +      ste->htbl->pointing_ste = orig_ste->htbl->pointing_ste;
  
        /* In collision entry, all members share the same miss_list_head */
        ste->htbl->miss_list = mlx5dr_ste_get_miss_list(orig_ste);
@@@ -186,9 -185,6 +186,9 @@@ dr_rule_rehash_handle_collision(struct 
        if (!new_ste)
                return NULL;
  
 +      /* Update collision pointing STE */
 +      new_ste->htbl->pointing_ste = col_ste->htbl->pointing_ste;
 +
        /* In collision entry, all members share the same miss_list_head */
        new_ste->htbl->miss_list = mlx5dr_ste_get_miss_list(col_ste);
  
@@@ -216,7 -212,7 +216,7 @@@ static void dr_rule_rehash_copy_ste_ctr
        new_ste->next_htbl = cur_ste->next_htbl;
        new_ste->ste_chain_location = cur_ste->ste_chain_location;
  
 -      if (!mlx5dr_ste_is_last_in_rule(nic_matcher, new_ste->ste_chain_location))
 +      if (new_ste->next_htbl)
                new_ste->next_htbl->pointing_ste = new_ste;
  
        /* We need to copy the refcount since this ste
         * may have been traversed several times
         */
        new_ste->refcount = cur_ste->refcount;
  
 -      /* Link old STEs rule_mem list to the new ste */
 -      mlx5dr_rule_update_rule_member(cur_ste, new_ste);
 -      INIT_LIST_HEAD(&new_ste->rule_list);
 -      list_splice_tail_init(&cur_ste->rule_list, &new_ste->rule_list);
 +      /* Link old STEs rule to the new ste */
 +      mlx5dr_rule_set_last_member(cur_ste->rule_rx_tx, new_ste, false);
  }
  
  static struct mlx5dr_ste *
@@@ -406,7 -404,7 +406,7 @@@ dr_rule_rehash_htbl(struct mlx5dr_rule 
        info.miss_icm_addr = nic_matcher->e_anchor->chunk->icm_addr;
        mlx5dr_ste_set_formatted_ste(dmn->ste_ctx,
                                     dmn->info.caps.gvmi,
 -                                   nic_dmn,
 +                                   nic_dmn->type,
                                     new_htbl,
                                     formatted_ste,
                                     &info);
@@@ -583,66 -581,34 +583,66 @@@ free_action_members
        return -ENOMEM;
  }
  
 -/* While the pointer of ste is no longer valid, like while moving ste to be
 - * the first in the miss_list, and to be in the origin table,
 - * all rule-members that are attached to this ste should update their ste member
 - * to the new pointer
 - */
 -void mlx5dr_rule_update_rule_member(struct mlx5dr_ste *ste,
 -                                  struct mlx5dr_ste *new_ste)
 +void mlx5dr_rule_set_last_member(struct mlx5dr_rule_rx_tx *nic_rule,
 +                               struct mlx5dr_ste *ste,
 +                               bool force)
 +{
 +      /* Updating the rule member is usually done for the last STE or during
 +       * rule creation, to recover from a mid-creation failure (for this
 +       * purpose the force flag is used).
 +       */
 +      if (ste->next_htbl && !force)
 +              return;
 +
 +      /* Update is required since each rule keeps track of its last STE */
 +      ste->rule_rx_tx = nic_rule;
 +      nic_rule->last_rule_ste = ste;
 +}
 +
 +static struct mlx5dr_ste *dr_rule_get_pointed_ste(struct mlx5dr_ste *curr_ste)
 +{
 +      struct mlx5dr_ste *first_ste;
 +
 +      first_ste = list_first_entry(mlx5dr_ste_get_miss_list(curr_ste),
 +                                   struct mlx5dr_ste, miss_list_node);
 +
 +      return first_ste->htbl->pointing_ste;
 +}
 +
 +int mlx5dr_rule_get_reverse_rule_members(struct mlx5dr_ste **ste_arr,
 +                                       struct mlx5dr_ste *curr_ste,
 +                                       int *num_of_stes)
  {
 -      struct mlx5dr_rule_member *rule_mem;
 +      bool first = false;
 +
 +      *num_of_stes = 0;
 +
 +      if (!curr_ste)
 +              return -ENOENT;
 +
 +      /* Iterate from last to first */
 +      while (!first) {
 +              first = curr_ste->ste_chain_location == 1;
 +              ste_arr[*num_of_stes] = curr_ste;
 +              *num_of_stes += 1;
 +              curr_ste = dr_rule_get_pointed_ste(curr_ste);
 +      }
  
 -      list_for_each_entry(rule_mem, &ste->rule_list, use_ste_list)
 -              rule_mem->ste = new_ste;
 +      return 0;
  }
  
  static void dr_rule_clean_rule_members(struct mlx5dr_rule *rule,
                                       struct mlx5dr_rule_rx_tx *nic_rule)
  {
 -      struct mlx5dr_rule_member *rule_mem;
 -      struct mlx5dr_rule_member *tmp_mem;
 +      struct mlx5dr_ste *ste_arr[DR_RULE_MAX_STES + DR_ACTION_MAX_STES];
 +      struct mlx5dr_ste *curr_ste = nic_rule->last_rule_ste;
 +      int i;
  
 -      if (list_empty(&nic_rule->rule_members_list))
 +      if (mlx5dr_rule_get_reverse_rule_members(ste_arr, curr_ste, &i))
                return;
 -      list_for_each_entry_safe(rule_mem, tmp_mem, &nic_rule->rule_members_list, list) {
 -              list_del(&rule_mem->list);
 -              list_del(&rule_mem->use_ste_list);
 -              mlx5dr_ste_put(rule_mem->ste, rule->matcher, nic_rule->nic_matcher);
 -              kvfree(rule_mem);
 -      }
 +
 +      while (i--)
 +              mlx5dr_ste_put(ste_arr[i], rule->matcher, nic_rule->nic_matcher);
  }
  
  static u16 dr_get_bits_per_mask(u16 byte_mask)
@@@ -662,25 -628,43 +662,25 @@@ static bool dr_rule_need_enlarge_hash(s
                                      struct mlx5dr_domain_rx_tx *nic_dmn)
  {
        struct mlx5dr_ste_htbl_ctrl *ctrl = &htbl->ctrl;
 +      int threshold;
  
        if (dmn->info.max_log_sw_icm_sz <= htbl->chunk_size)
                return false;
  
 -      if (!ctrl->may_grow)
 +      if (!mlx5dr_ste_htbl_may_grow(htbl))
                return false;
  
        if (dr_get_bits_per_mask(htbl->byte_mask) * BITS_PER_BYTE <= htbl->chunk_size)
                return false;
  
 -      if (ctrl->num_of_collisions >= ctrl->increase_threshold &&
 -          (ctrl->num_of_valid_entries - ctrl->num_of_collisions) >= ctrl->increase_threshold)
 +      threshold = mlx5dr_ste_htbl_increase_threshold(htbl);
 +      if (ctrl->num_of_collisions >= threshold &&
 +          (ctrl->num_of_valid_entries - ctrl->num_of_collisions) >= threshold)
                return true;
  
        return false;
  }
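
For intuition, a hedged numeric example (the threshold value is illustrative; the real one comes from mlx5dr_ste_htbl_increase_threshold() and depends on chunk size): with a threshold of 64, the table is rehashed only once it holds at least 64 colliding entries and at least 64 non-colliding valid entries, so a small table with one hot collision chain does not trigger growth.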
  
 -static int dr_rule_add_member(struct mlx5dr_rule_rx_tx *nic_rule,
 -                            struct mlx5dr_ste *ste)
 -{
 -      struct mlx5dr_rule_member *rule_mem;
 -
 -      rule_mem = kvzalloc(sizeof(*rule_mem), GFP_KERNEL);
 -      if (!rule_mem)
 -              return -ENOMEM;
 -
 -      INIT_LIST_HEAD(&rule_mem->list);
 -      INIT_LIST_HEAD(&rule_mem->use_ste_list);
 -
 -      rule_mem->ste = ste;
 -      list_add_tail(&rule_mem->list, &nic_rule->rule_members_list);
 -
 -      list_add_tail(&rule_mem->use_ste_list, &ste->rule_list);
 -
 -      return 0;
 -}
 -
  static int dr_rule_handle_action_stes(struct mlx5dr_rule *rule,
                                      struct mlx5dr_rule_rx_tx *nic_rule,
                                      struct list_head *send_ste_list,
        struct mlx5dr_domain *dmn = matcher->tbl->dmn;
        u8 *curr_hw_ste, *prev_hw_ste;
        struct mlx5dr_ste *action_ste;
 -      int i, k, ret;
 +      int i, k;
  
        /* Two cases:
         * 1. num_of_builders is equal to new_hw_ste_arr_sz, the action fits
         *    in the existing ste.
         * 2. num_of_builders is less than new_hw_ste_arr_sz, a new ste was
         *    added to support the action.
         */
 -      if (num_of_builders == new_hw_ste_arr_sz)
 -              return 0;
  
        for (i = num_of_builders, k = 0; i < new_hw_ste_arr_sz; i++, k++) {
                curr_hw_ste = hw_ste_arr + i * DR_STE_SIZE;
  
                mlx5dr_ste_get(action_ste);
  
 +              action_ste->htbl->pointing_ste = last_ste;
 +              last_ste->next_htbl = action_ste->htbl;
 +              last_ste = action_ste;
 +
                /* While free ste we go over the miss list, so add this ste to the list */
                list_add_tail(&action_ste->miss_list_node,
                              mlx5dr_ste_get_miss_list(action_ste));
                mlx5dr_ste_set_hit_addr_by_next_htbl(dmn->ste_ctx,
                                                     prev_hw_ste,
                                                     action_ste->htbl);
 -              ret = dr_rule_add_member(nic_rule, action_ste);
 -              if (ret) {
 -                      mlx5dr_dbg(dmn, "Failed adding rule member\n");
 -                      goto free_ste_info;
 -              }
 +
 +              mlx5dr_rule_set_last_member(nic_rule, action_ste, true);
 +
                mlx5dr_send_fill_and_append_ste_send_info(action_ste, DR_STE_SIZE, 0,
                                                          curr_hw_ste,
                                                          ste_info_arr[k],
                                                          send_ste_list, false);
        }
  
 +      last_ste->next_htbl = NULL;
 +
        return 0;
  
 -free_ste_info:
 -      kfree(ste_info_arr[k]);
  err_exit:
        mlx5dr_ste_put(action_ste, matcher, nic_matcher);
        return -ENOMEM;
@@@ -862,9 -846,9 +862,9 @@@ again
                        new_htbl = dr_rule_rehash(rule, nic_rule, cur_htbl,
                                                  ste_location, send_ste_list);
                        if (!new_htbl) {
-                               mlx5dr_htbl_put(cur_htbl);
                                mlx5dr_err(dmn, "Failed creating rehash table, htbl-log_size: %d\n",
                                           cur_htbl->chunk_size);
+                               mlx5dr_htbl_put(cur_htbl);
                        } else {
                                cur_htbl = new_htbl;
                        }
@@@ -1031,12 -1015,12 +1031,12 @@@ static enum mlx5dr_ipv dr_rule_get_ipv(
  }
  
  static bool dr_rule_skip(enum mlx5dr_domain_type domain,
 -                       enum mlx5dr_ste_entry_type ste_type,
 +                       enum mlx5dr_domain_nic_type nic_type,
                         struct mlx5dr_match_param *mask,
                         struct mlx5dr_match_param *value,
                         u32 flow_source)
  {
 -      bool rx = ste_type == MLX5DR_STE_TYPE_RX;
 +      bool rx = nic_type == DR_DOMAIN_NIC_TYPE_RX;
  
        if (domain != MLX5DR_DOMAIN_TYPE_FDB)
                return false;
@@@ -1081,7 -1065,9 +1081,7 @@@ dr_rule_create_rule_nic(struct mlx5dr_r
        nic_matcher = nic_rule->nic_matcher;
        nic_dmn = nic_matcher->nic_tbl->nic_dmn;
  
 -      INIT_LIST_HEAD(&nic_rule->rule_members_list);
 -
 -      if (dr_rule_skip(dmn->type, nic_dmn->ste_type, &matcher->mask, param,
 +      if (dr_rule_skip(dmn->type, nic_dmn->type, &matcher->mask, param,
                         rule->flow_source))
                return 0;
  
  
                cur_htbl = ste->next_htbl;
  
 -              /* Keep all STEs in the rule struct */
 -              ret = dr_rule_add_member(nic_rule, ste);
 -              if (ret) {
 -                      mlx5dr_dbg(dmn, "Failed adding rule member index %d\n", i);
 -                      goto free_ste;
 -              }
 -
                mlx5dr_ste_get(ste);
 +              mlx5dr_rule_set_last_member(nic_rule, ste, true);
        }
  
        /* Connect actions */
  
        return 0;
  
 -free_ste:
 -      mlx5dr_ste_put(ste, matcher, nic_matcher);
  free_rule:
        dr_rule_clean_rule_members(rule, nic_rule);
        /* Clean all ste_info's */
@@@ -28,7 -28,6 +28,7 @@@
  #include <linux/marvell_phy.h>
  #include <linux/phy.h>
  #include <linux/sfp.h>
 +#include <linux/netdevice.h>
  
  #define MV_PHY_ALASKA_NBT_QUIRK_MASK  0xfffffffe
  #define MV_PHY_ALASKA_NBT_QUIRK_REV   (MARVELL_PHY_ID_88X3310 | 0xa)
@@@ -105,16 -104,6 +105,16 @@@ enum 
        MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER_NO_SGMII_AN       = 0x5,
        MV_V2_33X0_PORT_CTRL_MACTYPE_10GBASER_RATE_MATCH        = 0x6,
        MV_V2_33X0_PORT_CTRL_MACTYPE_USXGMII                    = 0x7,
 +      MV_V2_PORT_INTR_STS     = 0xf040,
 +      MV_V2_PORT_INTR_MASK    = 0xf043,
 +      MV_V2_PORT_INTR_STS_WOL_EN      = BIT(8),
 +      MV_V2_MAGIC_PKT_WORD0   = 0xf06b,
 +      MV_V2_MAGIC_PKT_WORD1   = 0xf06c,
 +      MV_V2_MAGIC_PKT_WORD2   = 0xf06d,
 +      /* Wake on LAN registers */
 +      MV_V2_WOL_CTRL          = 0xf06e,
 +      MV_V2_WOL_CTRL_CLEAR_STS        = BIT(15),
 +      MV_V2_WOL_CTRL_MAGIC_PKT_EN     = BIT(0),
        /* Temperature control/read registers (88X3310 only) */
        MV_V2_TEMP_CTRL         = 0xf08a,
        MV_V2_TEMP_CTRL_MASK    = 0xc000,
@@@ -998,11 -987,19 +998,19 @@@ static int mv3310_get_number_of_ports(s
  
  static int mv3310_match_phy_device(struct phy_device *phydev)
  {
+       if ((phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] &
+            MARVELL_PHY_ID_MASK) != MARVELL_PHY_ID_88X3310)
+               return 0;
+ 
        return mv3310_get_number_of_ports(phydev) == 1;
  }
  
  static int mv3340_match_phy_device(struct phy_device *phydev)
  {
+       if ((phydev->c45_ids.device_ids[MDIO_MMD_PMAPMD] &
+            MARVELL_PHY_ID_MASK) != MARVELL_PHY_ID_88X3310)
+               return 0;
+ 
        return mv3310_get_number_of_ports(phydev) == 4;
  }
  
@@@ -1031,80 -1028,6 +1039,80 @@@ static int mv2111_match_phy_device(stru
        return mv211x_match_phy_device(phydev, false);
  }
  
 +static void mv3110_get_wol(struct phy_device *phydev,
 +                         struct ethtool_wolinfo *wol)
 +{
 +      int ret;
 +
 +      wol->supported = WAKE_MAGIC;
 +      wol->wolopts = 0;
 +
 +      ret = phy_read_mmd(phydev, MDIO_MMD_VEND2, MV_V2_WOL_CTRL);
 +      if (ret < 0)
 +              return;
 +
 +      if (ret & MV_V2_WOL_CTRL_MAGIC_PKT_EN)
 +              wol->wolopts |= WAKE_MAGIC;
 +}
 +
 +static int mv3110_set_wol(struct phy_device *phydev,
 +                        struct ethtool_wolinfo *wol)
 +{
 +      int ret;
 +
 +      if (wol->wolopts & WAKE_MAGIC) {
 +              /* Enable the WOL interrupt */
 +              ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2,
 +                                     MV_V2_PORT_INTR_MASK,
 +                                     MV_V2_PORT_INTR_STS_WOL_EN);
 +              if (ret < 0)
 +                      return ret;
 +
 +              /* Store the device address for the magic packet */
 +              ret = phy_write_mmd(phydev, MDIO_MMD_VEND2,
 +                                  MV_V2_MAGIC_PKT_WORD2,
 +                                  ((phydev->attached_dev->dev_addr[5] << 8) |
 +                                  phydev->attached_dev->dev_addr[4]));
 +              if (ret < 0)
 +                      return ret;
 +
 +              ret = phy_write_mmd(phydev, MDIO_MMD_VEND2,
 +                                  MV_V2_MAGIC_PKT_WORD1,
 +                                  ((phydev->attached_dev->dev_addr[3] << 8) |
 +                                  phydev->attached_dev->dev_addr[2]));
 +              if (ret < 0)
 +                      return ret;
 +
 +              ret = phy_write_mmd(phydev, MDIO_MMD_VEND2,
 +                                  MV_V2_MAGIC_PKT_WORD0,
 +                                  ((phydev->attached_dev->dev_addr[1] << 8) |
 +                                  phydev->attached_dev->dev_addr[0]));
 +              if (ret < 0)
 +                      return ret;
 +
 +              /* Clear WOL status and enable magic packet matching */
 +              ret = phy_set_bits_mmd(phydev, MDIO_MMD_VEND2,
 +                                     MV_V2_WOL_CTRL,
 +                                     MV_V2_WOL_CTRL_MAGIC_PKT_EN |
 +                                     MV_V2_WOL_CTRL_CLEAR_STS);
 +              if (ret < 0)
 +                      return ret;
 +      } else {
 +              /* Disable magic packet matching & reset WOL status bit */
 +              ret = phy_modify_mmd(phydev, MDIO_MMD_VEND2,
 +                                   MV_V2_WOL_CTRL,
 +                                   MV_V2_WOL_CTRL_MAGIC_PKT_EN,
 +                                   MV_V2_WOL_CTRL_CLEAR_STS);
 +              if (ret < 0)
 +                      return ret;
 +      }
 +
 +      /* Reset the clear WOL status bit as it does not self-clear */
 +      return phy_clear_bits_mmd(phydev, MDIO_MMD_VEND2,
 +                                MV_V2_WOL_CTRL,
 +                                MV_V2_WOL_CTRL_CLEAR_STS);
 +}
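
The three magic-packet registers hold the MAC address as little-endian 16-bit words, as the writes above show. A worked example with a made-up address 00:11:22:33:44:55:

	/* dev_addr = {0x00, 0x11, 0x22, 0x33, 0x44, 0x55} (hypothetical) */
	MV_V2_MAGIC_PKT_WORD0 = (0x11 << 8) | 0x00 = 0x1100
	MV_V2_MAGIC_PKT_WORD1 = (0x33 << 8) | 0x22 = 0x3322
	MV_V2_MAGIC_PKT_WORD2 = (0x55 << 8) | 0x44 = 0x5544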
 +
  static struct phy_driver mv3310_drivers[] = {
        {
                .phy_id         = MARVELL_PHY_ID_88X3310,
                .set_tunable    = mv3310_set_tunable,
                .remove         = mv3310_remove,
                .set_loopback   = genphy_c45_loopback,
 +              .get_wol        = mv3110_get_wol,
 +              .set_wol        = mv3110_set_wol,
        },
        {
                .phy_id         = MARVELL_PHY_ID_88X3310,
                .set_tunable    = mv3310_set_tunable,
                .remove         = mv3310_remove,
                .set_loopback   = genphy_c45_loopback,
 +              .get_wol        = mv3110_get_wol,
 +              .set_wol        = mv3110_set_wol,
        },
        {
                .phy_id         = MARVELL_PHY_ID_88E2110,
@@@ -47,7 -47,6 +47,7 @@@
  #include <uapi/linux/if_bonding.h>
  #include <uapi/linux/pkt_cls.h>
  #include <linux/hashtable.h>
 +#include <linux/rbtree.h>
  
  struct netpoll_info;
  struct device;
@@@ -209,7 -208,6 +209,7 @@@ struct sk_buff
  
  struct netdev_hw_addr {
        struct list_head        list;
 +      struct rb_node          node;
        unsigned char           addr[MAX_ADDR_LEN];
        unsigned char           type;
  #define NETDEV_HW_ADDR_T_LAN          1
  struct netdev_hw_addr_list {
        struct list_head        list;
        int                     count;
 +
 +      /* Auxiliary tree for faster lookup on addition and deletion */
 +      struct rb_root          tree;
  };
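
With N addresses, a list-only implementation pays O(N) per lookup on every add or delete; the auxiliary rbtree brings that to O(log N), while the list is presumably kept for cheap in-order traversal by existing iterators.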
  
  #define netdev_hw_addr_list_count(l) ((l)->count)
@@@ -300,6 -295,18 +300,6 @@@ enum netdev_state_t 
  };
  
  
 -/*
 - * This structure holds boot-time configured netdevice settings. They
 - * are then used in the device probing.
 - */
 -struct netdev_boot_setup {
 -      char name[IFNAMSIZ];
 -      struct ifmap map;
 -};
 -#define NETDEV_BOOT_SETUP_MAX 8
 -
 -int __init netdev_boot_setup(char *str);
 -
  struct gro_list {
        struct list_head        list;
        int                     count;
@@@ -727,13 -734,13 +727,13 @@@ bool rps_may_expire_flow(struct net_dev
  
  /* This structure contains an instance of an RX queue. */
  struct netdev_rx_queue {
 +      struct xdp_rxq_info             xdp_rxq;
  #ifdef CONFIG_RPS
        struct rps_map __rcu            *rps_map;
        struct rps_dev_flow_table __rcu *rps_flow_table;
  #endif
        struct kobject                  kobj;
        struct net_device               *dev;
 -      struct xdp_rxq_info             xdp_rxq;
  #ifdef CONFIG_XDP_SOCKETS
        struct xsk_buff_pool            *pool;
  #endif
@@@ -1079,18 -1086,9 +1079,18 @@@ struct netdev_net_notifier 
   *    Test if Media Access Control address is valid for the device.
   *
   * int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 - *    Called when a user requests an ioctl which can't be handled by
 - *    the generic interface code. If not defined ioctls return
 - *    not supported error code.
 + *    Old-style ioctl entry point. This is used internally by the
 + *    appletalk and ieee802154 subsystems but is no longer called by
 + *    the device ioctl handler.
 + *
 + * int (*ndo_siocbond)(struct net_device *dev, struct ifreq *ifr, int cmd);
 + *    Used by the bonding driver for its device specific ioctls:
 + *    SIOCBONDENSLAVE, SIOCBONDRELEASE, SIOCBONDSETHWADDR, SIOCBONDCHANGEACTIVE,
 + *    SIOCBONDSLAVEINFOQUERY, and SIOCBONDINFOQUERY
 + *
 + * int (*ndo_eth_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
 + *    Called for ethernet specific ioctls: SIOCGMIIPHY, SIOCGMIIREG,
 + *    SIOCSMIIREG, SIOCSHWTSTAMP and SIOCGHWTSTAMP.
   *
   * int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
   *    Used to set network devices bus interface parameters. This interface
   *    that got dropped are freed/returned via xdp_return_frame().
  *    Returns a negative number on a general error invoking the ndo, meaning
  *    no frames were xmit'ed and the core caller will free all frames.
 + * struct net_device *(*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 + *                                            struct xdp_buff *xdp);
 + *      Get the xmit slave of the master device based on the xdp_buff.
   * int (*ndo_xsk_wakeup)(struct net_device *dev, u32 queue_id, u32 flags);
   *      This function is used to wake up the softirq, ksoftirqd or kthread
   *    responsible for sending and/or receiving packets on a specific
@@@ -1366,15 -1361,6 +1366,15 @@@ struct net_device_ops 
        int                     (*ndo_validate_addr)(struct net_device *dev);
        int                     (*ndo_do_ioctl)(struct net_device *dev,
                                                struct ifreq *ifr, int cmd);
 +      int                     (*ndo_eth_ioctl)(struct net_device *dev,
 +                                               struct ifreq *ifr, int cmd);
 +      int                     (*ndo_siocbond)(struct net_device *dev,
 +                                              struct ifreq *ifr, int cmd);
 +      int                     (*ndo_siocwandev)(struct net_device *dev,
 +                                                struct if_settings *ifs);
 +      int                     (*ndo_siocdevprivate)(struct net_device *dev,
 +                                                    struct ifreq *ifr,
 +                                                    void __user *data, int cmd);
        int                     (*ndo_set_config)(struct net_device *dev,
                                                  struct ifmap *map);
        int                     (*ndo_change_mtu)(struct net_device *dev,
        int                     (*ndo_xdp_xmit)(struct net_device *dev, int n,
                                                struct xdp_frame **xdp,
                                                u32 flags);
 +      struct net_device *     (*ndo_xdp_get_xmit_slave)(struct net_device *dev,
 +                                                        struct xdp_buff *xdp);
        int                     (*ndo_xsk_wakeup)(struct net_device *dev,
                                                  u32 queue_id, u32 flags);
        struct devlink_port *   (*ndo_get_devlink_port)(struct net_device *dev);
@@@ -1821,7 -1805,6 +1821,7 @@@ enum netdev_ml_priv_type 
   *    @ieee802154_ptr: IEEE 802.15.4 low-rate Wireless Personal Area Network
   *                     device struct
   *    @mpls_ptr:      mpls_dev struct pointer
 + *    @mctp_ptr:      MCTP specific data
   *
   *    @dev_addr:      Hw address (before bcast,
   *                    because most packets are unicast)
@@@ -2109,9 -2092,6 +2109,9 @@@ struct net_device 
  #if IS_ENABLED(CONFIG_MPLS_ROUTING)
        struct mpls_dev __rcu   *mpls_ptr;
  #endif
 +#if IS_ENABLED(CONFIG_MCTP)
 +      struct mctp_dev __rcu   *mctp_ptr;
 +#endif
  
  /*
   * Cache lines mostly used on receive path (including eth_type_trans())
@@@ -2937,6 -2917,7 +2937,6 @@@ static inline struct net_device *first_
  }
  
  int netdev_boot_setup_check(struct net_device *dev);
 -unsigned long netdev_boot_base(const char *prefix, int unit);
  struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
                                       const char *hwaddr);
  struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type);
@@@ -3308,6 -3289,14 +3308,6 @@@ static inline bool dev_has_header(cons
        return dev->header_ops && dev->header_ops->create;
  }
  
 -typedef int gifconf_func_t(struct net_device * dev, char __user * bufptr,
 -                         int len, int size);
 -int register_gifconf(unsigned int family, gifconf_func_t *gifconf);
 -static inline int unregister_gifconf(unsigned int family)
 -{
 -      return register_gifconf(family, NULL);
 -}
 -
  #ifdef CONFIG_NET_FLOW_LIMIT
  #define FLOW_LIMIT_HISTORY    (1 << 7)  /* must be ^2 and !overflow buckets */
  struct sd_flow_limit {
@@@ -3926,8 -3915,6 +3926,8 @@@ static inline int netif_set_real_num_rx
        return 0;
  }
  #endif
 +int netif_set_real_num_queues(struct net_device *dev,
 +                            unsigned int txq, unsigned int rxq);
  
  static inline struct netdev_rx_queue *
  __netif_get_rx_queue(struct net_device *dev, unsigned int rxq)
@@@ -3961,7 -3948,7 +3961,7 @@@ void __dev_kfree_skb_any(struct sk_buf
  /*
   * It is not allowed to call kfree_skb() or consume_skb() from hardware
   * interrupt context or with hardware interrupts being disabled.
 - * (in_irq() || irqs_disabled())
 + * (in_hardirq() || irqs_disabled())
   *
   * We provide four helpers that can be used in following contexts :
   *
@@@ -3997,8 -3984,6 +3997,8 @@@ static inline void dev_consume_skb_any(
        __dev_kfree_skb_any(skb, SKB_REASON_CONSUMED);
  }
  
 +u32 bpf_prog_run_generic_xdp(struct sk_buff *skb, struct xdp_buff *xdp,
 +                           struct bpf_prog *xdp_prog);
  void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog);
  int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb);
  int netif_rx(struct sk_buff *skb);
@@@ -4027,12 -4012,14 +4027,16 @@@ int netdev_rx_handler_register(struct n
  void netdev_rx_handler_unregister(struct net_device *dev);
  
  bool dev_valid_name(const char *name);
+ static inline bool is_socket_ioctl_cmd(unsigned int cmd)
+ {
+       return _IOC_TYPE(cmd) == SOCK_IOC_TYPE;
+ }
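
Worked example: SOCK_IOC_TYPE is 0x89 and the classic interface ioctls live in that space, so _IOC_TYPE(SIOCGIFADDR) = (0x8915 >> 8) & 0xff = 0x89 passes the check, while a TTY ioctl such as TCGETS (0x5401 on x86) yields 0x54 and is rejected.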
 +int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg);
 +int put_user_ifreq(struct ifreq *ifr, void __user *arg);
  int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr,
 -              bool *need_copyout);
 -int dev_ifconf(struct net *net, struct ifconf *, int);
 -int dev_ethtool(struct net *net, struct ifreq *);
 +              void __user *data, bool *need_copyout);
 +int dev_ifconf(struct net *net, struct ifconf __user *ifc);
 +int dev_ethtool(struct net *net, struct ifreq *ifr, void __user *userdata);
  unsigned int dev_get_flags(const struct net_device *);
  int __dev_change_flags(struct net_device *dev, unsigned int flags,
                       struct netlink_ext_ack *extack);
@@@ -4086,7 -4073,6 +4090,7 @@@ typedef int (*bpf_op_t)(struct net_devi
  int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
                      int fd, int expected_fd, u32 flags);
  int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
 +u8 dev_xdp_prog_count(struct net_device *dev);
  u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode);
  
  int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
@@@ -4154,13 -4140,11 +4158,13 @@@ void netdev_run_todo(void)
   */
  static inline void dev_put(struct net_device *dev)
  {
 +      if (dev) {
  #ifdef CONFIG_PCPU_DEV_REFCNT
 -      this_cpu_dec(*dev->pcpu_refcnt);
 +              this_cpu_dec(*dev->pcpu_refcnt);
  #else
 -      refcount_dec(&dev->dev_refcnt);
 +              refcount_dec(&dev->dev_refcnt);
  #endif
 +      }
  }
  
  /**
   */
  static inline void dev_hold(struct net_device *dev)
  {
 +      if (dev) {
  #ifdef CONFIG_PCPU_DEV_REFCNT
 -      this_cpu_inc(*dev->pcpu_refcnt);
 +              this_cpu_inc(*dev->pcpu_refcnt);
  #else
 -      refcount_inc(&dev->dev_refcnt);
 +              refcount_inc(&dev->dev_refcnt);
  #endif
 +      }
  }
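
With the NULL check folded into the helpers, call sites no longer need their own guard; a before/after sketch:

	/* before: callers had to guard against a NULL netdev */
	if (dev)
		dev_hold(dev);

	/* after: safe unconditionally, a no-op when dev == NULL */
	dev_hold(dev);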
  
  /* Carrier loss detection, dial on demand. The functions netif_carrier_on
diff --combined net/socket.c
@@@ -212,7 -212,6 +212,7 @@@ static const char * const pf_family_nam
        [PF_QIPCRTR]    = "PF_QIPCRTR",
        [PF_SMC]        = "PF_SMC",
        [PF_XDP]        = "PF_XDP",
 +      [PF_MCTP]       = "PF_MCTP",
  };
  
  /*
@@@ -1065,13 -1064,9 +1065,13 @@@ static ssize_t sock_write_iter(struct k
   */
  
  static DEFINE_MUTEX(br_ioctl_mutex);
 -static int (*br_ioctl_hook) (struct net *, unsigned int cmd, void __user *arg);
 +static int (*br_ioctl_hook)(struct net *net, struct net_bridge *br,
 +                          unsigned int cmd, struct ifreq *ifr,
 +                          void __user *uarg);
  
 -void brioctl_set(int (*hook) (struct net *, unsigned int, void __user *))
 +void brioctl_set(int (*hook)(struct net *net, struct net_bridge *br,
 +                           unsigned int cmd, struct ifreq *ifr,
 +                           void __user *uarg))
  {
        mutex_lock(&br_ioctl_mutex);
        br_ioctl_hook = hook;
  }
  EXPORT_SYMBOL(brioctl_set);
  
 +int br_ioctl_call(struct net *net, struct net_bridge *br, unsigned int cmd,
 +                struct ifreq *ifr, void __user *uarg)
 +{
 +      int err = -ENOPKG;
 +
 +      if (!br_ioctl_hook)
 +              request_module("bridge");
 +
 +      mutex_lock(&br_ioctl_mutex);
 +      if (br_ioctl_hook)
 +              err = br_ioctl_hook(net, br, cmd, ifr, uarg);
 +      mutex_unlock(&br_ioctl_mutex);
 +
 +      return err;
 +}
 +
  static DEFINE_MUTEX(vlan_ioctl_mutex);
  static int (*vlan_ioctl_hook) (struct net *, void __user *arg);
  
@@@ -1109,11 -1088,8 +1109,11 @@@ EXPORT_SYMBOL(vlan_ioctl_set)
  static long sock_do_ioctl(struct net *net, struct socket *sock,
                          unsigned int cmd, unsigned long arg)
  {
 +      struct ifreq ifr;
 +      bool need_copyout;
        int err;
        void __user *argp = (void __user *)arg;
 +      void __user *data;
  
        err = sock->ops->ioctl(sock, cmd, arg);
  
        if (err != -ENOIOCTLCMD)
                return err;
  
 -      if (cmd == SIOCGIFCONF) {
 -              struct ifconf ifc;
 -              if (copy_from_user(&ifc, argp, sizeof(struct ifconf)))
 -                      return -EFAULT;
 -              rtnl_lock();
 -              err = dev_ifconf(net, &ifc, sizeof(struct ifreq));
 -              rtnl_unlock();
 -              if (!err && copy_to_user(argp, &ifc, sizeof(struct ifconf)))
 -                      err = -EFAULT;
 -      } else if (is_socket_ioctl_cmd(cmd)) {
 -              struct ifreq ifr;
 -              bool need_copyout;
 -              if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
++      if (!is_socket_ioctl_cmd(cmd))
++              return -ENOTTY;
++
 +      if (get_user_ifreq(&ifr, &data, argp))
 +              return -EFAULT;
 +      err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
 +      if (!err && need_copyout)
 +              if (put_user_ifreq(&ifr, argp))
                        return -EFAULT;
 -              err = dev_ioctl(net, cmd, &ifr, &need_copyout);
 -              if (!err && need_copyout)
 -                      if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
 -                              return -EFAULT;
 -      } else {
 -              err = -ENOTTY;
 -      }
 +
        return err;
  }
  
@@@ -1152,13 -1142,12 +1155,13 @@@ static long sock_ioctl(struct file *fil
        net = sock_net(sk);
        if (unlikely(cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))) {
                struct ifreq ifr;
 +              void __user *data;
                bool need_copyout;
 -              if (copy_from_user(&ifr, argp, sizeof(struct ifreq)))
 +              if (get_user_ifreq(&ifr, &data, argp))
                        return -EFAULT;
 -              err = dev_ioctl(net, cmd, &ifr, &need_copyout);
 +              err = dev_ioctl(net, cmd, &ifr, data, &need_copyout);
                if (!err && need_copyout)
 -                      if (copy_to_user(argp, &ifr, sizeof(struct ifreq)))
 +                      if (put_user_ifreq(&ifr, argp))
                                return -EFAULT;
        } else
  #ifdef CONFIG_WEXT_CORE
                case SIOCSIFBR:
                case SIOCBRADDBR:
                case SIOCBRDELBR:
 -                      err = -ENOPKG;
 -                      if (!br_ioctl_hook)
 -                              request_module("bridge");
 -
 -                      mutex_lock(&br_ioctl_mutex);
 -                      if (br_ioctl_hook)
 -                              err = br_ioctl_hook(net, cmd, argp);
 -                      mutex_unlock(&br_ioctl_mutex);
 +                      err = br_ioctl_call(net, NULL, cmd, NULL, argp);
                        break;
                case SIOCGIFVLAN:
                case SIOCSIFVLAN:
                                                   cmd == SIOCGSTAMP_NEW,
                                                   false);
                        break;
 +
 +              case SIOCGIFCONF:
 +                      err = dev_ifconf(net, argp);
 +                      break;
 +
                default:
                        err = sock_do_ioctl(net, sock, cmd, arg);
                        break;
@@@ -3137,55 -3128,154 +3140,55 @@@ void socket_seq_show(struct seq_file *s
  }
  #endif                                /* CONFIG_PROC_FS */
  
 -#ifdef CONFIG_COMPAT
 -static int compat_dev_ifconf(struct net *net, struct compat_ifconf __user *uifc32)
 +/* Handle the fact that while struct ifreq has the same *layout* on
 + * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
 + * which are handled elsewhere, it still has different *size* due to
 + * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
 + * resulting in struct ifreq being 32 and 40 bytes respectively).
 + * As a result, if the struct happens to be at the end of a page and
 + * the next page isn't readable/writable, we get a fault. To prevent
 + * that, copy back and forth to the full size.
 + */
 +int get_user_ifreq(struct ifreq *ifr, void __user **ifrdata, void __user *arg)
  {
 -      struct compat_ifconf ifc32;
 -      struct ifconf ifc;
 -      int err;
 +      if (in_compat_syscall()) {
 +              struct compat_ifreq *ifr32 = (struct compat_ifreq *)ifr;
  
 -      if (copy_from_user(&ifc32, uifc32, sizeof(struct compat_ifconf)))
 -              return -EFAULT;
 +              memset(ifr, 0, sizeof(*ifr));
 +              if (copy_from_user(ifr32, arg, sizeof(*ifr32)))
 +                      return -EFAULT;
  
 -      ifc.ifc_len = ifc32.ifc_len;
 -      ifc.ifc_req = compat_ptr(ifc32.ifcbuf);
 +              if (ifrdata)
 +                      *ifrdata = compat_ptr(ifr32->ifr_data);
  
 -      rtnl_lock();
 -      err = dev_ifconf(net, &ifc, sizeof(struct compat_ifreq));
 -      rtnl_unlock();
 -      if (err)
 -              return err;
 +              return 0;
 +      }
  
 -      ifc32.ifc_len = ifc.ifc_len;
 -      if (copy_to_user(uifc32, &ifc32, sizeof(struct compat_ifconf)))
 +      if (copy_from_user(ifr, arg, sizeof(*ifr)))
                return -EFAULT;
  
 +      if (ifrdata)
 +              *ifrdata = ifr->ifr_data;
 +
        return 0;
  }
 +EXPORT_SYMBOL(get_user_ifreq);
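
The size mismatch described in the comment above follows from struct ifmap: its six fields (two unsigned long, one unsigned short, three unsigned char) pack into 16 bytes on a 32-bit ABI but 24 bytes with 64-bit longs. Since struct ifreq is IFNAMSIZ (16) bytes of name plus the largest union member, sizeof(struct ifreq) works out to 16 + 16 = 32 on 32-bit and 16 + 24 = 40 on 64-bit, matching the figures in the comment.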
  
 -static int ethtool_ioctl(struct net *net, struct compat_ifreq __user *ifr32)
 +int put_user_ifreq(struct ifreq *ifr, void __user *arg)
  {
 -      struct compat_ethtool_rxnfc __user *compat_rxnfc;
 -      bool convert_in = false, convert_out = false;
 -      size_t buf_size = 0;
 -      struct ethtool_rxnfc __user *rxnfc = NULL;
 -      struct ifreq ifr;
 -      u32 rule_cnt = 0, actual_rule_cnt;
 -      u32 ethcmd;
 -      u32 data;
 -      int ret;
 +      size_t size = sizeof(*ifr);
  
 -      if (get_user(data, &ifr32->ifr_ifru.ifru_data))
 -              return -EFAULT;
 -
 -      compat_rxnfc = compat_ptr(data);
 +      if (in_compat_syscall())
 +              size = sizeof(struct compat_ifreq);
  
 -      if (get_user(ethcmd, &compat_rxnfc->cmd))
 +      if (copy_to_user(arg, ifr, size))
                return -EFAULT;
  
 -      /* Most ethtool structures are defined without padding.
 -       * Unfortunately struct ethtool_rxnfc is an exception.
 -       */
 -      switch (ethcmd) {
 -      default:
 -              break;
 -      case ETHTOOL_GRXCLSRLALL:
 -              /* Buffer size is variable */
 -              if (get_user(rule_cnt, &compat_rxnfc->rule_cnt))
 -                      return -EFAULT;
 -              if (rule_cnt > KMALLOC_MAX_SIZE / sizeof(u32))
 -                      return -ENOMEM;
 -              buf_size += rule_cnt * sizeof(u32);
 -              fallthrough;
 -      case ETHTOOL_GRXRINGS:
 -      case ETHTOOL_GRXCLSRLCNT:
 -      case ETHTOOL_GRXCLSRULE:
 -      case ETHTOOL_SRXCLSRLINS:
 -              convert_out = true;
 -              fallthrough;
 -      case ETHTOOL_SRXCLSRLDEL:
 -              buf_size += sizeof(struct ethtool_rxnfc);
 -              convert_in = true;
 -              rxnfc = compat_alloc_user_space(buf_size);
 -              break;
 -      }
 -
 -      if (copy_from_user(&ifr.ifr_name, &ifr32->ifr_name, IFNAMSIZ))
 -              return -EFAULT;
 -
 -      ifr.ifr_data = convert_in ? rxnfc : (void __user *)compat_rxnfc;
 -
 -      if (convert_in) {
 -              /* We expect there to be holes between fs.m_ext and
 -               * fs.ring_cookie and at the end of fs, but nowhere else.
 -               */
 -              BUILD_BUG_ON(offsetof(struct compat_ethtool_rxnfc, fs.m_ext) +
 -                           sizeof(compat_rxnfc->fs.m_ext) !=
 -                           offsetof(struct ethtool_rxnfc, fs.m_ext) +
 -                           sizeof(rxnfc->fs.m_ext));
 -              BUILD_BUG_ON(
 -                      offsetof(struct compat_ethtool_rxnfc, fs.location) -
 -                      offsetof(struct compat_ethtool_rxnfc, fs.ring_cookie) !=
 -                      offsetof(struct ethtool_rxnfc, fs.location) -
 -                      offsetof(struct ethtool_rxnfc, fs.ring_cookie));
 -
 -              if (copy_in_user(rxnfc, compat_rxnfc,
 -                               (void __user *)(&rxnfc->fs.m_ext + 1) -
 -                               (void __user *)rxnfc) ||
 -                  copy_in_user(&rxnfc->fs.ring_cookie,
 -                               &compat_rxnfc->fs.ring_cookie,
 -                               (void __user *)(&rxnfc->fs.location + 1) -
 -                               (void __user *)&rxnfc->fs.ring_cookie))
 -                      return -EFAULT;
 -              if (ethcmd == ETHTOOL_GRXCLSRLALL) {
 -                      if (put_user(rule_cnt, &rxnfc->rule_cnt))
 -                              return -EFAULT;
 -              } else if (copy_in_user(&rxnfc->rule_cnt,
 -                                      &compat_rxnfc->rule_cnt,
 -                                      sizeof(rxnfc->rule_cnt)))
 -                      return -EFAULT;
 -      }
 -
 -      ret = dev_ioctl(net, SIOCETHTOOL, &ifr, NULL);
 -      if (ret)
 -              return ret;
 -
 -      if (convert_out) {
 -              if (copy_in_user(compat_rxnfc, rxnfc,
 -                               (const void __user *)(&rxnfc->fs.m_ext + 1) -
 -                               (const void __user *)rxnfc) ||
 -                  copy_in_user(&compat_rxnfc->fs.ring_cookie,
 -                               &rxnfc->fs.ring_cookie,
 -                               (const void __user *)(&rxnfc->fs.location + 1) -
 -                               (const void __user *)&rxnfc->fs.ring_cookie) ||
 -                  copy_in_user(&compat_rxnfc->rule_cnt, &rxnfc->rule_cnt,
 -                               sizeof(rxnfc->rule_cnt)))
 -                      return -EFAULT;
 -
 -              if (ethcmd == ETHTOOL_GRXCLSRLALL) {
 -                      /* As an optimisation, we only copy the actual
 -                       * number of rules that the underlying
 -                       * function returned.  Since Mallory might
 -                       * change the rule count in user memory, we
 -                       * check that it is less than the rule count
 -                       * originally given (as the user buffer size),
 -                       * which has been range-checked.
 -                       */
 -                      if (get_user(actual_rule_cnt, &rxnfc->rule_cnt))
 -                              return -EFAULT;
 -                      if (actual_rule_cnt < rule_cnt)
 -                              rule_cnt = actual_rule_cnt;
 -                      if (copy_in_user(&compat_rxnfc->rule_locs[0],
 -                                       &rxnfc->rule_locs[0],
 -                                       rule_cnt * sizeof(u32)))
 -                              return -EFAULT;
 -              }
 -      }
 -
        return 0;
  }
 +EXPORT_SYMBOL(put_user_ifreq);
  
 +#ifdef CONFIG_COMPAT
  static int compat_siocwandev(struct net *net, struct compat_ifreq __user *uifr32)
  {
        compat_uptr_t uptr32;
        void __user *saved;
        int err;
  
 -      if (copy_from_user(&ifr, uifr32, sizeof(struct compat_ifreq)))
 +      if (get_user_ifreq(&ifr, NULL, uifr32))
                return -EFAULT;
  
        if (get_user(uptr32, &uifr32->ifr_settings.ifs_ifsu))
        saved = ifr.ifr_settings.ifs_ifsu.raw_hdlc;
        ifr.ifr_settings.ifs_ifsu.raw_hdlc = compat_ptr(uptr32);
  
 -      err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL);
 +      err = dev_ioctl(net, SIOCWANDEV, &ifr, NULL, NULL);
        if (!err) {
                ifr.ifr_settings.ifs_ifsu.raw_hdlc = saved;
 -              if (copy_to_user(uifr32, &ifr, sizeof(struct compat_ifreq)))
 +              if (put_user_ifreq(&ifr, uifr32))
                        err = -EFAULT;
        }
        return err;
@@@ -3216,13 -3306,99 +3219,15 @@@ static int compat_ifr_data_ioctl(struc
                                 struct compat_ifreq __user *u_ifreq32)
  {
        struct ifreq ifreq;
 -      u32 data32;
 +      void __user *data;
  
 -      if (copy_from_user(ifreq.ifr_name, u_ifreq32->ifr_name, IFNAMSIZ))
 -              return -EFAULT;
 -      if (get_user(data32, &u_ifreq32->ifr_data))
 -              return -EFAULT;
 -      ifreq.ifr_data = compat_ptr(data32);
 -
 -      return dev_ioctl(net, cmd, &ifreq, NULL);
 -}
 -
 -static int compat_ifreq_ioctl(struct net *net, struct socket *sock,
 -                            unsigned int cmd,
 -                            struct compat_ifreq __user *uifr32)
 -{
 -      struct ifreq __user *uifr;
 -      int err;
 -
 -      /* Handle the fact that while struct ifreq has the same *layout* on
 -       * 32/64 for everything but ifreq::ifru_ifmap and ifreq::ifru_data,
 -       * which are handled elsewhere, it still has different *size* due to
 -       * ifreq::ifru_ifmap (which is 16 bytes on 32 bit, 24 bytes on 64-bit,
 -       * resulting in struct ifreq being 32 and 40 bytes respectively).
 -       * As a result, if the struct happens to be at the end of a page and
 -       * the next page isn't readable/writable, we get a fault. To prevent
 -       * that, copy back and forth to the full size.
 -       */
 -
 -      uifr = compat_alloc_user_space(sizeof(*uifr));
 -      if (copy_in_user(uifr, uifr32, sizeof(*uifr32)))
 -              return -EFAULT;
 -
 -      err = sock_do_ioctl(net, sock, cmd, (unsigned long)uifr);
 -
 -      if (!err) {
 -              switch (cmd) {
 -              case SIOCGIFFLAGS:
 -              case SIOCGIFMETRIC:
 -              case SIOCGIFMTU:
 -              case SIOCGIFMEM:
 -              case SIOCGIFHWADDR:
 -              case SIOCGIFINDEX:
 -              case SIOCGIFADDR:
 -              case SIOCGIFBRDADDR:
 -              case SIOCGIFDSTADDR:
 -              case SIOCGIFNETMASK:
 -              case SIOCGIFPFLAGS:
 -              case SIOCGIFTXQLEN:
 -              case SIOCGMIIPHY:
 -              case SIOCGMIIREG:
 -              case SIOCGIFNAME:
 -                      if (copy_in_user(uifr32, uifr, sizeof(*uifr32)))
 -                              err = -EFAULT;
 -                      break;
 -              }
 -      }
 -      return err;
 -}
 -
 -static int compat_sioc_ifmap(struct net *net, unsigned int cmd,
 -                      struct compat_ifreq __user *uifr32)
 -{
 -      struct ifreq ifr;
 -      struct compat_ifmap __user *uifmap32;
 -      int err;
 -
 -      uifmap32 = &uifr32->ifr_ifru.ifru_map;
 -      err = copy_from_user(&ifr, uifr32, sizeof(ifr.ifr_name));
 -      err |= get_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
 -      err |= get_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
 -      err |= get_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
 -      err |= get_user(ifr.ifr_map.irq, &uifmap32->irq);
 -      err |= get_user(ifr.ifr_map.dma, &uifmap32->dma);
 -      err |= get_user(ifr.ifr_map.port, &uifmap32->port);
 -      if (err)
+       if (!is_socket_ioctl_cmd(cmd))
+               return -ENOTTY;
 +      if (get_user_ifreq(&ifreq, &data, u_ifreq32))
                return -EFAULT;
 +      ifreq.ifr_data = data;
  
 -      err = dev_ioctl(net, cmd, &ifr, NULL);
 -
 -      if (cmd == SIOCGIFMAP && !err) {
 -              err = copy_to_user(uifr32, &ifr, sizeof(ifr.ifr_name));
 -              err |= put_user(ifr.ifr_map.mem_start, &uifmap32->mem_start);
 -              err |= put_user(ifr.ifr_map.mem_end, &uifmap32->mem_end);
 -              err |= put_user(ifr.ifr_map.base_addr, &uifmap32->base_addr);
 -              err |= put_user(ifr.ifr_map.irq, &uifmap32->irq);
 -              err |= put_user(ifr.ifr_map.dma, &uifmap32->dma);
 -              err |= put_user(ifr.ifr_map.port, &uifmap32->port);
 -              if (err)
 -                      err = -EFAULT;
 -      }
 -      return err;
 +      return dev_ioctl(net, cmd, &ifreq, data, NULL);
  }
  
  /* Since old style bridge ioctls end up using SIOCDEVPRIVATE
@@@ -3248,14 -3424,21 +3253,14 @@@ static int compat_sock_ioctl_trans(stru
        struct net *net = sock_net(sk);
  
        if (cmd >= SIOCDEVPRIVATE && cmd <= (SIOCDEVPRIVATE + 15))
 -              return compat_ifr_data_ioctl(net, cmd, argp);
 +              return sock_ioctl(file, cmd, (unsigned long)argp);
  
        switch (cmd) {
        case SIOCSIFBR:
        case SIOCGIFBR:
                return old_bridge_ioctl(argp);
 -      case SIOCGIFCONF:
 -              return compat_dev_ifconf(net, argp);
 -      case SIOCETHTOOL:
 -              return ethtool_ioctl(net, argp);
        case SIOCWANDEV:
                return compat_siocwandev(net, argp);
 -      case SIOCGIFMAP:
 -      case SIOCSIFMAP:
 -              return compat_sioc_ifmap(net, cmd, argp);
        case SIOCGSTAMP_OLD:
        case SIOCGSTAMPNS_OLD:
                if (!sock->ops->gettstamp)
                return sock->ops->gettstamp(sock, argp, cmd == SIOCGSTAMP_OLD,
                                            !COMPAT_USE_64BIT_TIME);
  
 +      case SIOCETHTOOL:
        case SIOCBONDSLAVEINFOQUERY:
        case SIOCBONDINFOQUERY:
        case SIOCSHWTSTAMP:
        case SIOCGSKNS:
        case SIOCGSTAMP_NEW:
        case SIOCGSTAMPNS_NEW:
 +      case SIOCGIFCONF:
                return sock_ioctl(file, cmd, arg);
  
        case SIOCGIFFLAGS:
        case SIOCSIFFLAGS:
 +      case SIOCGIFMAP:
 +      case SIOCSIFMAP:
        case SIOCGIFMETRIC:
        case SIOCSIFMETRIC:
        case SIOCGIFMTU:
        case SIOCBONDRELEASE:
        case SIOCBONDSETHWADDR:
        case SIOCBONDCHANGEACTIVE:
 -              return compat_ifreq_ioctl(net, sock, cmd, argp);
 -
        case SIOCSARP:
        case SIOCGARP:
        case SIOCDARP: