Merge tag 'v5.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux...
author Saeed Mahameed <saeedm@mellanox.com>
Mon, 22 Apr 2019 22:25:39 +0000 (15:25 -0700)
committer Saeed Mahameed <saeedm@mellanox.com>
Mon, 22 Apr 2019 22:25:39 +0000 (15:25 -0700)
Linux 5.1-rc1

We forgot to reset the branch during the last merge window, so mlx5-next is
outdated and still based on 5.0-rc2. This merge commit is needed to sync the
mlx5-next branch with 5.1-rc1.

Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
13 files changed:
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/qp.c
drivers/net/ethernet/mellanox/mlx5/core/cmd.c
drivers/net/ethernet/mellanox/mlx5/core/en.h
drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c
drivers/net/ethernet/mellanox/mlx5/core/fs_core.c
drivers/net/ethernet/mellanox/mlx5/core/health.c
drivers/net/ethernet/mellanox/mlx5/core/main.c
drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h
drivers/net/ethernet/mellanox/mlx5/core/uar.c
include/linux/mlx5/driver.h
include/linux/mlx5/mlx5_ifc.h

@@@ -535,24 -535,51 +535,51 @@@ out
        return err;
  }
  
+ struct mlx5_ib_vlan_info {
+       u16 vlan_id;
+       bool vlan;
+ };
+ static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
+ {
+       struct mlx5_ib_vlan_info *vlan_info = data;
+       if (is_vlan_dev(lower_dev)) {
+               vlan_info->vlan = true;
+               vlan_info->vlan_id = vlan_dev_vlan_id(lower_dev);
+       }
+       /* We are interested only in the first-level vlan device, so
+        * always return 1 to stop iterating over next-level devices.
+        */
+       return 1;
+ }
  static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
                         unsigned int index, const union ib_gid *gid,
                         const struct ib_gid_attr *attr)
  {
        enum ib_gid_type gid_type = IB_GID_TYPE_IB;
+       struct mlx5_ib_vlan_info vlan_info = { };
        u8 roce_version = 0;
        u8 roce_l3_type = 0;
-       bool vlan = false;
        u8 mac[ETH_ALEN];
-       u16 vlan_id = 0;
  
        if (gid) {
                gid_type = attr->gid_type;
                ether_addr_copy(mac, attr->ndev->dev_addr);
  
                if (is_vlan_dev(attr->ndev)) {
-                       vlan = true;
-                       vlan_id = vlan_dev_vlan_id(attr->ndev);
+                       vlan_info.vlan = true;
+                       vlan_info.vlan_id = vlan_dev_vlan_id(attr->ndev);
+               } else {
+                       /* If the netdev is an upper device and its lower
+                        * device is a vlan device, consider the vlan id of
+                        * the lower vlan device for this gid entry.
+                        */
+                       rcu_read_lock();
+                       netdev_walk_all_lower_dev_rcu(attr->ndev,
+                                       get_lower_dev_vlan, &vlan_info);
+                       rcu_read_unlock();
                }
        }
  
        }
  
        return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
-                                     roce_l3_type, gid->raw, mac, vlan,
-                                     vlan_id, port_num);
+                                     roce_l3_type, gid->raw, mac,
+                                     vlan_info.vlan, vlan_info.vlan_id,
+                                     port_num);
  }
  
  static int mlx5_ib_add_gid(const struct ib_gid_attr *attr,
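For illustration, the new get_lower_dev_vlan() callback lets set_roce_addr() pick up the VLAN ID when the GID's netdev is an upper device (for example a bond) stacked on a VLAN device: the walk visits the lower devices, and the callback records the first VLAN it sees and returns 1 to stop. A minimal user-space sketch of that callback-iterator pattern, assuming hypothetical stand-ins (walk_lower_devs(), struct fake_dev) for netdev_walk_all_lower_dev_rcu() and struct net_device:

#include <stdbool.h>
#include <stdio.h>

struct fake_dev {                  /* hypothetical stand-in for struct net_device */
        bool is_vlan;
        unsigned short vlan_id;
};

struct vlan_info {
        unsigned short vlan_id;
        bool vlan;
};

/* Record the VLAN ID of the first-level lower device; returning
 * non-zero stops the walk, as in get_lower_dev_vlan() above. */
static int get_lower_vlan(struct fake_dev *lower, void *data)
{
        struct vlan_info *vi = data;

        if (lower->is_vlan) {
                vi->vlan = true;
                vi->vlan_id = lower->vlan_id;
        }
        return 1;
}

/* Hypothetical stand-in for netdev_walk_all_lower_dev_rcu(). */
static void walk_lower_devs(struct fake_dev *devs, int n,
                            int (*fn)(struct fake_dev *, void *), void *data)
{
        for (int i = 0; i < n; i++)
                if (fn(&devs[i], data))
                        break;
}

int main(void)
{
        struct fake_dev lowers[] = { { .is_vlan = true, .vlan_id = 100 } };
        struct vlan_info vi = { 0 };

        walk_lower_devs(lowers, 1, get_lower_vlan, &vi);
        printf("vlan=%d id=%d\n", vi.vlan, vi.vlan_id);
        return 0;
}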
@@@ -982,11 -1010,11 +1010,11 @@@ static int mlx5_ib_query_device(struct 
        props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz);
        props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL;
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       if (MLX5_CAP_GEN(mdev, pg))
-               props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
-       props->odp_caps = dev->odp_caps;
- #endif
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+               if (MLX5_CAP_GEN(mdev, pg))
+                       props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
+               props->odp_caps = dev->odp_caps;
+       }
  
        if (MLX5_CAP_GEN(mdev, cd))
                props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL;
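The hunk above is one of several in this merge that replace #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING blocks with IS_ENABLED() checks, so the ODP code is always parsed and type-checked and is simply dead-code-eliminated when the option is off. A rough sketch of the idea, using a simplified stand-in for the kernel's IS_ENABLED() macro (the real macro also copes with =m and undefined symbols):

#include <stdio.h>

/* Simplified stand-in: Kconfig defines enabled CONFIG_* options to 1. */
#define CONFIG_FEATURE_X 1
#define IS_ENABLED(option) (option)

static unsigned int feature_x_caps(void)
{
        return 0x7f;    /* placeholder capability bits */
}

int main(void)
{
        unsigned int caps = 0;

        /* Unlike an #ifdef'ed-out block, this branch is always compiled
         * and type-checked; it is removed as dead code when disabled. */
        if (IS_ENABLED(CONFIG_FEATURE_X))
                caps |= feature_x_caps();

        printf("caps=0x%x\n", caps);
        return 0;
}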
@@@ -1717,14 -1745,15 +1745,15 @@@ static void mlx5_ib_dealloc_transport_d
        mlx5_ib_disable_lb(dev, true, false);
  }
  
- static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
-                                                 struct ib_udata *udata)
+ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
+                                 struct ib_udata *udata)
  {
+       struct ib_device *ibdev = uctx->device;
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_alloc_ucontext_req_v2 req = {};
        struct mlx5_ib_alloc_ucontext_resp resp = {};
        struct mlx5_core_dev *mdev = dev->mdev;
-       struct mlx5_ib_ucontext *context;
+       struct mlx5_ib_ucontext *context = to_mucontext(uctx);
        struct mlx5_bfreg_info *bfregi;
        int ver;
        int err;
        bool lib_uar_4k;
  
        if (!dev->ib_active)
-               return ERR_PTR(-EAGAIN);
+               return -EAGAIN;
  
        if (udata->inlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
                ver = 0;
        else if (udata->inlen >= min_req_v2)
                ver = 2;
        else
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
        err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
        if (err)
-               return ERR_PTR(err);
+               return err;
  
        if (req.flags & ~MLX5_IB_ALLOC_UCTX_DEVX)
-               return ERR_PTR(-EOPNOTSUPP);
+               return -EOPNOTSUPP;
  
        if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2)
-               return ERR_PTR(-EOPNOTSUPP);
+               return -EOPNOTSUPP;
  
        req.total_num_bfregs = ALIGN(req.total_num_bfregs,
                                    MLX5_NON_FP_BFREGS_PER_UAR);
        if (req.num_low_latency_bfregs > req.total_num_bfregs - 1)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
  
        resp.qp_tab_size = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp);
        if (mlx5_core_is_pf(dev->mdev) && MLX5_CAP_GEN(dev->mdev, bf))
                /* MLX5_USER_ALLOC_UCONTEXT_FLOW_ACTION_FLAGS_ESP_AES_GCM_FULL_OFFLOAD is currently always 0 */
        }
  
-       context = kzalloc(sizeof(*context), GFP_KERNEL);
-       if (!context)
-               return ERR_PTR(-ENOMEM);
        lib_uar_4k = req.lib_caps & MLX5_LIB_CAP_4K_UAR;
        bfregi = &context->bfregi;
  
        if (err)
                goto out_sys_pages;
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
- #endif
+       if (ibdev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)
+               context->ibucontext.invalidate_range =
+                       &mlx5_ib_invalidate_range;
  
        if (req.flags & MLX5_IB_ALLOC_UCTX_DEVX) {
                err = mlx5_ib_devx_create(dev, true);
                                   1, &dev->roce[port].tx_port_affinity));
        }
  
-       return &context->ibucontext;
+       return 0;
  
  out_mdev:
        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
@@@ -1945,23 -1970,19 +1970,19 @@@ out_count
        kfree(bfregi->count);
  
  out_ctx:
-       kfree(context);
-       return ERR_PTR(err);
+       return err;
  }
  
- static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+ static void mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
  {
        struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
        struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
        struct mlx5_bfreg_info *bfregi;
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        /* All umem's must be destroyed before destroying the ucontext. */
        mutex_lock(&ibcontext->per_mm_list_lock);
        WARN_ON(!list_empty(&ibcontext->per_mm_list));
        mutex_unlock(&ibcontext->per_mm_list_lock);
- #endif
  
        bfregi = &context->bfregi;
        mlx5_ib_dealloc_transport_domain(dev, context->tdn, context->devx_uid);
        deallocate_uars(dev, context);
        kfree(bfregi->sys_pages);
        kfree(bfregi->count);
-       kfree(context);
-       return 0;
  }
  
  static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev,
  
        fw_uars_per_page = MLX5_CAP_GEN(dev->mdev, uar_4k) ? MLX5_UARS_IN_PAGE : 1;
  
 -      return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
 +      return (dev->mdev->bar_addr >> PAGE_SHIFT) + uar_idx / fw_uars_per_page;
  }
  
  static int get_command(unsigned long offset)
@@@ -2174,7 -2192,7 +2192,7 @@@ static int dm_mmap(struct ib_ucontext *
            page_idx + npages)
                return -EINVAL;
  
 -      pfn = ((pci_resource_start(dev->mdev->pdev, 0) +
 +      pfn = ((dev->mdev->bar_addr +
              MLX5_CAP64_DEV_MEM(dev->mdev, memic_bar_start_addr)) >>
              PAGE_SHIFT) +
              page_idx;
@@@ -2258,7 -2276,7 +2276,7 @@@ struct ib_dm *mlx5_ib_alloc_dm(struct i
                goto err_free;
  
        start_offset = memic_addr & ~PAGE_MASK;
 -      page_idx = (memic_addr - pci_resource_start(memic->dev->pdev, 0) -
 +      page_idx = (memic_addr - memic->dev->bar_addr -
                    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
                    PAGE_SHIFT;
  
@@@ -2301,7 -2319,7 +2319,7 @@@ int mlx5_ib_dealloc_dm(struct ib_dm *ib
        if (ret)
                return ret;
  
 -      page_idx = (dm->dev_addr - pci_resource_start(memic->dev->pdev, 0) -
 +      page_idx = (dm->dev_addr - memic->dev->bar_addr -
                    MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
                    PAGE_SHIFT;
        bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages,
        return 0;
  }
  
- static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
-                                     struct ib_ucontext *context,
-                                     struct ib_udata *udata)
+ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
+                           struct ib_udata *udata)
  {
+       struct mlx5_ib_pd *pd = to_mpd(ibpd);
+       struct ib_device *ibdev = ibpd->device;
        struct mlx5_ib_alloc_pd_resp resp;
-       struct mlx5_ib_pd *pd;
        int err;
        u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
        u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
        u16 uid = 0;
  
-       pd = kmalloc(sizeof(*pd), GFP_KERNEL);
-       if (!pd)
-               return ERR_PTR(-ENOMEM);
        uid = context ? to_mucontext(context)->devx_uid : 0;
        MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
        MLX5_SET(alloc_pd_in, in, uid, uid);
        err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
                            out, sizeof(out));
-       if (err) {
-               kfree(pd);
-               return ERR_PTR(err);
-       }
+       if (err)
+               return err;
  
        pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
        pd->uid = uid;
                resp.pdn = pd->pdn;
                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
                        mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
-                       kfree(pd);
-                       return ERR_PTR(-EFAULT);
+                       return -EFAULT;
                }
        }
  
-       return &pd->ibpd;
+       return 0;
  }
  
- static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+ static void mlx5_ib_dealloc_pd(struct ib_pd *pd)
  {
        struct mlx5_ib_dev *mdev = to_mdev(pd->device);
        struct mlx5_ib_pd *mpd = to_mpd(pd);
  
        mlx5_cmd_dealloc_pd(mdev->mdev, mpd->pdn, mpd->uid);
-       kfree(mpd);
-       return 0;
  }
  
  enum {
@@@ -2394,10 -2402,29 +2402,29 @@@ static u8 get_match_criteria_enable(u3
        return match_criteria_enable;
  }
  
- static void set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
+ static int set_proto(void *outer_c, void *outer_v, u8 mask, u8 val)
  {
-       MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
-       MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+       u8 entry_mask;
+       u8 entry_val;
+       int err = 0;
+       if (!mask)
+               goto out;
+       entry_mask = MLX5_GET(fte_match_set_lyr_2_4, outer_c,
+                             ip_protocol);
+       entry_val = MLX5_GET(fte_match_set_lyr_2_4, outer_v,
+                            ip_protocol);
+       if (!entry_mask) {
+               MLX5_SET(fte_match_set_lyr_2_4, outer_c, ip_protocol, mask);
+               MLX5_SET(fte_match_set_lyr_2_4, outer_v, ip_protocol, val);
+               goto out;
+       }
+       /* Don't override existing ip protocol */
+       if (mask != entry_mask || val != entry_val)
+               err = -EINVAL;
+ out:
+       return err;
  }
  
  static void set_flow_label(void *misc_c, void *misc_v, u32 mask, u32 val,
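set_proto() now returns an error instead of silently overwriting an ip_protocol match programmed by an earlier spec, and the parse_flow_attr() callers in the following hunks propagate that as -EINVAL. A small stand-alone sketch of the check-before-set rule, with hypothetical field accessors in place of MLX5_GET()/MLX5_SET():

#include <errno.h>
#include <stdio.h>

struct match_hdr {
        unsigned char proto_mask;
        unsigned char proto_val;
};

/* Program the protocol match only if it is still unset, or already set
 * to exactly the same mask/value; otherwise report a conflict. */
static int set_proto(struct match_hdr *c, struct match_hdr *v,
                     unsigned char mask, unsigned char val)
{
        if (!mask)
                return 0;       /* nothing requested */

        if (!c->proto_mask) {
                c->proto_mask = mask;
                v->proto_val = val;
                return 0;
        }

        /* don't override an existing ip protocol match */
        return (mask == c->proto_mask && val == v->proto_val) ? 0 : -EINVAL;
}

int main(void)
{
        struct match_hdr c = { 0 }, v = { 0 };

        printf("%d\n", set_proto(&c, &v, 0xff, 6));     /* TCP: ok */
        printf("%d\n", set_proto(&c, &v, 0xff, 17));    /* UDP: conflict */
        return 0;
}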
@@@ -2631,8 -2658,10 +2658,10 @@@ static int parse_flow_attr(struct mlx5_
                set_tos(headers_c, headers_v,
                        ib_spec->ipv4.mask.tos, ib_spec->ipv4.val.tos);
  
-               set_proto(headers_c, headers_v,
-                         ib_spec->ipv4.mask.proto, ib_spec->ipv4.val.proto);
+               if (set_proto(headers_c, headers_v,
+                             ib_spec->ipv4.mask.proto,
+                             ib_spec->ipv4.val.proto))
+                       return -EINVAL;
                break;
        case IB_FLOW_SPEC_IPV6:
                if (FIELDS_NOT_SUPPORTED(ib_spec->ipv6.mask, LAST_IPV6_FIELD))
                        ib_spec->ipv6.mask.traffic_class,
                        ib_spec->ipv6.val.traffic_class);
  
-               set_proto(headers_c, headers_v,
-                         ib_spec->ipv6.mask.next_hdr,
-                         ib_spec->ipv6.val.next_hdr);
+               if (set_proto(headers_c, headers_v,
+                             ib_spec->ipv6.mask.next_hdr,
+                             ib_spec->ipv6.val.next_hdr))
+                       return -EINVAL;
  
                set_flow_label(misc_params_c, misc_params_v,
                               ntohl(ib_spec->ipv6.mask.flow_label),
                                         LAST_TCP_UDP_FIELD))
                        return -EOPNOTSUPP;
  
-               MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-                        0xff);
-               MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-                        IPPROTO_TCP);
+               if (set_proto(headers_c, headers_v, 0xff, IPPROTO_TCP))
+                       return -EINVAL;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, tcp_sport,
                         ntohs(ib_spec->tcp_udp.mask.src_port));
                                         LAST_TCP_UDP_FIELD))
                        return -EOPNOTSUPP;
  
-               MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
-                        0xff);
-               MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
-                        IPPROTO_UDP);
+               if (set_proto(headers_c, headers_v, 0xff, IPPROTO_UDP))
+                       return -EINVAL;
  
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, udp_sport,
                         ntohs(ib_spec->tcp_udp.mask.src_port));
                if (ib_spec->gre.mask.c_ks_res0_ver)
                        return -EOPNOTSUPP;
  
+               if (set_proto(headers_c, headers_v, 0xff, IPPROTO_GRE))
+                       return -EINVAL;
                MLX5_SET(fte_match_set_lyr_2_4, headers_c, ip_protocol,
                         0xff);
                MLX5_SET(fte_match_set_lyr_2_4, headers_v, ip_protocol,
@@@ -3884,7 -3913,7 +3913,7 @@@ mlx5_ib_raw_fs_rule_add(struct mlx5_ib_
        if (fs_matcher->priority > MLX5_IB_FLOW_LAST_PRIO)
                return ERR_PTR(-ENOMEM);
  
-       dst = kzalloc(sizeof(*dst) * 2, GFP_KERNEL);
+       dst = kcalloc(2, sizeof(*dst), GFP_KERNEL);
        if (!dst)
                return ERR_PTR(-ENOMEM);
  
@@@ -4165,7 -4194,7 +4194,7 @@@ static ssize_t fw_pages_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+               rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
  
        return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
  }
@@@ -4175,7 -4204,7 +4204,7 @@@ static ssize_t reg_pages_show(struct de
                              struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+               rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
  
        return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
  }
@@@ -4185,7 -4214,8 +4214,8 @@@ static ssize_t hca_type_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+               rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
        return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
  }
  static DEVICE_ATTR_RO(hca_type);
@@@ -4194,7 -4224,8 +4224,8 @@@ static ssize_t hw_rev_show(struct devic
                           struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+               rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
        return sprintf(buf, "%x\n", dev->mdev->rev_id);
  }
  static DEVICE_ATTR_RO(hw_rev);
@@@ -4203,7 -4234,8 +4234,8 @@@ static ssize_t board_id_show(struct dev
                             struct device_attribute *attr, char *buf)
  {
        struct mlx5_ib_dev *dev =
-               container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+               rdma_device_to_drv_device(device, struct mlx5_ib_dev, ib_dev);
        return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
                       dev->mdev->board_id);
  }
@@@ -4689,23 -4721,28 +4721,28 @@@ static int create_dev_resources(struct 
  {
        struct ib_srq_init_attr attr;
        struct mlx5_ib_dev *dev;
+       struct ib_device *ibdev;
        struct ib_cq_init_attr cq_attr = {.cqe = 1};
        int port;
        int ret = 0;
  
        dev = container_of(devr, struct mlx5_ib_dev, devr);
+       ibdev = &dev->ib_dev;
  
        mutex_init(&devr->mutex);
  
-       devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
-       if (IS_ERR(devr->p0)) {
-               ret = PTR_ERR(devr->p0);
-               goto error0;
-       }
-       devr->p0->device  = &dev->ib_dev;
+       devr->p0 = rdma_zalloc_drv_obj(ibdev, ib_pd);
+       if (!devr->p0)
+               return -ENOMEM;
+       devr->p0->device  = ibdev;
        devr->p0->uobject = NULL;
        atomic_set(&devr->p0->usecnt, 0);
  
+       ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL);
+       if (ret)
+               goto error0;
        devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
        if (IS_ERR(devr->c0)) {
                ret = PTR_ERR(devr->c0);
@@@ -4803,6 -4840,7 +4840,7 @@@ error2
  error1:
        mlx5_ib_dealloc_pd(devr->p0);
  error0:
+       kfree(devr->p0);
        return ret;
  }
  
@@@ -4818,6 -4856,7 +4856,7 @@@ static void destroy_dev_resources(struc
        mlx5_ib_dealloc_xrcd(devr->x1);
        mlx5_ib_destroy_cq(devr->c0);
        mlx5_ib_dealloc_pd(devr->p0);
+       kfree(devr->p0);
  
        /* Make sure no change P_Key work items are still executing */
        for (port = 0; port < dev->num_ports; ++port)
@@@ -5567,9 -5606,7 +5606,7 @@@ static bool mlx5_ib_bind_slave_port(str
        mpi->mdev_events.notifier_call = mlx5_ib_event_slave_port;
        mlx5_notifier_register(mpi->mdev, &mpi->mdev_events);
  
-       err = mlx5_ib_init_cong_debugfs(ibdev, port_num);
-       if (err)
-               goto unbind;
+       mlx5_ib_init_cong_debugfs(ibdev, port_num);
  
        return true;
  
@@@ -5781,11 -5818,10 +5818,10 @@@ static struct ib_counters *mlx5_ib_crea
  void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
  {
        mlx5_ib_cleanup_multiport_master(dev);
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       cleanup_srcu_struct(&dev->mr_srcu);
-       drain_workqueue(dev->advise_mr_wq);
-       destroy_workqueue(dev->advise_mr_wq);
- #endif
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+               srcu_barrier(&dev->mr_srcu);
+               cleanup_srcu_struct(&dev->mr_srcu);
+       }
        kfree(dev->port);
  }
  
@@@ -5838,19 -5874,11 +5874,11 @@@ int mlx5_ib_stage_init_init(struct mlx5
        spin_lock_init(&dev->memic.memic_lock);
        dev->memic.dev = mdev;
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       dev->advise_mr_wq = alloc_ordered_workqueue("mlx5_ib_advise_mr_wq", 0);
-       if (!dev->advise_mr_wq) {
-               err = -ENOMEM;
-               goto err_mp;
-       }
-       err = init_srcu_struct(&dev->mr_srcu);
-       if (err) {
-               destroy_workqueue(dev->advise_mr_wq);
-               goto err_mp;
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+               err = init_srcu_struct(&dev->mr_srcu);
+               if (err)
+                       goto err_mp;
        }
- #endif
  
        return 0;
  err_mp:
@@@ -5947,6 -5975,8 +5975,8 @@@ static const struct ib_device_ops mlx5_
        .req_notify_cq = mlx5_ib_arm_cq,
        .rereg_user_mr = mlx5_ib_rereg_user_mr,
        .resize_cq = mlx5_ib_resize_cq,
+       INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
  };
  
  static const struct ib_device_ops mlx5_ib_dev_flow_ipsec_ops = {
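The INIT_RDMA_OBJ_SIZE() entries tell the RDMA core how large the driver-private containers are, so the core allocates mlx5_ib_pd and mlx5_ib_ucontext itself; that is why mlx5_ib_alloc_pd() and mlx5_ib_alloc_ucontext() above now receive a pre-allocated object and return an int instead of a pointer. A rough user-space sketch of the embed-and-recover pattern, with hypothetical core_pd/drv_pd types:

#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

struct core_pd {                /* "core" object embedded in the driver object */
        unsigned int usecnt;
};

struct drv_pd {
        struct core_pd ibpd;    /* must be recoverable from a core_pd * */
        unsigned int pdn;       /* driver-private state */
};

/* Map the embedded core object back to its wrapper. */
#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

/* The core allocates sizeof(struct drv_pd) in one shot (the role the
 * INIT_RDMA_OBJ_SIZE() table entry plays); the driver callback only
 * initializes its part and returns 0 or an errno. */
static int drv_alloc_pd(struct core_pd *ibpd)
{
        struct drv_pd *pd = container_of(ibpd, struct drv_pd, ibpd);

        pd->pdn = 42;           /* pretend this came from firmware */
        return 0;
}

int main(void)
{
        struct core_pd *ibpd = calloc(1, sizeof(struct drv_pd));

        if (!ibpd || drv_alloc_pd(ibpd))
                return 1;
        printf("pdn=%u\n", container_of(ibpd, struct drv_pd, ibpd)->pdn);
        free(ibpd);
        return 0;
}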
@@@ -6213,7 -6243,7 +6243,7 @@@ static int mlx5_ib_stage_odp_init(struc
        return mlx5_ib_odp_init_one(dev);
  }
  
- void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
static void mlx5_ib_stage_odp_cleanup(struct mlx5_ib_dev *dev)
  {
        mlx5_ib_odp_cleanup_one(dev);
  }
@@@ -6242,8 -6272,9 +6272,9 @@@ void mlx5_ib_stage_counters_cleanup(str
  
  static int mlx5_ib_stage_cong_debugfs_init(struct mlx5_ib_dev *dev)
  {
-       return mlx5_ib_init_cong_debugfs(dev,
-                                        mlx5_core_native_port_num(dev->mdev) - 1);
+       mlx5_ib_init_cong_debugfs(dev,
+                                 mlx5_core_native_port_num(dev->mdev) - 1);
+       return 0;
  }
  
  static void mlx5_ib_stage_cong_debugfs_cleanup(struct mlx5_ib_dev *dev)
@@@ -6293,7 -6324,7 +6324,7 @@@ int mlx5_ib_stage_ib_reg_init(struct ml
                name = "mlx5_%d";
        else
                name = "mlx5_bond_%d";
-       return ib_register_device(&dev->ib_dev, name, NULL);
+       return ib_register_device(&dev->ib_dev, name);
  }
  
  void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
@@@ -6550,7 -6581,7 +6581,7 @@@ static void *mlx5_ib_add(struct mlx5_co
        if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
                return mlx5_ib_add_slave_port(mdev);
  
-       dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+       dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
        if (!dev)
                return NULL;
  
@@@ -71,10 -71,9 +71,9 @@@ static int destroy_mkey(struct mlx5_ib_
  {
        int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       /* Wait until all page fault handlers using the mr complete. */
-       synchronize_srcu(&dev->mr_srcu);
- #endif
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+               /* Wait until all page fault handlers using the mr complete. */
+               synchronize_srcu(&dev->mr_srcu);
  
        return err;
  }
@@@ -95,10 -94,9 +94,9 @@@ static bool use_umr_mtt_update(struct m
                length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
  }
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
  static void update_odp_mr(struct mlx5_ib_mr *mr)
  {
-       if (mr->umem->is_odp) {
+       if (is_odp_mr(mr)) {
                /*
                 * This barrier prevents the compiler from moving the
                 * setting of umem->odp_data->private to point to our
                smp_wmb();
        }
  }
- #endif
  
  static void reg_mr_callback(int status, struct mlx5_async_work *context)
  {
@@@ -257,9 -254,8 +254,8 @@@ static void remove_keys(struct mlx5_ib_
                mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
        }
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       synchronize_srcu(&dev->mr_srcu);
- #endif
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+               synchronize_srcu(&dev->mr_srcu);
  
        list_for_each_entry_safe(mr, tmp_mr, &del_list, list) {
                list_del(&mr->list);
@@@ -611,52 -607,27 +607,27 @@@ static void mlx5_mr_cache_debugfs_clean
        dev->cache.root = NULL;
  }
  
- static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
  {
        struct mlx5_mr_cache *cache = &dev->cache;
        struct mlx5_cache_ent *ent;
+       struct dentry *dir;
        int i;
  
        if (!mlx5_debugfs_root || dev->rep)
-               return 0;
+               return;
  
        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
-       if (!cache->root)
-               return -ENOMEM;
  
        for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
                ent = &cache->ent[i];
                sprintf(ent->name, "%d", ent->order);
-               ent->dir = debugfs_create_dir(ent->name,  cache->root);
-               if (!ent->dir)
-                       goto err;
-               ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
-                                                &size_fops);
-               if (!ent->fsize)
-                       goto err;
-               ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
-                                                 &limit_fops);
-               if (!ent->flimit)
-                       goto err;
-               ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
-                                              &ent->cur);
-               if (!ent->fcur)
-                       goto err;
-               ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
-                                               &ent->miss);
-               if (!ent->fmiss)
-                       goto err;
+               dir = debugfs_create_dir(ent->name, cache->root);
+               debugfs_create_file("size", 0600, dir, ent, &size_fops);
+               debugfs_create_file("limit", 0600, dir, ent, &limit_fops);
+               debugfs_create_u32("cur", 0400, dir, &ent->cur);
+               debugfs_create_u32("miss", 0600, dir, &ent->miss);
        }
-       return 0;
- err:
-       mlx5_mr_cache_debugfs_cleanup(dev);
-       return -ENOMEM;
  }
  
  static void delay_time_func(struct timer_list *t)
@@@ -670,7 -641,6 +641,6 @@@ int mlx5_mr_cache_init(struct mlx5_ib_d
  {
        struct mlx5_mr_cache *cache = &dev->cache;
        struct mlx5_cache_ent *ent;
-       int err;
        int i;
  
        mutex_init(&dev->slow_path_mutex);
                queue_work(cache->wq, &ent->work);
        }
  
-       err = mlx5_mr_cache_debugfs_init(dev);
-       if (err)
-               mlx5_ib_warn(dev, "cache debugfs failure\n");
-       /*
-        * We don't want to fail driver if debugfs failed to initialize,
-        * so we are not forwarding error to the user.
-        */
+       mlx5_mr_cache_debugfs_init(dev);
  
        return 0;
  }
@@@ -822,18 -785,17 +785,17 @@@ static int mr_cache_max_order(struct ml
        return MLX5_MAX_UMR_SHIFT;
  }
  
- static int mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
-                      int access_flags, struct ib_umem **umem,
-                      int *npages, int *page_shift, int *ncont,
-                      int *order)
+ static int mr_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+                      u64 start, u64 length, int access_flags,
+                      struct ib_umem **umem, int *npages, int *page_shift,
+                      int *ncont, int *order)
  {
-       struct mlx5_ib_dev *dev = to_mdev(pd->device);
        struct ib_umem *u;
        int err;
  
        *umem = NULL;
  
-       u = ib_umem_get(pd->uobject->context, start, length, access_flags, 0);
+       u = ib_umem_get(udata, start, length, access_flags, 0);
        err = PTR_ERR_OR_ZERO(u);
        if (err) {
                mlx5_ib_dbg(dev, "umem get failed (%d)\n", err);
@@@ -1232,7 -1194,8 +1194,7 @@@ static struct ib_mr *mlx5_ib_get_memic_
        MLX5_SET64(mkc, mkc, len, length);
        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
        MLX5_SET(mkc, mkc, qpn, 0xffffff);
 -      MLX5_SET64(mkc, mkc, start_addr,
 -                 memic_addr - pci_resource_start(dev->mdev->pdev, 0));
 +      MLX5_SET64(mkc, mkc, start_addr, memic_addr - dev->mdev->bar_addr);
  
        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
        if (err)
@@@ -1305,21 -1268,20 +1267,20 @@@ struct ib_mr *mlx5_ib_reg_user_mr(struc
        mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
                    start, virt_addr, length, access_flags);
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       if (!start && length == U64_MAX) {
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) && !start &&
+           length == U64_MAX) {
                if (!(access_flags & IB_ACCESS_ON_DEMAND) ||
                    !(dev->odp_caps.general_caps & IB_ODP_SUPPORT_IMPLICIT))
                        return ERR_PTR(-EINVAL);
  
-               mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), access_flags);
+               mr = mlx5_ib_alloc_implicit_mr(to_mpd(pd), udata, access_flags);
                if (IS_ERR(mr))
                        return ERR_CAST(mr);
                return &mr->ibmr;
        }
- #endif
  
-       err = mr_umem_get(pd, start, length, access_flags, &umem, &npages,
-                          &page_shift, &ncont, &order);
+       err = mr_umem_get(dev, udata, start, length, access_flags, &umem,
+                         &npages, &page_shift, &ncont, &order);
  
        if (err < 0)
                return ERR_PTR(err);
        mr->umem = umem;
        set_mr_fields(dev, mr, npages, length, access_flags);
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        update_odp_mr(mr);
- #endif
  
        if (!populate_mtts) {
                int update_xlt_flags = MLX5_IB_UPD_XLT_ENABLE;
                }
        }
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       mr->live = 1;
- #endif
+       if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
+               mr->live = 1;
+               atomic_set(&mr->num_pending_prefetch, 0);
+       }
        return &mr->ibmr;
  error:
        ib_umem_release(umem);
@@@ -1469,8 -1431,9 +1430,9 @@@ int mlx5_ib_rereg_user_mr(struct ib_mr 
                flags |= IB_MR_REREG_TRANS;
                ib_umem_release(mr->umem);
                mr->umem = NULL;
-               err = mr_umem_get(pd, addr, len, access_flags, &mr->umem,
-                                 &npages, &page_shift, &ncont, &order);
+               err = mr_umem_get(dev, udata, addr, len, access_flags,
+                                 &mr->umem, &npages, &page_shift, &ncont,
+                                 &order);
                if (err)
                        goto err;
        }
                }
  
                mr->allocated_from_cache = 0;
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-               mr->live = 1;
- #endif
+               if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING))
+                       mr->live = 1;
        } else {
                /*
                 * Send a UMR WQE
  
        set_mr_fields(dev, mr, npages, len, access_flags);
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
        update_odp_mr(mr);
- #endif
        return 0;
  
  err:
@@@ -1615,12 -1575,19 +1574,19 @@@ static void dereg_mr(struct mlx5_ib_de
        int npages = mr->npages;
        struct ib_umem *umem = mr->umem;
  
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
-       if (umem && umem->is_odp) {
+       if (is_odp_mr(mr)) {
                struct ib_umem_odp *umem_odp = to_ib_umem_odp(umem);
  
-               /* Prevent new page faults from succeeding */
+               /* Prevent new page faults and
+                * prefetch requests from succeeding
+                */
                mr->live = 0;
+               /* dequeue pending prefetch requests for the mr */
+               if (atomic_read(&mr->num_pending_prefetch))
+                       flush_workqueue(system_unbound_wq);
+               WARN_ON(atomic_read(&mr->num_pending_prefetch));
                /* Wait for all running page-fault handlers to finish. */
                synchronize_srcu(&dev->mr_srcu);
                /* Destroy all page mappings */
                /* Avoid double-freeing the umem. */
                umem = NULL;
        }
- #endif
        clean_mr(dev, mr);
  
        /*
@@@ -109,75 -109,173 +109,173 @@@ static int is_sqp(enum ib_qp_type qp_ty
  }
  
  /**
-  * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
+  * mlx5_ib_read_user_wqe_common() - Copy a WQE (or part of) from user WQ
+  * to kernel buffer
   *
-  * @qp: QP to copy from.
-  * @send: copy from the send queue when non-zero, use the receive queue
-  *      otherwise.
-  * @wqe_index:  index to start copying from. For send work queues, the
-  *            wqe_index is in units of MLX5_SEND_WQE_BB.
-  *            For receive work queue, it is the number of work queue
-  *            element in the queue.
-  * @buffer: destination buffer.
-  * @length: maximum number of bytes to copy.
+  * @umem: User space memory where the WQ is
+  * @buffer: buffer to copy to
+  * @buflen: buffer length
+  * @wqe_index: index of WQE to copy from
+  * @wq_offset: offset to start of WQ
+  * @wq_wqe_cnt: number of WQEs in WQ
+  * @wq_wqe_shift: log2 of WQE size
+  * @bcnt: number of bytes to copy
+  * @bytes_copied: number of bytes actually copied (output)
   *
-  * Copies at least a single WQE, but may copy more data.
+  * Copies at most bcnt bytes from the start of the WQE.
+  * Does not guarantee to copy the entire WQE.
   *
-  * Return: the number of bytes copied, or an error code.
+  * Return: zero on success, or an error code.
   */
- int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
-                         void *buffer, u32 length,
-                         struct mlx5_ib_qp_base *base)
+ static int mlx5_ib_read_user_wqe_common(struct ib_umem *umem,
+                                       void *buffer,
+                                       u32 buflen,
+                                       int wqe_index,
+                                       int wq_offset,
+                                       int wq_wqe_cnt,
+                                       int wq_wqe_shift,
+                                       int bcnt,
+                                       size_t *bytes_copied)
+ {
+       size_t offset = wq_offset + ((wqe_index % wq_wqe_cnt) << wq_wqe_shift);
+       size_t wq_end = wq_offset + (wq_wqe_cnt << wq_wqe_shift);
+       size_t copy_length;
+       int ret;
+       /* don't copy more than requested, more than buffer length or
+        * beyond WQ end
+        */
+       copy_length = min_t(u32, buflen, wq_end - offset);
+       copy_length = min_t(u32, copy_length, bcnt);
+       ret = ib_umem_copy_from(buffer, umem, offset, copy_length);
+       if (ret)
+               return ret;
+       if (!ret && bytes_copied)
+               *bytes_copied = copy_length;
+       return 0;
+ }
+ int mlx5_ib_read_user_wqe_sq(struct mlx5_ib_qp *qp,
+                            int wqe_index,
+                            void *buffer,
+                            int buflen,
+                            size_t *bc)
  {
-       struct ib_device *ibdev = qp->ibqp.device;
-       struct mlx5_ib_dev *dev = to_mdev(ibdev);
-       struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
-       size_t offset;
-       size_t wq_end;
+       struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
        struct ib_umem *umem = base->ubuffer.umem;
-       u32 first_copy_length;
-       int wqe_length;
+       struct mlx5_ib_wq *wq = &qp->sq;
+       struct mlx5_wqe_ctrl_seg *ctrl;
+       size_t bytes_copied;
+       size_t bytes_copied2;
+       size_t wqe_length;
        int ret;
+       int ds;
  
-       if (wq->wqe_cnt == 0) {
-               mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
-                           qp->ibqp.qp_type);
+       if (buflen < sizeof(*ctrl))
                return -EINVAL;
-       }
  
-       offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
-       wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
+       /* at first read as much as possible */
+       ret = mlx5_ib_read_user_wqe_common(umem,
+                                          buffer,
+                                          buflen,
+                                          wqe_index,
+                                          wq->offset,
+                                          wq->wqe_cnt,
+                                          wq->wqe_shift,
+                                          buflen,
+                                          &bytes_copied);
+       if (ret)
+               return ret;
  
-       if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
+       /* we need at least control segment size to proceed */
+       if (bytes_copied < sizeof(*ctrl))
                return -EINVAL;
  
-       if (offset > umem->length ||
-           (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
-               return -EINVAL;
+       ctrl = buffer;
+       ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+       wqe_length = ds * MLX5_WQE_DS_UNITS;
+       /* if we copied enough then we are done */
+       if (bytes_copied >= wqe_length) {
+               *bc = bytes_copied;
+               return 0;
+       }
+       /* otherwise this is a wrapped-around wqe,
+        * so read the remaining bytes starting
+        * from wqe_index 0
+        */
+       ret = mlx5_ib_read_user_wqe_common(umem,
+                                          buffer + bytes_copied,
+                                          buflen - bytes_copied,
+                                          0,
+                                          wq->offset,
+                                          wq->wqe_cnt,
+                                          wq->wqe_shift,
+                                          wqe_length - bytes_copied,
+                                          &bytes_copied2);
  
-       first_copy_length = min_t(u32, offset + length, wq_end) - offset;
-       ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
        if (ret)
                return ret;
+       *bc = bytes_copied + bytes_copied2;
+       return 0;
+ }
  
-       if (send) {
-               struct mlx5_wqe_ctrl_seg *ctrl = buffer;
-               int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
-               wqe_length = ds * MLX5_WQE_DS_UNITS;
-       } else {
-               wqe_length = 1 << wq->wqe_shift;
-       }
+ int mlx5_ib_read_user_wqe_rq(struct mlx5_ib_qp *qp,
+                            int wqe_index,
+                            void *buffer,
+                            int buflen,
+                            size_t *bc)
+ {
+       struct mlx5_ib_qp_base *base = &qp->trans_qp.base;
+       struct ib_umem *umem = base->ubuffer.umem;
+       struct mlx5_ib_wq *wq = &qp->rq;
+       size_t bytes_copied;
+       int ret;
  
-       if (wqe_length <= first_copy_length)
-               return first_copy_length;
+       ret = mlx5_ib_read_user_wqe_common(umem,
+                                          buffer,
+                                          buflen,
+                                          wqe_index,
+                                          wq->offset,
+                                          wq->wqe_cnt,
+                                          wq->wqe_shift,
+                                          buflen,
+                                          &bytes_copied);
  
-       ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
-                               wqe_length - first_copy_length);
        if (ret)
                return ret;
+       *bc = bytes_copied;
+       return 0;
+ }
  
-       return wqe_length;
+ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq,
+                             int wqe_index,
+                             void *buffer,
+                             int buflen,
+                             size_t *bc)
+ {
+       struct ib_umem *umem = srq->umem;
+       size_t bytes_copied;
+       int ret;
+       ret = mlx5_ib_read_user_wqe_common(umem,
+                                          buffer,
+                                          buflen,
+                                          wqe_index,
+                                          0,
+                                          srq->msrq.max,
+                                          srq->msrq.wqe_shift,
+                                          buflen,
+                                          &bytes_copied);
+       if (ret)
+               return ret;
+       *bc = bytes_copied;
+       return 0;
  }
  
  static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
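The rewritten SQ read path copies as much as it can starting at wqe_index, then parses the control segment's DS count and, if the WQE wraps past the end of the work queue, continues copying from index 0. A self-contained sketch of that wrap-around copy over a plain ring buffer (here the entry length is passed in rather than parsed from a control segment, and all names are hypothetical):

#include <stdio.h>
#include <string.h>

/* Copy up to buflen bytes of one entry from a ring of wqe_cnt entries,
 * each (1 << wqe_shift) bytes, starting at wqe_index; an entry that
 * wraps past the end of the ring continues at index 0. */
static size_t read_ring_entry(const unsigned char *ring, size_t wqe_cnt,
                              unsigned int wqe_shift, size_t wqe_index,
                              size_t entry_len, void *buf, size_t buflen)
{
        size_t wqe_sz = (size_t)1 << wqe_shift;
        size_t ring_end = wqe_cnt * wqe_sz;
        size_t offset = (wqe_index % wqe_cnt) * wqe_sz;
        size_t first, second = 0;

        if (entry_len > buflen)
                entry_len = buflen;

        /* first chunk: from the entry start to the end of the ring */
        first = ring_end - offset;
        if (first > entry_len)
                first = entry_len;
        memcpy(buf, ring + offset, first);

        /* wrapped remainder, if any, starts again at index 0 */
        if (entry_len > first) {
                second = entry_len - first;
                memcpy((unsigned char *)buf + first, ring, second);
        }
        return first + second;
}

int main(void)
{
        unsigned char ring[4 << 4];     /* 4 entries of 16 bytes */
        unsigned char out[48];

        for (size_t i = 0; i < sizeof(ring); i++)
                ring[i] = (unsigned char)i;

        /* a 32-byte "WQE" starting in the last slot wraps to slot 0 */
        size_t n = read_ring_entry(ring, 4, 4, 3, 32, out, sizeof(out));
        printf("copied %zu bytes, out[16]=%d\n", n, out[16]);
        return 0;
}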
@@@ -435,9 -533,9 +533,9 @@@ static int set_user_buf_size(struct mlx
                return -EINVAL;
        }
  
-       if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
-               mlx5_ib_warn(dev, "sq_wqe_count %d, sq_wqe_count %d\n",
-                            ucmd->sq_wqe_count, ucmd->sq_wqe_count);
+       if (ucmd->sq_wqe_count && !is_power_of_2(ucmd->sq_wqe_count)) {
+               mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
+                            ucmd->sq_wqe_count);
                return -EINVAL;
        }
  
@@@ -645,16 -743,14 +743,14 @@@ int bfregn_to_uar_index(struct mlx5_ib_
        return bfregi->sys_pages[index_of_sys_page] + offset;
  }
  
- static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev,
-                           struct ib_pd *pd,
+ static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, struct ib_udata *udata,
                            unsigned long addr, size_t size,
-                           struct ib_umem **umem,
-                           int *npages, int *page_shift, int *ncont,
-                           u32 *offset)
+                           struct ib_umem **umem, int *npages, int *page_shift,
+                           int *ncont, u32 *offset)
  {
        int err;
  
-       *umem = ib_umem_get(pd->uobject->context, addr, size, 0, 0);
+       *umem = ib_umem_get(udata, addr, size, 0, 0);
        if (IS_ERR(*umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                return PTR_ERR(*umem);
@@@ -695,10 -791,11 +791,11 @@@ static void destroy_user_rq(struct mlx5
  }
  
  static int create_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-                         struct mlx5_ib_rwq *rwq,
+                         struct ib_udata *udata, struct mlx5_ib_rwq *rwq,
                          struct mlx5_ib_create_wq *ucmd)
  {
-       struct mlx5_ib_ucontext *context;
+       struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        int page_shift = 0;
        int npages;
        u32 offset = 0;
        if (!ucmd->buf_addr)
                return -EINVAL;
  
-       context = to_mucontext(pd->uobject->context);
-       rwq->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
-                              rwq->buf_size, 0, 0);
+       rwq->umem = ib_umem_get(udata, ucmd->buf_addr, rwq->buf_size, 0, 0);
        if (IS_ERR(rwq->umem)) {
                mlx5_ib_dbg(dev, "umem_get failed\n");
                err = PTR_ERR(rwq->umem);
                    (unsigned long long)ucmd->buf_addr, rwq->buf_size,
                    npages, page_shift, ncont, offset);
  
-       err = mlx5_ib_db_map_user(context, ucmd->db_addr, &rwq->db);
+       err = mlx5_ib_db_map_user(ucontext, udata, ucmd->db_addr, &rwq->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_umem;
@@@ -783,7 -878,8 +878,8 @@@ static int create_user_qp(struct mlx5_i
                return err;
        }
  
-       context = to_mucontext(pd->uobject->context);
+       context = rdma_udata_to_drv_context(udata, struct mlx5_ib_ucontext,
+                                           ibucontext);
        if (ucmd.flags & MLX5_QP_FLAG_BFREG_INDEX) {
                uar_index = bfregn_to_uar_index(dev, &context->bfregi,
                                                ucmd.bfreg_index, true);
  
        if (ucmd.buf_addr && ubuffer->buf_size) {
                ubuffer->buf_addr = ucmd.buf_addr;
-               err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr,
-                                      ubuffer->buf_size,
-                                      &ubuffer->umem, &npages, &page_shift,
-                                      &ncont, &offset);
+               err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr,
+                                      ubuffer->buf_size, &ubuffer->umem,
+                                      &npages, &page_shift, &ncont, &offset);
                if (err)
                        goto err_bfreg;
        } else {
                resp->bfreg_index = MLX5_IB_INVALID_BFREG;
        qp->bfregn = bfregn;
  
-       err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+       err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &qp->db);
        if (err) {
                mlx5_ib_dbg(dev, "map failed\n");
                goto err_free;
@@@ -1119,6 -1214,7 +1214,7 @@@ static void destroy_flow_rule_vport_sq(
  }
  
  static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
+                                  struct ib_udata *udata,
                                   struct mlx5_ib_sq *sq, void *qpin,
                                   struct ib_pd *pd)
  {
        int ncont = 0;
        u32 offset = 0;
  
-       err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size,
-                              &sq->ubuffer.umem, &npages, &page_shift,
-                              &ncont, &offset);
+       err = mlx5_ib_umem_get(dev, udata, ubuffer->buf_addr, ubuffer->buf_size,
+                              &sq->ubuffer.umem, &npages, &page_shift, &ncont,
+                              &offset);
        if (err)
                return err;
  
@@@ -1362,9 -1458,8 +1458,8 @@@ static int create_raw_packet_qp(struct 
        struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp;
        struct mlx5_ib_sq *sq = &raw_packet_qp->sq;
        struct mlx5_ib_rq *rq = &raw_packet_qp->rq;
-       struct ib_uobject *uobj = pd->uobject;
-       struct ib_ucontext *ucontext = uobj->context;
-       struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+       struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        int err;
        u32 tdn = mucontext->tdn;
        u16 uid = to_mpd(pd)->uid;
                if (err)
                        return err;
  
-               err = create_raw_packet_qp_sq(dev, sq, in, pd);
+               err = create_raw_packet_qp_sq(dev, udata, sq, in, pd);
                if (err)
                        goto err_destroy_tis;
  
@@@ -1478,9 -1573,8 +1573,8 @@@ static int create_rss_raw_qp_tir(struc
                                 struct ib_qp_init_attr *init_attr,
                                 struct ib_udata *udata)
  {
-       struct ib_uobject *uobj = pd->uobject;
-       struct ib_ucontext *ucontext = uobj->context;
-       struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext);
+       struct mlx5_ib_ucontext *mucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_create_qp_resp resp = {};
        int inlen;
        int err;
@@@ -1822,6 -1916,8 +1916,8 @@@ static int create_qp_common(struct mlx5
        int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
        struct mlx5_core_dev *mdev = dev->mdev;
        struct mlx5_ib_create_qp_resp resp = {};
+       struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_cq *send_cq;
        struct mlx5_ib_cq *recv_cq;
        unsigned long flags;
                }
  
                if (!check_flags_mask(ucmd.flags,
+                                     MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
+                                     MLX5_QP_FLAG_BFREG_INDEX |
+                                     MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE |
+                                     MLX5_QP_FLAG_SCATTER_CQE |
                                      MLX5_QP_FLAG_SIGNATURE |
-                                             MLX5_QP_FLAG_SCATTER_CQE |
-                                             MLX5_QP_FLAG_TUNNEL_OFFLOADS |
-                                             MLX5_QP_FLAG_BFREG_INDEX |
-                                             MLX5_QP_FLAG_TYPE_DCT |
-                                             MLX5_QP_FLAG_TYPE_DCI |
-                                             MLX5_QP_FLAG_ALLOW_SCATTER_CQE |
-                                             MLX5_QP_FLAG_PACKET_BASED_CREDIT_MODE))
+                                     MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC |
+                                     MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC |
+                                     MLX5_QP_FLAG_TUNNEL_OFFLOADS |
+                                     MLX5_QP_FLAG_TYPE_DCI |
+                                     MLX5_QP_FLAG_TYPE_DCT))
                        return -EINVAL;
  
-               err = get_qp_user_index(to_mucontext(pd->uobject->context),
-                                       &ucmd, udata->inlen, &uidx);
+               err = get_qp_user_index(ucontext, &ucmd, udata->inlen, &uidx);
                if (err)
                        return err;
  
@@@ -2407,8 -2504,11 +2504,11 @@@ static const char *ib_qp_type_str(enum 
  
  static struct ib_qp *mlx5_ib_create_dct(struct ib_pd *pd,
                                        struct ib_qp_init_attr *attr,
-                                       struct mlx5_ib_create_qp *ucmd)
+                                       struct mlx5_ib_create_qp *ucmd,
+                                       struct ib_udata *udata)
  {
+       struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_qp *qp;
        int err = 0;
        u32 uidx = MLX5_IB_DEFAULT_UIDX;
        if (!attr->srq || !attr->recv_cq)
                return ERR_PTR(-EINVAL);
  
-       err = get_qp_user_index(to_mucontext(pd->uobject->context),
-                               ucmd, sizeof(*ucmd), &uidx);
+       err = get_qp_user_index(ucontext, ucmd, sizeof(*ucmd), &uidx);
        if (err)
                return ERR_PTR(err);
  
@@@ -2500,15 -2599,17 +2599,17 @@@ struct ib_qp *mlx5_ib_create_qp(struct 
        int err;
        struct ib_qp_init_attr mlx_init_attr;
        struct ib_qp_init_attr *init_attr = verbs_init_attr;
+       struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
  
        if (pd) {
                dev = to_mdev(pd->device);
  
                if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
-                       if (!udata) {
+                       if (!ucontext) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n");
                                return ERR_PTR(-EINVAL);
-                       } else if (!to_mucontext(pd->uobject->context)->cqe_version) {
+                       } else if (!ucontext->cqe_version) {
                                mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n");
                                return ERR_PTR(-EINVAL);
                        }
                                return ERR_PTR(-EINVAL);
                        }
                } else {
-                       return mlx5_ib_create_dct(pd, init_attr, &ucmd);
+                       return mlx5_ib_create_dct(pd, init_attr, &ucmd, udata);
                }
        }
  
@@@ -2651,10 -2752,10 +2752,10 @@@ int mlx5_ib_destroy_qp(struct ib_qp *qp
  
  static int to_mlx5_access_flags(struct mlx5_ib_qp *qp,
                                const struct ib_qp_attr *attr,
-                               int attr_mask, __be32 *hw_access_flags)
+                               int attr_mask, __be32 *hw_access_flags_be)
  {
        u8 dest_rd_atomic;
-       u32 access_flags;
+       u32 access_flags, hw_access_flags = 0;
  
        struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
  
                access_flags &= IB_ACCESS_REMOTE_WRITE;
  
        if (access_flags & IB_ACCESS_REMOTE_READ)
-               *hw_access_flags |= MLX5_QP_BIT_RRE;
+               hw_access_flags |= MLX5_QP_BIT_RRE;
        if (access_flags & IB_ACCESS_REMOTE_ATOMIC) {
                int atomic_mode;
  
                if (atomic_mode < 0)
                        return -EOPNOTSUPP;
  
-               *hw_access_flags |= MLX5_QP_BIT_RAE;
-               *hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
+               hw_access_flags |= MLX5_QP_BIT_RAE;
+               hw_access_flags |= atomic_mode << MLX5_ATOMIC_MODE_OFFSET;
        }
  
        if (access_flags & IB_ACCESS_REMOTE_WRITE)
-               *hw_access_flags |= MLX5_QP_BIT_RWE;
+               hw_access_flags |= MLX5_QP_BIT_RWE;
  
-       *hw_access_flags = cpu_to_be32(*hw_access_flags);
+       *hw_access_flags_be = cpu_to_be32(hw_access_flags);
  
        return 0;
  }
@@@ -3178,14 -3279,12 +3279,12 @@@ static int modify_raw_packet_qp(struct 
  static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_pd *pd,
                                    struct mlx5_ib_qp_base *qp_base,
-                                   u8 port_num)
+                                   u8 port_num, struct ib_udata *udata)
  {
-       struct mlx5_ib_ucontext *ucontext = NULL;
+       struct mlx5_ib_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
        unsigned int tx_port_affinity;
  
-       if (pd && pd->ibpd.uobject && pd->ibpd.uobject->context)
-               ucontext = to_mucontext(pd->ibpd.uobject->context);
        if (ucontext) {
                tx_port_affinity = (unsigned int)atomic_add_return(
                                           1, &ucontext->tx_port_affinity) %
  
  static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                               const struct ib_qp_attr *attr, int attr_mask,
-                              enum ib_qp_state cur_state, enum ib_qp_state new_state,
-                              const struct mlx5_ib_modify_qp *ucmd)
+                              enum ib_qp_state cur_state,
+                              enum ib_qp_state new_state,
+                              const struct mlx5_ib_modify_qp *ucmd,
+                              struct ib_udata *udata)
  {
        static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
                [MLX5_QP_STATE_RST] = {
                    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
                        if (dev->lag_active) {
                                u8 p = mlx5_core_native_port_num(dev->mdev);
-                               tx_affinity = get_tx_affinity(dev, pd, base, p);
+                               tx_affinity = get_tx_affinity(dev, pd, base, p,
+                                                             udata);
                                context->flags |= cpu_to_be32(tx_affinity << 24);
                        }
                }
        }
  
        if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
-               __be32 access_flags = 0;
+               __be32 access_flags;
  
                err = to_mlx5_access_flags(qp, attr, attr_mask, &access_flags);
                if (err)
@@@ -3783,7 -3885,7 +3885,7 @@@ int mlx5_ib_modify_qp(struct ib_qp *ibq
        }
  
        err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state,
-                                 new_state, &ucmd);
+                                 new_state, &ucmd, udata);
  
  out:
        mutex_unlock(&qp->mutex);
@@@ -5015,7 -5117,7 +5117,7 @@@ out
                wmb();
  
                /* currently we support only regular doorbells */
 -              mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset, NULL);
 +              mlx5_write64((__be32 *)ctrl, bf->bfreg->map + bf->offset);
                /* Make sure doorbells don't leak out of SQ spinlock
                 * and reach the HCA out of order.
                 */
@@@ -5793,7 -5895,7 +5895,7 @@@ static int prepare_user_rq(struct ib_p
                return err;
        }
  
-       err = create_user_rq(dev, pd, rwq, &ucmd);
+       err = create_user_rq(dev, pd, udata, rwq, &ucmd);
        if (err) {
                mlx5_ib_dbg(dev, "err %d\n", err);
                return err;
@@@ -1347,7 -1347,7 +1347,7 @@@ static void set_wqname(struct mlx5_core
        struct mlx5_cmd *cmd = &dev->cmd;
  
        snprintf(cmd->wq_name, sizeof(cmd->wq_name), "mlx5_cmd_%s",
 -               dev_name(&dev->pdev->dev));
 +               dev->priv.name);
  }
  
  static void clean_debug_files(struct mlx5_core_dev *dev)
@@@ -1585,6 -1585,24 +1585,24 @@@ no_trig
        spin_unlock_irqrestore(&dev->cmd.alloc_lock, flags);
  }
  
+ void mlx5_cmd_flush(struct mlx5_core_dev *dev)
+ {
+       struct mlx5_cmd *cmd = &dev->cmd;
+       int i;
+
+       for (i = 0; i < cmd->max_reg_cmds; i++)
+               while (down_trylock(&cmd->sem))
+                       mlx5_cmd_trigger_completions(dev);
+
+       while (down_trylock(&cmd->pages_sem))
+               mlx5_cmd_trigger_completions(dev);
+
+       /* Unlock cmdif */
+       up(&cmd->pages_sem);
+       for (i = 0; i < cmd->max_reg_cmds; i++)
+               up(&cmd->sem);
+ }
+
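For context, the health.c hunk further down switches mlx5_enter_error_state() from mlx5_cmd_trigger_completions() to this flush, so a fatal error drains every command slot and then releases the semaphores:

        if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
                dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
                mlx5_cmd_flush(dev);    /* was mlx5_cmd_trigger_completions(dev) */
        }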
  static int status_to_err(u8 status)
  {
        return status ? -1 : 0; /* TBD more meaningful codes */
@@@ -1884,9 -1902,9 +1902,9 @@@ int mlx5_cmd_init(struct mlx5_core_dev 
        memset(cmd, 0, sizeof(*cmd));
        cmd_if_rev = cmdif_rev(dev);
        if (cmd_if_rev != CMD_IF_REV) {
 -              dev_err(&dev->pdev->dev,
 -                      "Driver cmdif rev(%d) differs from firmware's(%d)\n",
 -                      CMD_IF_REV, cmd_if_rev);
 +              mlx5_core_err(dev,
 +                            "Driver cmdif rev(%d) differs from firmware's(%d)\n",
 +                            CMD_IF_REV, cmd_if_rev);
                return -EINVAL;
        }
  
        cmd->log_sz = cmd_l >> 4 & 0xf;
        cmd->log_stride = cmd_l & 0xf;
        if (1 << cmd->log_sz > MLX5_MAX_COMMANDS) {
 -              dev_err(&dev->pdev->dev, "firmware reports too many outstanding commands %d\n",
 -                      1 << cmd->log_sz);
 +              mlx5_core_err(dev, "firmware reports too many outstanding commands %d\n",
 +                            1 << cmd->log_sz);
                err = -EINVAL;
                goto err_free_page;
        }
  
        if (cmd->log_sz + cmd->log_stride > MLX5_ADAPTER_PAGE_SHIFT) {
 -              dev_err(&dev->pdev->dev, "command queue size overflow\n");
 +              mlx5_core_err(dev, "command queue size overflow\n");
                err = -EINVAL;
                goto err_free_page;
        }
  
        cmd->cmdif_rev = ioread32be(&dev->iseg->cmdif_rev_fw_sub) >> 16;
        if (cmd->cmdif_rev > CMD_IF_REV) {
 -              dev_err(&dev->pdev->dev, "driver does not support command interface version. driver %d, firmware %d\n",
 -                      CMD_IF_REV, cmd->cmdif_rev);
 +              mlx5_core_err(dev, "driver does not support command interface version. driver %d, firmware %d\n",
 +                            CMD_IF_REV, cmd->cmdif_rev);
                err = -EOPNOTSUPP;
                goto err_free_page;
        }
        cmd_h = (u32)((u64)(cmd->dma) >> 32);
        cmd_l = (u32)(cmd->dma);
        if (cmd_l & 0xfff) {
 -              dev_err(&dev->pdev->dev, "invalid command queue address\n");
 +              mlx5_core_err(dev, "invalid command queue address\n");
                err = -ENOMEM;
                goto err_free_page;
        }
        set_wqname(dev);
        cmd->wq = create_singlethread_workqueue(cmd->wq_name);
        if (!cmd->wq) {
 -              dev_err(&dev->pdev->dev, "failed to create command workqueue\n");
 +              mlx5_core_err(dev, "failed to create command workqueue\n");
                err = -ENOMEM;
                goto err_cache;
        }
@@@ -76,15 -76,14 +76,14 @@@ struct page_pool
  #define MLX5_SKB_FRAG_SZ(len) (SKB_DATA_ALIGN(len) +  \
                                 SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
  
+ #define MLX5E_RX_MAX_HEAD (256)
  #define MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev) \
        (6 + MLX5_CAP_GEN(mdev, cache_line_128byte)) /* HW restriction */
  #define MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, req) \
        max_t(u32, MLX5_MPWRQ_MIN_LOG_STRIDE_SZ(mdev), req)
- #define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev)       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 6)
- #define MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, 8)
- #define MLX5E_MPWQE_STRIDE_SZ(mdev, cqe_cmprs) \
-       (cqe_cmprs ? MLX5_MPWRQ_CQE_CMPRS_LOG_STRIDE_SZ(mdev) : \
-       MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev))
+ #define MLX5_MPWRQ_DEF_LOG_STRIDE_SZ(mdev) \
+       MLX5_MPWRQ_LOG_STRIDE_SZ(mdev, order_base_2(MLX5E_RX_MAX_HEAD))
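A quick worked check of the new default, using only the macros above: order_base_2(MLX5E_RX_MAX_HEAD) = order_base_2(256) = 8, while MLX5_MPWRQ_MIN_LOG_STRIDE_SZ is 6 or 7 depending on cache_line_128byte, so MLX5_MPWRQ_DEF_LOG_STRIDE_SZ resolves to max(6 or 7, 8) = 8, i.e. 256-byte strides. That is the value the removed CQE-compression variant hard-coded, which is why the separate MLX5E_MPWQE_STRIDE_SZ selector can go away.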
  
  #define MLX5_MPWRQ_LOG_WQE_SZ                 18
  #define MLX5_MPWRQ_WQE_PAGE_ORDER  (MLX5_MPWRQ_LOG_WQE_SZ - PAGE_SHIFT > 0 ? \
  
  #define MLX5E_PARAMS_MINIMUM_LOG_RQ_SIZE_MPW            0x2
  
- #define MLX5E_RX_MAX_HEAD (256)
  #define MLX5E_PARAMS_DEFAULT_LRO_WQE_SZ                 (64 * 1024)
  #define MLX5E_DEFAULT_LRO_TIMEOUT                       32
  #define MLX5E_LRO_TIMEOUT_ARR_SIZE                      4
@@@ -309,16 -306,18 +306,18 @@@ struct mlx5e_cq 
        struct mlx5_core_cq        mcq;
        struct mlx5e_channel      *channel;
  
+       /* control */
+       struct mlx5_core_dev      *mdev;
+       struct mlx5_wq_ctrl        wq_ctrl;
+ } ____cacheline_aligned_in_smp;
+
+ struct mlx5e_cq_decomp {
        /* cqe decompression */
        struct mlx5_cqe64          title;
        struct mlx5_mini_cqe8      mini_arr[MLX5_MINI_CQE_ARRAY_SIZE];
        u8                         mini_arr_idx;
-       u16                        decmprs_left;
-       u16                        decmprs_wqe_counter;
-       /* control */
-       struct mlx5_core_dev      *mdev;
-       struct mlx5_wq_ctrl        wq_ctrl;
+       u16                        left;
+       u16                        wqe_counter;
  } ____cacheline_aligned_in_smp;
  
  struct mlx5e_tx_wqe_info {
@@@ -388,10 -387,7 +387,7 @@@ struct mlx5e_txqsq 
        struct mlx5e_channel      *channel;
        int                        txq_ix;
        u32                        rate_limit;
-       struct mlx5e_txqsq_recover {
-               struct work_struct         recover_work;
-               u64                        last_recover;
-       } recover;
+       struct work_struct         recover_work;
  } ____cacheline_aligned_in_smp;
  
  struct mlx5e_dma_info {
@@@ -581,6 -577,7 +577,7 @@@ struct mlx5e_rq 
        struct net_device     *netdev;
        struct mlx5e_rq_stats *stats;
        struct mlx5e_cq        cq;
+       struct mlx5e_cq_decomp cqd;
        struct mlx5e_page_cache page_cache;
        struct hwtstamp_config *tstamp;
        struct mlx5_clock      *clock;
@@@ -638,6 -635,7 +635,7 @@@ struct mlx5e_channel 
        struct hwtstamp_config    *tstamp;
        int                        ix;
        int                        cpu;
+       cpumask_var_t              xps_cpumask;
  };
  
  struct mlx5e_channels {
@@@ -657,6 -655,7 +655,7 @@@ struct mlx5e_channel_stats 
  enum {
        MLX5E_STATE_OPENED,
        MLX5E_STATE_DESTROYING,
+       MLX5E_STATE_XDP_TX_ENABLED,
  };
  
  struct mlx5e_rqt {
@@@ -682,6 -681,13 +681,13 @@@ struct mlx5e_rss_params 
        u8      hfunc;
  };
  
+ struct mlx5e_modify_sq_param {
+       int curr_state;
+       int next_state;
+       int rl_update;
+       int rl_index;
+ };
+
  struct mlx5e_priv {
        /* priv data path fields - start */
        struct mlx5e_txqsq *txq2sq[MLX5E_MAX_NUM_CHANNELS * MLX5E_MAX_NUM_TC];
  #ifdef CONFIG_MLX5_EN_TLS
        struct mlx5e_tls          *tls;
  #endif
+       struct devlink_health_reporter *tx_reporter;
  };
  
  struct mlx5e_profile {
@@@ -803,6 -810,7 +810,7 @@@ mlx5e_skb_from_cqe_nonlinear(struct mlx
  
  void mlx5e_update_stats(struct mlx5e_priv *priv);
  void mlx5e_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats);
+ void mlx5e_fold_sw_stats64(struct mlx5e_priv *priv, struct rtnl_link_stats64 *s);
  
  void mlx5e_init_l2_addr(struct mlx5e_priv *priv);
  int mlx5e_self_test_num(struct mlx5e_priv *priv);
@@@ -850,9 -858,9 +858,9 @@@ void mlx5e_close_channels(struct mlx5e_
   * switching channels
   */
  typedef int (*mlx5e_fp_hw_modify)(struct mlx5e_priv *priv);
- void mlx5e_switch_priv_channels(struct mlx5e_priv *priv,
-                               struct mlx5e_channels *new_chs,
-                               mlx5e_fp_hw_modify hw_modify);
+ int mlx5e_safe_switch_channels(struct mlx5e_priv *priv,
+                              struct mlx5e_channels *new_chs,
+                              mlx5e_fp_hw_modify hw_modify);
  void mlx5e_activate_priv_channels(struct mlx5e_priv *priv);
  void mlx5e_deactivate_priv_channels(struct mlx5e_priv *priv);
  
@@@ -866,6 -874,11 +874,11 @@@ void mlx5e_set_rq_type(struct mlx5_core
  void mlx5e_init_rq_type_params(struct mlx5_core_dev *mdev,
                               struct mlx5e_params *params);
  
+ int mlx5e_modify_sq(struct mlx5_core_dev *mdev, u32 sqn,
+                   struct mlx5e_modify_sq_param *p);
+ void mlx5e_activate_txqsq(struct mlx5e_txqsq *sq);
+ void mlx5e_tx_disable_queue(struct netdev_queue *txq);
+
  static inline bool mlx5e_tunnel_inner_ft_supported(struct mlx5_core_dev *mdev)
  {
        return (MLX5_CAP_ETH(mdev, tunnel_stateless_gre) &&
@@@ -916,7 -929,7 +929,7 @@@ void mlx5e_notify_hw(struct mlx5_wq_cy
         */
        wmb();
  
 -      mlx5_write64((__be32 *)ctrl, uar_map, NULL);
 +      mlx5_write64((__be32 *)ctrl, uar_map);
  }
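Both doorbell call sites in this merge drop the trailing NULL, which matches mlx5_write64() losing its doorbell-lock parameter; at each caller the change is simply:

        - mlx5_write64((__be32 *)ctrl, uar_map, NULL);  /* old: optional lock pointer        */
        + mlx5_write64((__be32 *)ctrl, uar_map);        /* new: plain 64-bit doorbell write  */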
  
  static inline void mlx5e_cq_arm(struct mlx5e_cq *cq)
  #include "en.h"
  #include "fs_core.h"
  #include "lib/devcom.h"
+ #include "ecpf.h"
+ #include "lib/eq.h"
  
 -enum {
 -      FDB_FAST_PATH = 0,
 -      FDB_SLOW_PATH
 -};
 -
  /* There are two match-all miss flows, one for unicast dst mac and
   * one for multicast.
   */
  #define fdb_prio_table(esw, chain, prio, level) \
        (esw)->fdb_table.offloads.fdb_prio[(chain)][(prio)][(level)]
  
+ #define UPLINK_REP_INDEX 0
+
+ /* The rep getter/iterator are only valid after esw->total_vports
+  * and vport->vport are initialized in mlx5_eswitch_init.
+  */
+ #define mlx5_esw_for_all_reps(esw, i, rep)                    \
+       for ((i) = MLX5_VPORT_PF;                               \
+            (rep) = &(esw)->offloads.vport_reps[i],            \
+            (i) < (esw)->total_vports; (i)++)
+
+ #define mlx5_esw_for_each_vf_rep(esw, i, rep, nvfs)           \
+       for ((i) = MLX5_VPORT_FIRST_VF;                         \
+            (rep) = &(esw)->offloads.vport_reps[i],            \
+            (i) <= (nvfs); (i)++)
+
+ #define mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, nvfs)   \
+       for ((i) = (nvfs);                                      \
+            (rep) = &(esw)->offloads.vport_reps[i],            \
+            (i) >= MLX5_VPORT_FIRST_VF; (i)--)
+
+ #define mlx5_esw_for_each_vf_vport(esw, vport, nvfs)          \
+       for ((vport) = MLX5_VPORT_FIRST_VF;                     \
+            (vport) <= (nvfs); (vport)++)
+
+ #define mlx5_esw_for_each_vf_vport_reverse(esw, vport, nvfs)  \
+       for ((vport) = (nvfs);                                  \
+            (vport) >= MLX5_VPORT_FIRST_VF; (vport)--)
+
+ static struct mlx5_eswitch_rep *mlx5_eswitch_get_rep(struct mlx5_eswitch *esw,
+                                                    u16 vport_num)
+ {
+       u16 idx = mlx5_eswitch_vport_num_to_index(esw, vport_num);
+
+       WARN_ON(idx > esw->total_vports - 1);
+       return &esw->offloads.vport_reps[idx];
+ }
+
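A sketch of how the new iterators are meant to be used (hypothetical caller; nvfs and do_something() are placeholders, not driver symbols). As the comment above says, this is only valid once esw->total_vports and the reps' vport numbers have been set up in mlx5_eswitch_init():

        struct mlx5_eswitch_rep *rep;
        int i;

        mlx5_esw_for_each_vf_rep(esw, i, rep, nvfs)
                do_something(rep);      /* per-VF-rep work, placeholder */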
  static struct mlx5_flow_table *
  esw_get_prio_table(struct mlx5_eswitch *esw, u32 chain, u16 prio, int level);
  static void
@@@ -160,14 -204,15 +199,15 @@@ mlx5_eswitch_add_offloaded_rule(struct 
                MLX5_SET_TO_ONES(fte_match_set_misc, misc,
                                 source_eswitch_owner_vhca_id);
  
-       if (attr->match_level == MLX5_MATCH_NONE)
-               spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
-       else
-               spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS |
-                                             MLX5_MATCH_MISC_PARAMETERS;
-       if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DECAP)
-               spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS;
+       spec->match_criteria_enable = MLX5_MATCH_MISC_PARAMETERS;
+       if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_DECAP) {
+               if (attr->tunnel_match_level != MLX5_MATCH_NONE)
+                       spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
+               if (attr->match_level != MLX5_MATCH_NONE)
+                       spec->match_criteria_enable |= MLX5_MATCH_INNER_HEADERS;
+       } else if (attr->match_level != MLX5_MATCH_NONE) {
+               spec->match_criteria_enable |= MLX5_MATCH_OUTER_HEADERS;
+       }
  
        if (flow_act.action & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
                flow_act.modify_id = attr->mod_hdr_id;
@@@ -318,7 -363,7 +358,7 @@@ static int esw_set_global_vlan_pop(stru
        esw_debug(esw->dev, "%s applying global %s policy\n", __func__, val ? "pop" : "none");
        for (vf_vport = 1; vf_vport < esw->enabled_vports; vf_vport++) {
                rep = &esw->offloads.vport_reps[vf_vport];
-               if (!rep->rep_if[REP_ETH].valid)
+               if (rep->rep_if[REP_ETH].state != REP_LOADED)
                        continue;
  
                err = __mlx5_eswitch_set_vport_vlan(esw, rep->vport, 0, 0, val);
@@@ -516,7 -561,8 +556,8 @@@ mlx5_eswitch_add_send_to_vport_rule(str
  
        misc = MLX5_ADDR_OF(fte_match_param, spec->match_value, misc_parameters);
        MLX5_SET(fte_match_set_misc, misc, source_sqn, sqn);
-       MLX5_SET(fte_match_set_misc, misc, source_port, 0x0); /* source vport is 0 */
+       /* source vport is the esw manager */
+       MLX5_SET(fte_match_set_misc, misc, source_port, esw->manager_vport);
  
        misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, misc_parameters);
        MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_sqn);
@@@ -561,7 -607,7 +602,7 @@@ static void peer_miss_rules_setup(struc
                         source_eswitch_owner_vhca_id);
  
        dest->type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-       dest->vport.num = 0;
+       dest->vport.num = peer_dev->priv.eswitch->manager_vport;
        dest->vport.vhca_id = MLX5_CAP_GEN(peer_dev, vhca_id);
        dest->vport.flags |= MLX5_FLOW_DEST_VPORT_VHCA_ID;
  }
@@@ -595,14 -641,35 +636,35 @@@ static int esw_add_fdb_peer_miss_rules(
        misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                            misc_parameters);
  
-       for (i = 1; i < nvports; i++) {
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_PF);
+               flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
+                                          spec, &flow_act, &dest, 1);
+               if (IS_ERR(flow)) {
+                       err = PTR_ERR(flow);
+                       goto add_pf_flow_err;
+               }
+               flows[MLX5_VPORT_PF] = flow;
+       }
+       if (mlx5_ecpf_vport_exists(esw->dev)) {
+               MLX5_SET(fte_match_set_misc, misc, source_port, MLX5_VPORT_ECPF);
+               flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
+                                          spec, &flow_act, &dest, 1);
+               if (IS_ERR(flow)) {
+                       err = PTR_ERR(flow);
+                       goto add_ecpf_flow_err;
+               }
+               flows[mlx5_eswitch_ecpf_idx(esw)] = flow;
+       }
+       mlx5_esw_for_each_vf_vport(esw, i, mlx5_core_max_vfs(esw->dev)) {
                MLX5_SET(fte_match_set_misc, misc, source_port, i);
                flow = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb,
                                           spec, &flow_act, &dest, 1);
                if (IS_ERR(flow)) {
                        err = PTR_ERR(flow);
-                       esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err);
-                       goto add_flow_err;
+                       goto add_vf_flow_err;
                }
                flows[i] = flow;
        }
        kvfree(spec);
        return 0;
  
- add_flow_err:
-       for (i--; i > 0; i--)
+ add_vf_flow_err:
+       nvports = --i;
+       mlx5_esw_for_each_vf_vport_reverse(esw, i, nvports)
                mlx5_del_flow_rules(flows[i]);
+       if (mlx5_ecpf_vport_exists(esw->dev))
+               mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]);
+ add_ecpf_flow_err:
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+               mlx5_del_flow_rules(flows[MLX5_VPORT_PF]);
+ add_pf_flow_err:
+       esw_warn(esw->dev, "FDB: Failed to add peer miss flow rule err %d\n", err);
        kvfree(flows);
  alloc_flows_err:
        kvfree(spec);
@@@ -628,9 -704,15 +699,15 @@@ static void esw_del_fdb_peer_miss_rules
  
        flows = esw->fdb_table.offloads.peer_miss_rules;
  
-       for (i = 1; i < esw->total_vports; i++)
+       mlx5_esw_for_each_vf_vport_reverse(esw, i, mlx5_core_max_vfs(esw->dev))
                mlx5_del_flow_rules(flows[i]);
  
+       if (mlx5_ecpf_vport_exists(esw->dev))
+               mlx5_del_flow_rules(flows[mlx5_eswitch_ecpf_idx(esw)]);
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev))
+               mlx5_del_flow_rules(flows[MLX5_VPORT_PF]);
        kvfree(flows);
  }
  
@@@ -660,7 -742,7 +737,7 @@@ static int esw_add_fdb_miss_rule(struc
        dmac_c[0] = 0x01;
  
        dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT;
-       dest.vport.num = 0;
+       dest.vport.num = esw->manager_vport;
        flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
  
        flow_rule = mlx5_add_flow_rules(esw->fdb_table.offloads.slow_fdb, spec,
@@@ -1168,7 -1250,8 +1245,8 @@@ static int esw_offloads_start(struct ml
  {
        int err, err1, num_vfs = esw->dev->priv.sriov.num_vfs;
  
-       if (esw->mode != SRIOV_LEGACY) {
+       if (esw->mode != SRIOV_LEGACY &&
+           !mlx5_core_is_ecpf_esw_manager(esw->dev)) {
                NL_SET_ERR_MSG_MOD(extack,
                                   "Can't set offloads mode, SRIOV legacy not enabled");
                return -EINVAL;
@@@ -1206,9 -1289,8 +1284,8 @@@ int esw_offloads_init_reps(struct mlx5_
  {
        int total_vfs = MLX5_TOTAL_VPORTS(esw->dev);
        struct mlx5_core_dev *dev = esw->dev;
-       struct mlx5_esw_offload *offloads;
        struct mlx5_eswitch_rep *rep;
-       u8 hw_id[ETH_ALEN];
+       u8 hw_id[ETH_ALEN], rep_type;
        int vport;
  
        esw->offloads.vport_reps = kcalloc(total_vfs,
        if (!esw->offloads.vport_reps)
                return -ENOMEM;
  
-       offloads = &esw->offloads;
        mlx5_query_nic_vport_mac_address(dev, 0, hw_id);
  
-       for (vport = 0; vport < total_vfs; vport++) {
-               rep = &offloads->vport_reps[vport];
-               rep->vport = vport;
+       mlx5_esw_for_all_reps(esw, vport, rep) {
+               rep->vport = mlx5_eswitch_index_to_vport_num(esw, vport);
                ether_addr_copy(rep->hw_id, hw_id);
-       }
  
-       offloads->vport_reps[0].vport = MLX5_VPORT_UPLINK;
+               for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++)
+                       rep->rep_if[rep_type].state = REP_UNREGISTERED;
+       }
  
        return 0;
  }
  
- static void esw_offloads_unload_reps_type(struct mlx5_eswitch *esw, int nvports,
-                                         u8 rep_type)
+ static void __esw_offloads_unload_rep(struct mlx5_eswitch *esw,
+                                     struct mlx5_eswitch_rep *rep, u8 rep_type)
+ {
+       if (rep->rep_if[rep_type].state != REP_LOADED)
+               return;
+       rep->rep_if[rep_type].unload(rep);
+       rep->rep_if[rep_type].state = REP_REGISTERED;
+ }
+
+ static void __unload_reps_special_vport(struct mlx5_eswitch *esw, u8 rep_type)
  {
        struct mlx5_eswitch_rep *rep;
-       int vport;
  
-       for (vport = nvports - 1; vport >= 0; vport--) {
-               rep = &esw->offloads.vport_reps[vport];
-               if (!rep->rep_if[rep_type].valid)
-                       continue;
+       if (mlx5_ecpf_vport_exists(esw->dev)) {
+               rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_ECPF);
+               __esw_offloads_unload_rep(esw, rep, rep_type);
+       }
  
-               rep->rep_if[rep_type].unload(rep);
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_PF);
+               __esw_offloads_unload_rep(esw, rep, rep_type);
        }
+       rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
+       __esw_offloads_unload_rep(esw, rep, rep_type);
+ }
+
+ static void __unload_reps_vf_vport(struct mlx5_eswitch *esw, int nvports,
+                                  u8 rep_type)
+ {
+       struct mlx5_eswitch_rep *rep;
+       int i;
+       mlx5_esw_for_each_vf_rep_reverse(esw, i, rep, nvports)
+               __esw_offloads_unload_rep(esw, rep, rep_type);
+ }
+
+ static void esw_offloads_unload_vf_reps(struct mlx5_eswitch *esw, int nvports)
+ {
+       u8 rep_type = NUM_REP_TYPES;
+       while (rep_type-- > 0)
+               __unload_reps_vf_vport(esw, nvports, rep_type);
+ }
+
+ static void __unload_reps_all_vport(struct mlx5_eswitch *esw, int nvports,
+                                   u8 rep_type)
+ {
+       __unload_reps_vf_vport(esw, nvports, rep_type);
+       /* Special vports must be the last to unload. */
+       __unload_reps_special_vport(esw, rep_type);
  }
  
- static void esw_offloads_unload_reps(struct mlx5_eswitch *esw, int nvports)
+ static void esw_offloads_unload_all_reps(struct mlx5_eswitch *esw, int nvports)
  {
        u8 rep_type = NUM_REP_TYPES;
  
        while (rep_type-- > 0)
-               esw_offloads_unload_reps_type(esw, nvports, rep_type);
+               __unload_reps_all_vport(esw, nvports, rep_type);
+ }
+
+ static int __esw_offloads_load_rep(struct mlx5_eswitch *esw,
+                                  struct mlx5_eswitch_rep *rep, u8 rep_type)
+ {
+       int err = 0;
+       if (rep->rep_if[rep_type].state != REP_REGISTERED)
+               return 0;
+       err = rep->rep_if[rep_type].load(esw->dev, rep);
+       if (err)
+               return err;
+       rep->rep_if[rep_type].state = REP_LOADED;
+       return 0;
  }
  
- static int esw_offloads_load_reps_type(struct mlx5_eswitch *esw, int nvports,
-                                      u8 rep_type)
+ static int __load_reps_special_vport(struct mlx5_eswitch *esw, u8 rep_type)
  {
        struct mlx5_eswitch_rep *rep;
-       int vport;
        int err;
  
-       for (vport = 0; vport < nvports; vport++) {
-               rep = &esw->offloads.vport_reps[vport];
-               if (!rep->rep_if[rep_type].valid)
-                       continue;
+       rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
+       err = __esw_offloads_load_rep(esw, rep, rep_type);
+       if (err)
+               return err;
  
-               err = rep->rep_if[rep_type].load(esw->dev, rep);
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_PF);
+               err = __esw_offloads_load_rep(esw, rep, rep_type);
                if (err)
-                       goto err_reps;
+                       goto err_pf;
+       }
+       if (mlx5_ecpf_vport_exists(esw->dev)) {
+               rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_ECPF);
+               err = __esw_offloads_load_rep(esw, rep, rep_type);
+               if (err)
+                       goto err_ecpf;
        }
  
        return 0;
  
+ err_ecpf:
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_PF);
+               __esw_offloads_unload_rep(esw, rep, rep_type);
+       }
+ err_pf:
+       rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
+       __esw_offloads_unload_rep(esw, rep, rep_type);
+       return err;
+ }
+
+ static int __load_reps_vf_vport(struct mlx5_eswitch *esw, int nvports,
+                               u8 rep_type)
+ {
+       struct mlx5_eswitch_rep *rep;
+       int err, i;
+       mlx5_esw_for_each_vf_rep(esw, i, rep, nvports) {
+               err = __esw_offloads_load_rep(esw, rep, rep_type);
+               if (err)
+                       goto err_vf;
+       }
+       return 0;
+ err_vf:
+       __unload_reps_vf_vport(esw, --i, rep_type);
+       return err;
+ }
+
+ static int esw_offloads_load_vf_reps(struct mlx5_eswitch *esw, int nvports)
+ {
+       u8 rep_type = 0;
+       int err;
+       for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
+               err = __load_reps_vf_vport(esw, nvports, rep_type);
+               if (err)
+                       goto err_reps;
+       }
+       return err;
  err_reps:
-       esw_offloads_unload_reps_type(esw, vport, rep_type);
+       while (rep_type-- > 0)
+               __unload_reps_vf_vport(esw, nvports, rep_type);
+       return err;
+ }
+
+ static int __load_reps_all_vport(struct mlx5_eswitch *esw, int nvports,
+                                u8 rep_type)
+ {
+       int err;
+       /* Special vports must be loaded first. */
+       err = __load_reps_special_vport(esw, rep_type);
+       if (err)
+               return err;
+       err = __load_reps_vf_vport(esw, nvports, rep_type);
+       if (err)
+               goto err_vfs;
+       return 0;
+ err_vfs:
+       __unload_reps_special_vport(esw, rep_type);
        return err;
  }
  
- static int esw_offloads_load_reps(struct mlx5_eswitch *esw, int nvports)
+ static int esw_offloads_load_all_reps(struct mlx5_eswitch *esw, int nvports)
  {
        u8 rep_type = 0;
        int err;
  
        for (rep_type = 0; rep_type < NUM_REP_TYPES; rep_type++) {
-               err = esw_offloads_load_reps_type(esw, nvports, rep_type);
+               err = __load_reps_all_vport(esw, nvports, rep_type);
                if (err)
                        goto err_reps;
        }
  
  err_reps:
        while (rep_type-- > 0)
-               esw_offloads_unload_reps_type(esw, nvports, rep_type);
+               __unload_reps_all_vport(esw, nvports, rep_type);
        return err;
  }
  
@@@ -1397,7 -1607,7 +1602,7 @@@ static void esw_offloads_devcom_cleanup
        mlx5_devcom_unregister_component(devcom, MLX5_DEVCOM_ESW_OFFLOADS);
  }
  
- int esw_offloads_init(struct mlx5_eswitch *esw, int nvports)
+ static int esw_offloads_steering_init(struct mlx5_eswitch *esw, int nvports)
  {
        int err;
  
        if (err)
                goto create_fg_err;
  
-       err = esw_offloads_load_reps(esw, nvports);
-       if (err)
-               goto err_reps;
-       esw_offloads_devcom_init(esw);
        return 0;
  
- err_reps:
-       esw_destroy_vport_rx_group(esw);
  create_fg_err:
        esw_destroy_offloads_table(esw);
  
@@@ -1434,6 -1636,95 +1631,95 @@@ create_ft_err
        return err;
  }
  
+ static void esw_offloads_steering_cleanup(struct mlx5_eswitch *esw)
+ {
+       esw_destroy_vport_rx_group(esw);
+       esw_destroy_offloads_table(esw);
+       esw_destroy_offloads_fdb_tables(esw);
+ }
+
+ static void esw_host_params_event_handler(struct work_struct *work)
+ {
+       struct mlx5_host_work *host_work;
+       struct mlx5_eswitch *esw;
+       int err, num_vf = 0;
+       host_work = container_of(work, struct mlx5_host_work, work);
+       esw = host_work->esw;
+       err = mlx5_query_host_params_num_vfs(esw->dev, &num_vf);
+       if (err || num_vf == esw->host_info.num_vfs)
+               goto out;
+       /* Number of VFs can only change from "0 to x" or "x to 0". */
+       if (esw->host_info.num_vfs > 0) {
+               esw_offloads_unload_vf_reps(esw, esw->host_info.num_vfs);
+       } else {
+               err = esw_offloads_load_vf_reps(esw, num_vf);
+               if (err)
+                       goto out;
+       }
+       esw->host_info.num_vfs = num_vf;
+ out:
+       kfree(host_work);
+ }
+
+ static int esw_host_params_event(struct notifier_block *nb,
+                                unsigned long type, void *data)
+ {
+       struct mlx5_host_work *host_work;
+       struct mlx5_host_info *host_info;
+       struct mlx5_eswitch *esw;
+       host_work = kzalloc(sizeof(*host_work), GFP_ATOMIC);
+       if (!host_work)
+               return NOTIFY_DONE;
+       host_info = mlx5_nb_cof(nb, struct mlx5_host_info, nb);
+       esw = container_of(host_info, struct mlx5_eswitch, host_info);
+       host_work->esw = esw;
+       INIT_WORK(&host_work->work, esw_host_params_event_handler);
+       queue_work(esw->work_queue, &host_work->work);
+       return NOTIFY_OK;
+ }
+
+ int esw_offloads_init(struct mlx5_eswitch *esw, int vf_nvports,
+                     int total_nvports)
+ {
+       int err;
+       mutex_init(&esw->fdb_table.offloads.fdb_prio_lock);
+       err = esw_offloads_steering_init(esw, total_nvports);
+       if (err)
+               return err;
+       err = esw_offloads_load_all_reps(esw, vf_nvports);
+       if (err)
+               goto err_reps;
+       esw_offloads_devcom_init(esw);
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               MLX5_NB_INIT(&esw->host_info.nb, esw_host_params_event,
+                            HOST_PARAMS_CHANGE);
+               mlx5_eq_notifier_register(esw->dev, &esw->host_info.nb);
+               esw->host_info.num_vfs = vf_nvports;
+       }
+       return 0;
+ err_reps:
+       esw_offloads_steering_cleanup(esw);
+       return err;
+ }
+
  static int esw_offloads_stop(struct mlx5_eswitch *esw,
                             struct netlink_ext_ack *extack)
  {
        return err;
  }
  
- void esw_offloads_cleanup(struct mlx5_eswitch *esw, int nvports)
+ void esw_offloads_cleanup(struct mlx5_eswitch *esw)
  {
+       u16 num_vfs;
+       if (mlx5_core_is_ecpf_esw_manager(esw->dev)) {
+               mlx5_eq_notifier_unregister(esw->dev, &esw->host_info.nb);
+               flush_workqueue(esw->work_queue);
+               num_vfs = esw->host_info.num_vfs;
+       } else {
+               num_vfs = esw->dev->priv.sriov.num_vfs;
+       }
        esw_offloads_devcom_cleanup(esw);
-       esw_offloads_unload_reps(esw, nvports);
-       esw_destroy_vport_rx_group(esw);
-       esw_destroy_offloads_table(esw);
-       esw_destroy_offloads_fdb_tables(esw);
+       esw_offloads_unload_all_reps(esw, num_vfs);
+       esw_offloads_steering_cleanup(esw);
  }
  
  static int esw_mode_from_devlink(u16 mode, u16 *mlx5_mode)
@@@ -1548,7 -1847,8 +1842,8 @@@ static int mlx5_devlink_eswitch_check(s
        if (!MLX5_ESWITCH_MANAGER(dev))
                return -EPERM;
  
-       if (dev->priv.eswitch->mode == SRIOV_NONE)
+       if (dev->priv.eswitch->mode == SRIOV_NONE &&
+           !mlx5_core_is_ecpf_esw_manager(dev))
                return -EOPNOTSUPP;
  
        return 0;
@@@ -1760,47 -2060,45 +2055,45 @@@ int mlx5_devlink_eswitch_encap_mode_get
        return 0;
  }
  
- void mlx5_eswitch_register_vport_rep(struct mlx5_eswitch *esw,
-                                    int vport_index,
-                                    struct mlx5_eswitch_rep_if *__rep_if,
-                                    u8 rep_type)
+ void mlx5_eswitch_register_vport_reps(struct mlx5_eswitch *esw,
+                                     struct mlx5_eswitch_rep_if *__rep_if,
+                                     u8 rep_type)
  {
-       struct mlx5_esw_offload *offloads = &esw->offloads;
        struct mlx5_eswitch_rep_if *rep_if;
+       struct mlx5_eswitch_rep *rep;
+       int i;
  
-       rep_if = &offloads->vport_reps[vport_index].rep_if[rep_type];
-       rep_if->load   = __rep_if->load;
-       rep_if->unload = __rep_if->unload;
-       rep_if->get_proto_dev = __rep_if->get_proto_dev;
-       rep_if->priv = __rep_if->priv;
+       mlx5_esw_for_all_reps(esw, i, rep) {
+               rep_if = &rep->rep_if[rep_type];
+               rep_if->load   = __rep_if->load;
+               rep_if->unload = __rep_if->unload;
+               rep_if->get_proto_dev = __rep_if->get_proto_dev;
+               rep_if->priv = __rep_if->priv;
  
-       rep_if->valid = true;
+               rep_if->state = REP_REGISTERED;
+       }
  }
- EXPORT_SYMBOL(mlx5_eswitch_register_vport_rep);
+ EXPORT_SYMBOL(mlx5_eswitch_register_vport_reps);
  
- void mlx5_eswitch_unregister_vport_rep(struct mlx5_eswitch *esw,
-                                      int vport_index, u8 rep_type)
+ void mlx5_eswitch_unregister_vport_reps(struct mlx5_eswitch *esw, u8 rep_type)
  {
-       struct mlx5_esw_offload *offloads = &esw->offloads;
+       u16 max_vf = mlx5_core_max_vfs(esw->dev);
        struct mlx5_eswitch_rep *rep;
+       int i;
  
-       rep = &offloads->vport_reps[vport_index];
-       if (esw->mode == SRIOV_OFFLOADS && esw->vports[vport_index].enabled)
-               rep->rep_if[rep_type].unload(rep);
+       if (esw->mode == SRIOV_OFFLOADS)
+               __unload_reps_all_vport(esw, max_vf, rep_type);
  
-       rep->rep_if[rep_type].valid = false;
+       mlx5_esw_for_all_reps(esw, i, rep)
+               rep->rep_if[rep_type].state = REP_UNREGISTERED;
  }
- EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_rep);
+ EXPORT_SYMBOL(mlx5_eswitch_unregister_vport_reps);
  
  void *mlx5_eswitch_get_uplink_priv(struct mlx5_eswitch *esw, u8 rep_type)
  {
- #define UPLINK_REP_INDEX 0
-       struct mlx5_esw_offload *offloads = &esw->offloads;
        struct mlx5_eswitch_rep *rep;
  
-       rep = &offloads->vport_reps[UPLINK_REP_INDEX];
+       rep = mlx5_eswitch_get_rep(esw, MLX5_VPORT_UPLINK);
        return rep->rep_if[rep_type].priv;
  }
  
@@@ -1808,15 -2106,11 +2101,11 @@@ void *mlx5_eswitch_get_proto_dev(struc
                                 int vport,
                                 u8 rep_type)
  {
-       struct mlx5_esw_offload *offloads = &esw->offloads;
        struct mlx5_eswitch_rep *rep;
  
-       if (vport == MLX5_VPORT_UPLINK)
-               vport = UPLINK_REP_INDEX;
-       rep = &offloads->vport_reps[vport];
+       rep = mlx5_eswitch_get_rep(esw, vport);
  
-       if (rep->rep_if[rep_type].valid &&
+       if (rep->rep_if[rep_type].state == REP_LOADED &&
            rep->rep_if[rep_type].get_proto_dev)
                return rep->rep_if[rep_type].get_proto_dev(rep);
        return NULL;
@@@ -1825,13 -2119,13 +2114,13 @@@ EXPORT_SYMBOL(mlx5_eswitch_get_proto_de
  
  void *mlx5_eswitch_uplink_get_proto_dev(struct mlx5_eswitch *esw, u8 rep_type)
  {
-       return mlx5_eswitch_get_proto_dev(esw, UPLINK_REP_INDEX, rep_type);
+       return mlx5_eswitch_get_proto_dev(esw, MLX5_VPORT_UPLINK, rep_type);
  }
  EXPORT_SYMBOL(mlx5_eswitch_uplink_get_proto_dev);
  
  struct mlx5_eswitch_rep *mlx5_eswitch_vport_rep(struct mlx5_eswitch *esw,
                                                int vport)
  {
-       return &esw->offloads.vport_reps[vport];
+       return mlx5_eswitch_get_rep(esw, vport);
  }
  EXPORT_SYMBOL(mlx5_eswitch_vport_rep);
@@@ -263,10 -263,11 +263,11 @@@ static void nested_down_write_ref_node(
        }
  }
  
- static void down_write_ref_node(struct fs_node *node)
+ static void down_write_ref_node(struct fs_node *node, bool locked)
  {
        if (node) {
-               down_write(&node->lock);
+               if (!locked)
+                       down_write(&node->lock);
                refcount_inc(&node->refcount);
        }
  }
@@@ -277,13 -278,14 +278,14 @@@ static void up_read_ref_node(struct fs_
        up_read(&node->lock);
  }
  
- static void up_write_ref_node(struct fs_node *node)
+ static void up_write_ref_node(struct fs_node *node, bool locked)
  {
        refcount_dec(&node->refcount);
-       up_write(&node->lock);
+       if (!locked)
+               up_write(&node->lock);
  }
  
- static void tree_put_node(struct fs_node *node)
+ static void tree_put_node(struct fs_node *node, bool locked)
  {
        struct fs_node *parent_node = node->parent;
  
                        /* Only root namespace doesn't have parent and we just
                         * need to free its node.
                         */
-                       down_write_ref_node(parent_node);
+                       down_write_ref_node(parent_node, locked);
                        list_del_init(&node->list);
                        if (node->del_sw_func)
                                node->del_sw_func(node);
-                       up_write_ref_node(parent_node);
+                       up_write_ref_node(parent_node, locked);
                } else {
                        kfree(node);
                }
                node = NULL;
        }
        if (!node && parent_node)
-               tree_put_node(parent_node);
+               tree_put_node(parent_node, locked);
  }
  
- static int tree_remove_node(struct fs_node *node)
+ static int tree_remove_node(struct fs_node *node, bool locked)
  {
        if (refcount_read(&node->refcount) > 1) {
                refcount_dec(&node->refcount);
                return -EEXIST;
        }
-       tree_put_node(node);
+       tree_put_node(node, locked);
        return 0;
  }
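The new bool tells these reference helpers whether the caller already holds node->lock: the refcount is always adjusted, but the rwsem is only taken or released when it is false. The consolidated deletion path later in this file uses both forms; roughly (sketch of that calling pattern, rule being a child of the locked FTE):

        down_write_ref_node(&fte->node, false);         /* takes fte->node.lock  */
        tree_remove_node(&rule->node, true);            /* FTE lock already held */
        up_write_ref_node(&fte->node, false);           /* drops fte->node.lock  */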
  
@@@ -398,6 -400,7 +400,7 @@@ static void del_hw_flow_table(struct fs
        fs_get_obj(ft, node);
        dev = get_dev(&ft->node);
        root = find_root(&ft->node);
+       trace_mlx5_fs_del_ft(ft);
  
        if (node->active) {
                err = root->cmds->destroy_flow_table(dev, ft);
@@@ -419,22 -422,34 +422,34 @@@ static void del_sw_flow_table(struct fs
        kfree(ft);
  }
  
- static void del_sw_hw_rule(struct fs_node *node)
+ static void modify_fte(struct fs_fte *fte)
  {
        struct mlx5_flow_root_namespace *root;
-       struct mlx5_flow_rule *rule;
        struct mlx5_flow_table *ft;
        struct mlx5_flow_group *fg;
-       struct fs_fte *fte;
-       int modify_mask;
-       struct mlx5_core_dev *dev = get_dev(node);
+       struct mlx5_core_dev *dev;
        int err;
-       bool update_fte = false;
  
-       fs_get_obj(rule, node);
-       fs_get_obj(fte, rule->node.parent);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
+       dev = get_dev(&fte->node);
+       root = find_root(&ft->node);
+       err = root->cmds->update_fte(dev, ft, fg->id, fte->modify_mask, fte);
+       if (err)
+               mlx5_core_warn(dev,
+                              "%s can't del rule fg id=%d fte_index=%d\n",
+                              __func__, fg->id, fte->index);
+       fte->modify_mask = 0;
+ }
+
+ static void del_sw_hw_rule(struct fs_node *node)
+ {
+       struct mlx5_flow_rule *rule;
+       struct fs_fte *fte;
+       fs_get_obj(rule, node);
+       fs_get_obj(fte, rule->node.parent);
        trace_mlx5_fs_del_rule(rule);
        if (rule->sw_action == MLX5_FLOW_CONTEXT_ACTION_FWD_NEXT_PRIO) {
                mutex_lock(&rule->dest_attr.ft->lock);
  
        if (rule->dest_attr.type == MLX5_FLOW_DESTINATION_TYPE_COUNTER  &&
            --fte->dests_size) {
-               modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
-                             BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
+               fte->modify_mask |=
+                       BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_ACTION) |
+                       BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_FLOW_COUNTERS);
                fte->action.action &= ~MLX5_FLOW_CONTEXT_ACTION_COUNT;
-               update_fte = true;
                goto out;
        }
  
        if ((fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) &&
            --fte->dests_size) {
-               modify_mask = BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
-               update_fte = true;
+               fte->modify_mask |=
+                       BIT(MLX5_SET_FTE_MODIFY_ENABLE_MASK_DESTINATION_LIST);
        }
  out:
-       root = find_root(&ft->node);
-       if (update_fte && fte->dests_size) {
-               err = root->cmds->update_fte(dev, ft, fg->id, modify_mask, fte);
-               if (err)
-                       mlx5_core_warn(dev,
-                                      "%s can't del rule fg id=%d fte_index=%d\n",
-                                      __func__, fg->id, fte->index);
-       }
        kfree(rule);
  }
  
@@@ -490,6 -497,7 +497,7 @@@ static void del_hw_fte(struct fs_node *
                        mlx5_core_warn(dev,
                                       "flow steering can't delete fte in index %d of flow group id %d\n",
                                       fte->index, fg->id);
+               node->active = 0;
        }
  }
  
@@@ -590,7 -598,7 +598,7 @@@ static struct fs_fte *alloc_fte(struct 
        fte->node.type =  FS_TYPE_FLOW_ENTRY;
        fte->action = *flow_act;
  
-       tree_init_node(&fte->node, del_hw_fte, del_sw_fte);
+       tree_init_node(&fte->node, NULL, del_sw_fte);
  
        return fte;
  }
@@@ -619,7 -627,8 +627,8 @@@ static struct mlx5_flow_group *alloc_fl
        if (ret) {
                kmem_cache_free(steering->fgs_cache, fg);
                return ERR_PTR(ret);
- }
+       }
        ida_init(&fg->fte_allocator);
        fg->mask.match_criteria_enable = match_criteria_enable;
        memcpy(&fg->mask.match_criteria, match_criteria,
@@@ -810,7 -819,7 +819,7 @@@ static int update_root_ft_create(struc
        struct mlx5_flow_root_namespace *root = find_root(&prio->node);
        struct mlx5_ft_underlay_qp *uqp;
        int min_level = INT_MAX;
 -      int err;
 +      int err = 0;
        u32 qpn;
  
        if (root->root_ft)
@@@ -856,7 -865,7 +865,7 @@@ static int _mlx5_modify_rule_destinatio
        fs_get_obj(fte, rule->node.parent);
        if (!(fte->action.action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST))
                return -EINVAL;
-       down_write_ref_node(&fte->node);
+       down_write_ref_node(&fte->node, false);
        fs_get_obj(fg, fte->node.parent);
        fs_get_obj(ft, fg->node.parent);
  
        root = find_root(&ft->node);
        err = root->cmds->update_fte(get_dev(&ft->node), ft, fg->id,
                                     modify_mask, fte);
-       up_write_ref_node(&fte->node);
+       up_write_ref_node(&fte->node, false);
  
        return err;
  }
@@@ -1014,12 -1023,13 +1023,13 @@@ static struct mlx5_flow_table *__mlx5_c
        if (err)
                goto destroy_ft;
        ft->node.active = true;
-       down_write_ref_node(&fs_prio->node);
+       down_write_ref_node(&fs_prio->node, false);
        tree_add_node(&ft->node, &fs_prio->node);
        list_add_flow_table(ft, fs_prio);
        fs_prio->num_ft++;
-       up_write_ref_node(&fs_prio->node);
+       up_write_ref_node(&fs_prio->node, false);
        mutex_unlock(&root->chain_lock);
+       trace_mlx5_fs_add_ft(ft);
        return ft;
  destroy_ft:
        root->cmds->destroy_flow_table(root->dev, ft);
@@@ -1111,17 -1121,17 +1121,17 @@@ struct mlx5_flow_group *mlx5_create_flo
        if (ft->autogroup.active)
                return ERR_PTR(-EPERM);
  
-       down_write_ref_node(&ft->node);
+       down_write_ref_node(&ft->node, false);
        fg = alloc_insert_flow_group(ft, match_criteria_enable, match_criteria,
                                     start_index, end_index,
                                     ft->node.children.prev);
-       up_write_ref_node(&ft->node);
+       up_write_ref_node(&ft->node, false);
        if (IS_ERR(fg))
                return fg;
  
        err = root->cmds->create_flow_group(dev, ft, fg_in, &fg->id);
        if (err) {
-               tree_put_node(&fg->node);
+               tree_put_node(&fg->node, false);
                return ERR_PTR(err);
        }
        trace_mlx5_fs_add_fg(fg);
@@@ -1518,10 -1528,10 +1528,10 @@@ static void free_match_list(struct matc
                struct match_list *iter, *match_tmp;
  
                list_del(&head->first.list);
-               tree_put_node(&head->first.g->node);
+               tree_put_node(&head->first.g->node, false);
                list_for_each_entry_safe(iter, match_tmp, &head->list,
                                         list) {
-                       tree_put_node(&iter->g->node);
+                       tree_put_node(&iter->g->node, false);
                        list_del(&iter->list);
                        kfree(iter);
                }
@@@ -1598,11 -1608,16 +1608,16 @@@ lookup_fte_locked(struct mlx5_flow_grou
                fte_tmp = NULL;
                goto out;
        }
+       if (!fte_tmp->node.active) {
+               tree_put_node(&fte_tmp->node, false);
+               fte_tmp = NULL;
+               goto out;
+       }
  
        nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD);
  out:
        if (take_write)
-               up_write_ref_node(&g->node);
+               up_write_ref_node(&g->node, false);
        else
                up_read_ref_node(&g->node);
        return fte_tmp;
@@@ -1644,8 -1659,8 +1659,8 @@@ search_again_locked
                        continue;
                rule = add_rule_fg(g, spec->match_value,
                                   flow_act, dest, dest_num, fte_tmp);
-               up_write_ref_node(&fte_tmp->node);
-               tree_put_node(&fte_tmp->node);
+               up_write_ref_node(&fte_tmp->node, false);
+               tree_put_node(&fte_tmp->node, false);
                kmem_cache_free(steering->ftes_cache, fte);
                return rule;
        }
@@@ -1681,7 -1696,7 +1696,7 @@@ skip_search
  
                err = insert_fte(g, fte);
                if (err) {
-                       up_write_ref_node(&g->node);
+                       up_write_ref_node(&g->node, false);
                        if (err == -ENOSPC)
                                continue;
                        kmem_cache_free(steering->ftes_cache, fte);
                }
  
                nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
-               up_write_ref_node(&g->node);
+               up_write_ref_node(&g->node, false);
                rule = add_rule_fg(g, spec->match_value,
                                   flow_act, dest, dest_num, fte);
-               up_write_ref_node(&fte->node);
-               tree_put_node(&fte->node);
+               up_write_ref_node(&fte->node, false);
+               tree_put_node(&fte->node, false);
                return rule;
        }
        rule = ERR_PTR(-ENOENT);
@@@ -1735,7 -1750,7 +1750,7 @@@ search_again_locked
        err = build_match_list(&match_head, ft, spec);
        if (err) {
                if (take_write)
-                       up_write_ref_node(&ft->node);
+                       up_write_ref_node(&ft->node, false);
                else
                        up_read_ref_node(&ft->node);
                return ERR_PTR(err);
        if (!IS_ERR(rule) ||
            (PTR_ERR(rule) != -ENOENT && PTR_ERR(rule) != -EAGAIN)) {
                if (take_write)
-                       up_write_ref_node(&ft->node);
+                       up_write_ref_node(&ft->node, false);
                return rule;
        }
  
        g = alloc_auto_flow_group(ft, spec);
        if (IS_ERR(g)) {
                rule = ERR_CAST(g);
-               up_write_ref_node(&ft->node);
+               up_write_ref_node(&ft->node, false);
                return rule;
        }
  
        nested_down_write_ref_node(&g->node, FS_LOCK_PARENT);
-       up_write_ref_node(&ft->node);
+       up_write_ref_node(&ft->node, false);
  
        err = create_auto_flow_group(ft, g);
        if (err)
        }
  
        nested_down_write_ref_node(&fte->node, FS_LOCK_CHILD);
-       up_write_ref_node(&g->node);
+       up_write_ref_node(&g->node, false);
        rule = add_rule_fg(g, spec->match_value, flow_act, dest,
                           dest_num, fte);
-       up_write_ref_node(&fte->node);
-       tree_put_node(&fte->node);
-       tree_put_node(&g->node);
+       up_write_ref_node(&fte->node, false);
+       tree_put_node(&fte->node, false);
+       tree_put_node(&g->node, false);
        return rule;
  
  err_release_fg:
-       up_write_ref_node(&g->node);
-       tree_put_node(&g->node);
+       up_write_ref_node(&g->node, false);
+       tree_put_node(&g->node, false);
        return ERR_PTR(err);
  }
  
@@@ -1863,10 -1878,33 +1878,33 @@@ EXPORT_SYMBOL(mlx5_add_flow_rules)
  
  void mlx5_del_flow_rules(struct mlx5_flow_handle *handle)
  {
+       struct fs_fte *fte;
        int i;
  
+       /* In order to consolidate the HW changes we lock the FTE for other
+        * changes, and increase its refcount, in order not to perform the
+        * "del" functions of the FTE. Will handle them here.
+        * The removal of the rules is done under locked FTE.
+        * After removing all the handle's rules, if there are remaining
+        * rules, it means we just need to modify the FTE in FW, and
+        * unlock/decrease the refcount we increased before.
+        * Otherwise, it means the FTE should be deleted. First delete the
+        * FTE in FW. Then unlock the FTE and proceed with the tree_put_node of
+        * the FTE, which will handle the last decrease of the refcount, as
+        * well as required handling of its parent.
+        */
+       fs_get_obj(fte, handle->rule[0]->node.parent);
+       down_write_ref_node(&fte->node, false);
        for (i = handle->num_rules - 1; i >= 0; i--)
-               tree_remove_node(&handle->rule[i]->node);
+               tree_remove_node(&handle->rule[i]->node, true);
+       if (fte->modify_mask && fte->dests_size) {
+               modify_fte(fte);
+               up_write_ref_node(&fte->node, false);
+       } else {
+               del_hw_fte(&fte->node);
+               up_write(&fte->node.lock);
+               tree_put_node(&fte->node, false);
+       }
        kfree(handle);
  }
  EXPORT_SYMBOL(mlx5_del_flow_rules);
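From a caller's point of view nothing changes with the consolidation above; the usual pattern (sketched below in the style of the peer-miss hunks earlier, error handling trimmed) still adds rules against a table and later deletes the whole handle, which now ends up as either one consolidated update_fte or a single FTE delete in FW:

        struct mlx5_flow_handle *flow;

        flow = mlx5_add_flow_rules(ft, spec, &flow_act, &dest, 1);
        if (IS_ERR(flow))
                return PTR_ERR(flow);
        /* ... rule in use ... */
        mlx5_del_flow_rules(flow);      /* one FW update, or FTE delete */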
@@@ -1969,7 -2007,7 +2007,7 @@@ int mlx5_destroy_flow_table(struct mlx5
                mutex_unlock(&root->chain_lock);
                return err;
        }
-       if (tree_remove_node(&ft->node))
+       if (tree_remove_node(&ft->node, false))
                mlx5_core_warn(get_dev(&ft->node), "Flow table %d wasn't destroyed, refcount > 1\n",
                               ft->id);
        mutex_unlock(&root->chain_lock);
@@@ -1980,7 -2018,7 +2018,7 @@@ EXPORT_SYMBOL(mlx5_destroy_flow_table)
  
  void mlx5_destroy_flow_group(struct mlx5_flow_group *fg)
  {
-       if (tree_remove_node(&fg->node))
+       if (tree_remove_node(&fg->node, false))
                mlx5_core_warn(get_dev(&fg->node), "Flow group %d wasn't destroyed, refcount > 1\n",
                               fg->id);
  }
@@@ -2364,8 -2402,8 +2402,8 @@@ static void clean_tree(struct fs_node *
                tree_get_node(node);
                list_for_each_entry_safe(iter, temp, &node->children, list)
                        clean_tree(iter);
-               tree_put_node(node);
-               tree_remove_node(node);
+               tree_put_node(node, false);
+               tree_remove_node(node, false);
        }
  }
  
@@@ -2478,16 -2516,8 +2516,16 @@@ static int init_fdb_root_ns(struct mlx5
        if (!steering->fdb_sub_ns)
                return -ENOMEM;
  
 +      maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_BYPASS_PATH,
 +                                1);
 +      if (IS_ERR(maj_prio)) {
 +              err = PTR_ERR(maj_prio);
 +              goto out_err;
 +      }
 +
        levels = 2 * FDB_MAX_PRIO * (FDB_MAX_CHAIN + 1);
 -      maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns, 0,
 +      maj_prio = fs_create_prio_chained(&steering->fdb_root_ns->ns,
 +                                        FDB_FAST_PATH,
                                          levels);
        if (IS_ERR(maj_prio)) {
                err = PTR_ERR(maj_prio);
                steering->fdb_sub_ns[chain] = ns;
        }
  
 -      maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, 1, 1);
 +      maj_prio = fs_create_prio(&steering->fdb_root_ns->ns, FDB_SLOW_PATH, 1);
        if (IS_ERR(maj_prio)) {
                err = PTR_ERR(maj_prio);
                goto out_err;
@@@ -103,7 -103,7 +103,7 @@@ void mlx5_enter_error_state(struct mlx5
        mlx5_core_err(dev, "start\n");
        if (pci_channel_offline(dev->pdev) || in_fatal(dev) || force) {
                dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
-               mlx5_cmd_trigger_completions(dev);
+               mlx5_cmd_flush(dev);
        }
  
        mlx5_notifier_call_chain(dev->priv.events, MLX5_DEV_EVENT_SYS_ERROR, (void *)1);
@@@ -152,11 -152,11 +152,11 @@@ static void health_recover(struct work_
  
        nic_state = mlx5_get_nic_state(dev);
        if (nic_state == MLX5_NIC_IFC_INVALID) {
 -              dev_err(&dev->pdev->dev, "health recovery flow aborted since the nic state is invalid\n");
 +              mlx5_core_err(dev, "health recovery flow aborted since the nic state is invalid\n");
                return;
        }
  
 -      dev_err(&dev->pdev->dev, "starting health recovery flow\n");
 +      mlx5_core_err(dev, "starting health recovery flow\n");
        mlx5_recover_device(dev);
  }
  
@@@ -180,8 -180,8 +180,8 @@@ static void health_care(struct work_str
        if (!test_bit(MLX5_DROP_NEW_RECOVERY_WORK, &health->flags))
                schedule_delayed_work(&health->recover_work, recover_delay);
        else
 -              dev_err(&dev->pdev->dev,
 -                      "new health works are not permitted at this stage\n");
 +              mlx5_core_err(dev,
 +                            "new health works are not permitted at this stage\n");
        spin_unlock_irqrestore(&health->wq_lock, flags);
  }
  
@@@ -228,22 -228,18 +228,22 @@@ static void print_health_info(struct ml
                return;
  
        for (i = 0; i < ARRAY_SIZE(h->assert_var); i++)
 -              dev_err(&dev->pdev->dev, "assert_var[%d] 0x%08x\n", i, ioread32be(h->assert_var + i));
 +              mlx5_core_err(dev, "assert_var[%d] 0x%08x\n", i,
 +                            ioread32be(h->assert_var + i));
  
 -      dev_err(&dev->pdev->dev, "assert_exit_ptr 0x%08x\n", ioread32be(&h->assert_exit_ptr));
 -      dev_err(&dev->pdev->dev, "assert_callra 0x%08x\n", ioread32be(&h->assert_callra));
 +      mlx5_core_err(dev, "assert_exit_ptr 0x%08x\n",
 +                    ioread32be(&h->assert_exit_ptr));
 +      mlx5_core_err(dev, "assert_callra 0x%08x\n",
 +                    ioread32be(&h->assert_callra));
        sprintf(fw_str, "%d.%d.%d", fw_rev_maj(dev), fw_rev_min(dev), fw_rev_sub(dev));
 -      dev_err(&dev->pdev->dev, "fw_ver %s\n", fw_str);
 -      dev_err(&dev->pdev->dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
 -      dev_err(&dev->pdev->dev, "irisc_index %d\n", ioread8(&h->irisc_index));
 -      dev_err(&dev->pdev->dev, "synd 0x%x: %s\n", ioread8(&h->synd), hsynd_str(ioread8(&h->synd)));
 -      dev_err(&dev->pdev->dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
 +      mlx5_core_err(dev, "fw_ver %s\n", fw_str);
 +      mlx5_core_err(dev, "hw_id 0x%08x\n", ioread32be(&h->hw_id));
 +      mlx5_core_err(dev, "irisc_index %d\n", ioread8(&h->irisc_index));
 +      mlx5_core_err(dev, "synd 0x%x: %s\n", ioread8(&h->synd),
 +                    hsynd_str(ioread8(&h->synd)));
 +      mlx5_core_err(dev, "ext_synd 0x%04x\n", ioread16be(&h->ext_synd));
        fw = ioread32be(&h->fw_ver);
 -      dev_err(&dev->pdev->dev, "raw fw_ver 0x%08x\n", fw);
 +      mlx5_core_err(dev, "raw fw_ver 0x%08x\n", fw);
  }
  
  static unsigned long get_next_poll_jiffies(void)
@@@ -266,7 -262,8 +266,7 @@@ void mlx5_trigger_health_work(struct ml
        if (!test_bit(MLX5_DROP_NEW_HEALTH_WORK, &health->flags))
                queue_work(health->wq, &health->work);
        else
 -              dev_err(&dev->pdev->dev,
 -                      "new health works are not permitted at this stage\n");
 +              mlx5_core_err(dev, "new health works are not permitted at this stage\n");
        spin_unlock_irqrestore(&health->wq_lock, flags);
  }
  
@@@ -287,7 -284,7 +287,7 @@@ static void poll_health(struct timer_li
  
        health->prev = count;
        if (health->miss_counter == MAX_MISSES) {
 -              dev_err(&dev->pdev->dev, "device's health compromised - reached miss count\n");
 +              mlx5_core_err(dev, "device's health compromised - reached miss count\n");
                print_health_info(dev);
        }
  
@@@ -355,13 -352,6 +355,13 @@@ void mlx5_drain_health_recovery(struct 
        cancel_delayed_work_sync(&dev->priv.health.recover_work);
  }
  
 +void mlx5_health_flush(struct mlx5_core_dev *dev)
 +{
 +      struct mlx5_core_health *health = &dev->priv.health;
 +
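 +      /* wait for queued health work to finish; keep the workqueue alive */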
 +      flush_workqueue(health->wq);
 +}
 +
  void mlx5_health_cleanup(struct mlx5_core_dev *dev)
  {
        struct mlx5_core_health *health = &dev->priv.health;
@@@ -380,7 -370,7 +380,7 @@@ int mlx5_health_init(struct mlx5_core_d
                return -ENOMEM;
  
        strcpy(name, "mlx5_health");
 -      strcat(name, dev_name(&dev->pdev->dev));
 +      strcat(name, dev->priv.name);
        health->wq = create_singlethread_workqueue(name);
        kfree(name);
        if (!health->wq)
@@@ -465,6 -465,7 +465,7 @@@ static int handle_hca_cap_odp(struct ml
        void *set_hca_cap;
        void *set_ctx;
        int set_sz;
+       bool do_set = false;
        int err;
  
        if (!IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING) ||
        if (err)
                return err;
  
-       if (!(MLX5_CAP_ODP_MAX(dev, ud_odp_caps.srq_receive) ||
-             MLX5_CAP_ODP_MAX(dev, rc_odp_caps.srq_receive) ||
-             MLX5_CAP_ODP_MAX(dev, xrc_odp_caps.srq_receive)))
-               return 0;
        set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in);
        set_ctx = kzalloc(set_sz, GFP_KERNEL);
        if (!set_ctx)
        memcpy(set_hca_cap, dev->caps.hca_cur[MLX5_CAP_ODP],
               MLX5_ST_SZ_BYTES(odp_cap));
  
-       /* set ODP SRQ support for RC/UD and XRC transports */
-       MLX5_SET(odp_cap, set_hca_cap, ud_odp_caps.srq_receive,
-                MLX5_CAP_ODP_MAX(dev, ud_odp_caps.srq_receive));
-       MLX5_SET(odp_cap, set_hca_cap, rc_odp_caps.srq_receive,
-                MLX5_CAP_ODP_MAX(dev, rc_odp_caps.srq_receive));
-       MLX5_SET(odp_cap, set_hca_cap, xrc_odp_caps.srq_receive,
-                MLX5_CAP_ODP_MAX(dev, xrc_odp_caps.srq_receive));
-       err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ODP);
+ #define ODP_CAP_SET_MAX(dev, field)                                            \
+       do {                                                                   \
+               u32 _res = MLX5_CAP_ODP_MAX(dev, field);                       \
+               if (_res) {                                                    \
+                       do_set = true;                                         \
+                       MLX5_SET(odp_cap, set_hca_cap, field, _res);           \
+               }                                                              \
+       } while (0)
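+       /* raise each ODP capability to its firmware-reported maximum, where supported */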
+       ODP_CAP_SET_MAX(dev, ud_odp_caps.srq_receive);
+       ODP_CAP_SET_MAX(dev, rc_odp_caps.srq_receive);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.srq_receive);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.send);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.receive);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.write);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.read);
+       ODP_CAP_SET_MAX(dev, xrc_odp_caps.atomic);
+       if (do_set)
+               err = set_caps(dev, set_ctx, set_sz,
+                              MLX5_SET_HCA_CAP_OP_MOD_ODP);
  
        kfree(set_ctx);
        return err;
  }
  
@@@ -580,23 -587,24 +587,23 @@@ query_ex
  
  static int set_hca_cap(struct mlx5_core_dev *dev)
  {
 -      struct pci_dev *pdev = dev->pdev;
        int err;
  
        err = handle_hca_cap(dev);
        if (err) {
 -              dev_err(&pdev->dev, "handle_hca_cap failed\n");
 +              mlx5_core_err(dev, "handle_hca_cap failed\n");
                goto out;
        }
  
        err = handle_hca_cap_atomic(dev);
        if (err) {
 -              dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n");
 +              mlx5_core_err(dev, "handle_hca_cap_atomic failed\n");
                goto out;
        }
  
        err = handle_hca_cap_odp(dev);
        if (err) {
 -              dev_err(&pdev->dev, "handle_hca_cap_odp failed\n");
 +              mlx5_core_err(dev, "handle_hca_cap_odp failed\n");
                goto out;
        }
  
@@@ -728,29 -736,36 +735,29 @@@ static int mlx5_core_set_issi(struct ml
        return -EOPNOTSUPP;
  }
  
 -static int mlx5_pci_init(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 +static int mlx5_pci_init(struct mlx5_core_dev *dev, struct pci_dev *pdev,
 +                       const struct pci_device_id *id)
  {
 -      struct pci_dev *pdev = dev->pdev;
 +      struct mlx5_priv *priv = &dev->priv;
        int err = 0;
  
 -      pci_set_drvdata(dev->pdev, dev);
 -      strncpy(priv->name, dev_name(&pdev->dev), MLX5_MAX_NAME_LEN);
 -      priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
 -
 -      mutex_init(&priv->pgdir_mutex);
 -      INIT_LIST_HEAD(&priv->pgdir_list);
 -      spin_lock_init(&priv->mkey_lock);
 +      dev->pdev = pdev;
 +      priv->pci_dev_data = id->driver_data;
  
 -      mutex_init(&priv->alloc_mutex);
 +      pci_set_drvdata(dev->pdev, dev);
  
 +      dev->bar_addr = pci_resource_start(pdev, 0);
        priv->numa_node = dev_to_node(&dev->pdev->dev);
  
 -      if (mlx5_debugfs_root)
 -              priv->dbg_root =
 -                      debugfs_create_dir(pci_name(pdev), mlx5_debugfs_root);
 -
        err = mlx5_pci_enable_device(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Cannot enable PCI device, aborting\n");
 -              goto err_dbg;
 +              mlx5_core_err(dev, "Cannot enable PCI device, aborting\n");
 +              return err;
        }
  
        err = request_bar(pdev);
        if (err) {
 -              dev_err(&pdev->dev, "error requesting BARs, aborting\n");
 +              mlx5_core_err(dev, "error requesting BARs, aborting\n");
                goto err_disable;
        }
  
  
        err = set_dma_caps(pdev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed setting DMA capabilities mask, aborting\n");
 +              mlx5_core_err(dev, "Failed setting DMA capabilities mask, aborting\n");
                goto err_clr_master;
        }
  
            pci_enable_atomic_ops_to_root(pdev, PCI_EXP_DEVCAP2_ATOMIC_COMP128))
                mlx5_core_dbg(dev, "Enabling pci atomics failed\n");
  
 -      dev->iseg_base = pci_resource_start(dev->pdev, 0);
 +      dev->iseg_base = dev->bar_addr;
        dev->iseg = ioremap(dev->iseg_base, sizeof(*dev->iseg));
        if (!dev->iseg) {
                err = -ENOMEM;
 -              dev_err(&pdev->dev, "Failed mapping initialization segment, aborting\n");
 +              mlx5_core_err(dev, "Failed mapping initialization segment, aborting\n");
                goto err_clr_master;
        }
  
@@@ -782,47 -797,52 +789,47 @@@ err_clr_master
        release_bar(dev->pdev);
  err_disable:
        mlx5_pci_disable_device(dev);
 -
 -err_dbg:
 -      debugfs_remove(priv->dbg_root);
        return err;
  }
  
 -static void mlx5_pci_close(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 +static void mlx5_pci_close(struct mlx5_core_dev *dev)
  {
        iounmap(dev->iseg);
        pci_clear_master(dev->pdev);
        release_bar(dev->pdev);
        mlx5_pci_disable_device(dev);
 -      debugfs_remove_recursive(priv->dbg_root);
  }
  
 -static int mlx5_init_once(struct mlx5_core_dev *dev, struct mlx5_priv *priv)
 +static int mlx5_init_once(struct mlx5_core_dev *dev)
  {
 -      struct pci_dev *pdev = dev->pdev;
        int err;
  
 -      priv->devcom = mlx5_devcom_register_device(dev);
 -      if (IS_ERR(priv->devcom))
 -              dev_err(&pdev->dev, "failed to register with devcom (0x%p)\n",
 -                      priv->devcom);
 +      dev->priv.devcom = mlx5_devcom_register_device(dev);
 +      if (IS_ERR(dev->priv.devcom))
 +              mlx5_core_err(dev, "failed to register with devcom (0x%p)\n",
 +                            dev->priv.devcom);
  
        err = mlx5_query_board_id(dev);
        if (err) {
 -              dev_err(&pdev->dev, "query board id failed\n");
 +              mlx5_core_err(dev, "query board id failed\n");
                goto err_devcom;
        }
  
        err = mlx5_eq_table_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "failed to initialize eq\n");
 +              mlx5_core_err(dev, "failed to initialize eq\n");
                goto err_devcom;
        }
  
        err = mlx5_events_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "failed to initialize events\n");
 +              mlx5_core_err(dev, "failed to initialize events\n");
                goto err_eq_cleanup;
        }
  
        err = mlx5_cq_debugfs_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "failed to initialize cq debugfs\n");
 +              mlx5_core_err(dev, "failed to initialize cq debugfs\n");
                goto err_events_cleanup;
        }
  
  
        err = mlx5_init_rl_table(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init rate limiting\n");
 +              mlx5_core_err(dev, "Failed to init rate limiting\n");
                goto err_tables_cleanup;
        }
  
        err = mlx5_mpfs_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init l2 table %d\n", err);
 +              mlx5_core_err(dev, "Failed to init l2 table %d\n", err);
                goto err_rl_cleanup;
        }
  
        err = mlx5_eswitch_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init eswitch %d\n", err);
 +              mlx5_core_err(dev, "Failed to init eswitch %d\n", err);
                goto err_mpfs_cleanup;
        }
  
        err = mlx5_sriov_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init sriov %d\n", err);
 +              mlx5_core_err(dev, "Failed to init sriov %d\n", err);
                goto err_eswitch_cleanup;
        }
  
        err = mlx5_fpga_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init fpga device %d\n", err);
 +              mlx5_core_err(dev, "Failed to init fpga device %d\n", err);
                goto err_sriov_cleanup;
        }
  
@@@ -912,78 -932,93 +919,78 @@@ static void mlx5_cleanup_once(struct ml
        mlx5_devcom_unregister_device(dev->priv.devcom);
  }
  
 -static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 -                       bool boot)
 +static int mlx5_function_setup(struct mlx5_core_dev *dev, bool boot)
  {
 -      struct pci_dev *pdev = dev->pdev;
        int err;
  
 -      dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev);
 -      mutex_lock(&dev->intf_state_mutex);
 -      if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
 -              dev_warn(&dev->pdev->dev, "%s: interface is up, NOP\n",
 -                       __func__);
 -              goto out;
 -      }
 -
 -      dev_info(&pdev->dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
 -               fw_rev_min(dev), fw_rev_sub(dev));
 +      mlx5_core_info(dev, "firmware version: %d.%d.%d\n", fw_rev_maj(dev),
 +                     fw_rev_min(dev), fw_rev_sub(dev));
  
        /* Only PFs hold the relevant PCIe information for this query */
        if (mlx5_core_is_pf(dev))
                pcie_print_link_status(dev->pdev);
  
 -      /* on load removing any previous indication of internal error, device is
 -       * up
 -       */
 -      dev->state = MLX5_DEVICE_STATE_UP;
 -
        /* wait for firmware to accept initialization segments configurations
         */
        err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI);
        if (err) {
 -              dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n",
 -                      FW_PRE_INIT_TIMEOUT_MILI);
 -              goto out_err;
 +              mlx5_core_err(dev, "Firmware over %d MS in pre-initializing state, aborting\n",
 +                            FW_PRE_INIT_TIMEOUT_MILI);
 +              return err;
        }
  
        err = mlx5_cmd_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
 -              goto out_err;
 +              mlx5_core_err(dev, "Failed initializing command interface, aborting\n");
 +              return err;
        }
  
        err = wait_fw_init(dev, FW_INIT_TIMEOUT_MILI);
        if (err) {
 -              dev_err(&dev->pdev->dev, "Firmware over %d MS in initializing state, aborting\n",
 -                      FW_INIT_TIMEOUT_MILI);
 +              mlx5_core_err(dev, "Firmware over %d MS in initializing state, aborting\n",
 +                            FW_INIT_TIMEOUT_MILI);
                goto err_cmd_cleanup;
        }
  
        err = mlx5_core_enable_hca(dev, 0);
        if (err) {
 -              dev_err(&pdev->dev, "enable hca failed\n");
 +              mlx5_core_err(dev, "enable hca failed\n");
                goto err_cmd_cleanup;
        }
  
        err = mlx5_core_set_issi(dev);
        if (err) {
 -              dev_err(&pdev->dev, "failed to set issi\n");
 +              mlx5_core_err(dev, "failed to set issi\n");
                goto err_disable_hca;
        }
  
        err = mlx5_satisfy_startup_pages(dev, 1);
        if (err) {
 -              dev_err(&pdev->dev, "failed to allocate boot pages\n");
 +              mlx5_core_err(dev, "failed to allocate boot pages\n");
                goto err_disable_hca;
        }
  
        err = set_hca_ctrl(dev);
        if (err) {
 -              dev_err(&pdev->dev, "set_hca_ctrl failed\n");
 +              mlx5_core_err(dev, "set_hca_ctrl failed\n");
                goto reclaim_boot_pages;
        }
  
        err = set_hca_cap(dev);
        if (err) {
 -              dev_err(&pdev->dev, "set_hca_cap failed\n");
 +              mlx5_core_err(dev, "set_hca_cap failed\n");
                goto reclaim_boot_pages;
        }
  
        err = mlx5_satisfy_startup_pages(dev, 0);
        if (err) {
 -              dev_err(&pdev->dev, "failed to allocate init pages\n");
 +              mlx5_core_err(dev, "failed to allocate init pages\n");
                goto reclaim_boot_pages;
        }
  
        err = mlx5_cmd_init_hca(dev, sw_owner_id);
        if (err) {
 -              dev_err(&pdev->dev, "init hca failed\n");
 +              mlx5_core_err(dev, "init hca failed\n");
                goto reclaim_boot_pages;
        }
  
  
        err = mlx5_query_hca_caps(dev);
        if (err) {
 -              dev_err(&pdev->dev, "query hca failed\n");
 -              goto err_stop_poll;
 +              mlx5_core_err(dev, "query hca failed\n");
 +              goto stop_health;
        }
  
 -      if (boot) {
 -              err = mlx5_init_once(dev, priv);
 -              if (err) {
 -                      dev_err(&pdev->dev, "sw objs init failed\n");
 -                      goto err_stop_poll;
 -              }
 +      return 0;
 +
 +stop_health:
 +      mlx5_stop_health_poll(dev, boot);
 +reclaim_boot_pages:
 +      mlx5_reclaim_startup_pages(dev);
 +err_disable_hca:
 +      mlx5_core_disable_hca(dev, 0);
 +err_cmd_cleanup:
 +      mlx5_cmd_cleanup(dev);
 +
 +      return err;
 +}
 +
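 +/* undo mlx5_function_setup(): stop health polling and tear down the HCA */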
 +static int mlx5_function_teardown(struct mlx5_core_dev *dev, bool boot)
 +{
 +      int err;
 +
 +      mlx5_stop_health_poll(dev, boot);
 +      err = mlx5_cmd_teardown_hca(dev);
 +      if (err) {
 +              mlx5_core_err(dev, "tear_down_hca failed, skip cleanup\n");
 +              return err;
        }
 +      mlx5_reclaim_startup_pages(dev);
 +      mlx5_core_disable_hca(dev, 0);
 +      mlx5_cmd_cleanup(dev);
 +
 +      return 0;
 +}
 +
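 +/* bring up everything that runs on top of an initialized function */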
 +static int mlx5_load(struct mlx5_core_dev *dev)
 +{
 +      int err;
  
        dev->priv.uar = mlx5_get_uars_page(dev);
        if (IS_ERR(dev->priv.uar)) {
 -              dev_err(&pdev->dev, "Failed allocating uar, aborting\n");
 +              mlx5_core_err(dev, "Failed allocating uar, aborting\n");
                err = PTR_ERR(dev->priv.uar);
 -              goto err_get_uars;
 +              return err;
        }
  
        mlx5_events_start(dev);
  
        err = mlx5_eq_table_create(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to create EQs\n");
 +              mlx5_core_err(dev, "Failed to create EQs\n");
                goto err_eq_table;
        }
  
        err = mlx5_fw_tracer_init(dev->tracer);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init FW tracer\n");
 +              mlx5_core_err(dev, "Failed to init FW tracer\n");
                goto err_fw_tracer;
        }
  
        err = mlx5_fpga_device_start(dev);
        if (err) {
 -              dev_err(&pdev->dev, "fpga device start failed %d\n", err);
 +              mlx5_core_err(dev, "fpga device start failed %d\n", err);
                goto err_fpga_start;
        }
  
        err = mlx5_accel_ipsec_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "IPSec device start failed %d\n", err);
 +              mlx5_core_err(dev, "IPSec device start failed %d\n", err);
                goto err_ipsec_start;
        }
  
        err = mlx5_accel_tls_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "TLS device start failed %d\n", err);
 +              mlx5_core_err(dev, "TLS device start failed %d\n", err);
                goto err_tls_start;
        }
  
        err = mlx5_init_fs(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init flow steering\n");
 +              mlx5_core_err(dev, "Failed to init flow steering\n");
                goto err_fs;
        }
  
        err = mlx5_core_set_hca_defaults(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to set hca defaults\n");
 +              mlx5_core_err(dev, "Failed to set hca defaults\n");
                goto err_fs;
        }
  
        err = mlx5_sriov_attach(dev);
        if (err) {
 -              dev_err(&pdev->dev, "sriov init failed %d\n", err);
 +              mlx5_core_err(dev, "sriov init failed %d\n", err);
                goto err_sriov;
        }
  
        err = mlx5_ec_init(dev);
        if (err) {
 -              dev_err(&pdev->dev, "Failed to init embedded CPU\n");
 +              mlx5_core_err(dev, "Failed to init embedded CPU\n");
                goto err_ec;
        }
  
 -      if (mlx5_device_registered(dev)) {
 -              mlx5_attach_device(dev);
 -      } else {
 -              err = mlx5_register_device(dev);
 -              if (err) {
 -                      dev_err(&pdev->dev, "mlx5_register_device failed %d\n", err);
 -                      goto err_reg_dev;
 -              }
 -      }
 -
 -      set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 -out:
 -      mutex_unlock(&dev->intf_state_mutex);
 -
        return 0;
  
 -err_reg_dev:
 -      mlx5_ec_cleanup(dev);
 -
  err_ec:
        mlx5_sriov_detach(dev);
 -
  err_sriov:
        mlx5_cleanup_fs(dev);
 -
  err_fs:
        mlx5_accel_tls_cleanup(dev);
 -
  err_tls_start:
        mlx5_accel_ipsec_cleanup(dev);
 -
  err_ipsec_start:
        mlx5_fpga_device_stop(dev);
 -
  err_fpga_start:
        mlx5_fw_tracer_cleanup(dev->tracer);
 -
  err_fw_tracer:
        mlx5_eq_table_destroy(dev);
 -
  err_eq_table:
        mlx5_pagealloc_stop(dev);
        mlx5_events_stop(dev);
 -      mlx5_put_uars_page(dev, priv->uar);
 +      mlx5_put_uars_page(dev, dev->priv.uar);
 +      return err;
 +}
  
 -err_get_uars:
 -      if (boot)
 -              mlx5_cleanup_once(dev);
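 +/* release everything acquired by mlx5_load() */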
 +static void mlx5_unload(struct mlx5_core_dev *dev)
 +{
 +      mlx5_ec_cleanup(dev);
 +      mlx5_sriov_detach(dev);
 +      mlx5_cleanup_fs(dev);
 +      mlx5_accel_ipsec_cleanup(dev);
 +      mlx5_accel_tls_cleanup(dev);
 +      mlx5_fpga_device_stop(dev);
 +      mlx5_fw_tracer_cleanup(dev->tracer);
 +      mlx5_eq_table_destroy(dev);
 +      mlx5_pagealloc_stop(dev);
 +      mlx5_events_stop(dev);
 +      mlx5_put_uars_page(dev, dev->priv.uar);
 +}
  
 -err_stop_poll:
 -      mlx5_stop_health_poll(dev, boot);
 -      if (mlx5_cmd_teardown_hca(dev)) {
 -              dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
 -              goto out_err;
 +static int mlx5_load_one(struct mlx5_core_dev *dev, bool boot)
 +{
 +      int err = 0;
 +
 +      dev->caps.embedded_cpu = mlx5_read_embedded_cpu(dev);
 +      mutex_lock(&dev->intf_state_mutex);
 +      if (test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
 +              mlx5_core_warn(dev, "interface is up, NOP\n");
 +              goto out;
        }
 +      /* remove any previous indication of internal error */
 +      dev->state = MLX5_DEVICE_STATE_UP;
  
 -reclaim_boot_pages:
 -      mlx5_reclaim_startup_pages(dev);
 +      err = mlx5_function_setup(dev, boot);
 +      if (err)
 +              goto out;
  
 -err_disable_hca:
 -      mlx5_core_disable_hca(dev, 0);
 +      if (boot) {
 +              err = mlx5_init_once(dev);
 +              if (err) {
 +                      mlx5_core_err(dev, "sw objs init failed\n");
 +                      goto function_teardown;
 +              }
 +      }
  
 -err_cmd_cleanup:
 -      mlx5_cmd_cleanup(dev);
 +      err = mlx5_load(dev);
 +      if (err)
 +              goto err_load;
  
 -out_err:
 +      if (mlx5_device_registered(dev)) {
 +              mlx5_attach_device(dev);
 +      } else {
 +              err = mlx5_register_device(dev);
 +              if (err) {
 +                      mlx5_core_err(dev, "register device failed %d\n", err);
 +                      goto err_reg_dev;
 +              }
 +      }
 +
 +      set_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state);
 +out:
 +      mutex_unlock(&dev->intf_state_mutex);
 +
 +      return err;
 +
 +err_reg_dev:
 +      mlx5_unload(dev);
 +err_load:
 +      if (boot)
 +              mlx5_cleanup_once(dev);
 +function_teardown:
 +      mlx5_function_teardown(dev, boot);
        dev->state = MLX5_DEVICE_STATE_INTERNAL_ERROR;
        mutex_unlock(&dev->intf_state_mutex);
  
        return err;
  }
  
 -static int mlx5_unload_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv,
 -                         bool cleanup)
 +static int mlx5_unload_one(struct mlx5_core_dev *dev, bool cleanup)
  {
        int err = 0;
  
  
        mutex_lock(&dev->intf_state_mutex);
        if (!test_bit(MLX5_INTERFACE_STATE_UP, &dev->intf_state)) {
 -              dev_warn(&dev->pdev->dev, "%s: interface is down, NOP\n",
 -                       __func__);
 +              mlx5_core_warn(dev, "%s: interface is down, NOP\n",
 +                             __func__);
                if (cleanup)
                        mlx5_cleanup_once(dev);
                goto out;
        if (mlx5_device_registered(dev))
                mlx5_detach_device(dev);
  
 -      mlx5_ec_cleanup(dev);
 -      mlx5_sriov_detach(dev);
 -      mlx5_cleanup_fs(dev);
 -      mlx5_accel_ipsec_cleanup(dev);
 -      mlx5_accel_tls_cleanup(dev);
 -      mlx5_fpga_device_stop(dev);
 -      mlx5_fw_tracer_cleanup(dev->tracer);
 -      mlx5_eq_table_destroy(dev);
 -      mlx5_pagealloc_stop(dev);
 -      mlx5_events_stop(dev);
 -      mlx5_put_uars_page(dev, priv->uar);
 +      mlx5_unload(dev);
 +
        if (cleanup)
                mlx5_cleanup_once(dev);
 -      mlx5_stop_health_poll(dev, cleanup);
 -
 -      err = mlx5_cmd_teardown_hca(dev);
 -      if (err) {
 -              dev_err(&dev->pdev->dev, "tear_down_hca failed, skip cleanup\n");
 -              goto out;
 -      }
 -      mlx5_reclaim_startup_pages(dev);
 -      mlx5_core_disable_hca(dev, 0);
 -      mlx5_cmd_cleanup(dev);
  
 +      mlx5_function_teardown(dev, cleanup);
  out:
        mutex_unlock(&dev->intf_state_mutex);
        return err;
@@@ -1235,15 -1238,29 +1242,15 @@@ static const struct devlink_ops mlx5_de
  #endif
  };
  
 -#define MLX5_IB_MOD "mlx5_ib"
 -static int init_one(struct pci_dev *pdev,
 -                  const struct pci_device_id *id)
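 +/* software-only init: lists, locks, debugfs, health and page allocator */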
 +static int mlx5_mdev_init(struct mlx5_core_dev *dev, int profile_idx, const char *name)
  {
 -      struct mlx5_core_dev *dev;
 -      struct devlink *devlink;
 -      struct mlx5_priv *priv;
 +      struct mlx5_priv *priv = &dev->priv;
        int err;
  
 -      devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
 -      if (!devlink) {
 -              dev_err(&pdev->dev, "kzalloc failed\n");
 -              return -ENOMEM;
 -      }
 -
 -      dev = devlink_priv(devlink);
 -      priv = &dev->priv;
 -      priv->pci_dev_data = id->driver_data;
 -
 -      pci_set_drvdata(pdev, dev);
 +      strncpy(priv->name, name, MLX5_MAX_NAME_LEN);
 +      priv->name[MLX5_MAX_NAME_LEN - 1] = 0;
  
 -      dev->pdev = pdev;
 -      dev->profile = &profile[prof_sel];
 +      dev->profile = &profile[profile_idx];
  
        INIT_LIST_HEAD(&priv->ctx_list);
        spin_lock_init(&priv->ctx_lock);
        INIT_LIST_HEAD(&priv->bfregs.reg_head.list);
        INIT_LIST_HEAD(&priv->bfregs.wc_head.list);
  
 -      err = mlx5_pci_init(dev, priv);
 -      if (err) {
 -              dev_err(&pdev->dev, "mlx5_pci_init failed with error code %d\n", err);
 -              goto clean_dev;
 +      mutex_init(&priv->alloc_mutex);
 +      mutex_init(&priv->pgdir_mutex);
 +      INIT_LIST_HEAD(&priv->pgdir_list);
 +      spin_lock_init(&priv->mkey_lock);
 +
 +      priv->dbg_root = debugfs_create_dir(name, mlx5_debugfs_root);
 +      if (!priv->dbg_root) {
 +              pr_err("mlx5_core: %s error, Cannot create debugfs dir, aborting\n", name);
 +              return -ENOMEM;
        }
  
        err = mlx5_health_init(dev);
 -      if (err) {
 -              dev_err(&pdev->dev, "mlx5_health_init failed with error code %d\n", err);
 -              goto close_pci;
 -      }
 +      if (err)
 +              goto err_health_init;
  
        err = mlx5_pagealloc_init(dev);
        if (err)
                goto err_pagealloc_init;
  
 -      err = mlx5_load_one(dev, priv, true);
 +      return 0;
 +
 +err_pagealloc_init:
 +      mlx5_health_cleanup(dev);
 +err_health_init:
 +      debugfs_remove(dev->priv.dbg_root);
 +
 +      return err;
 +}
 +
 +static void mlx5_mdev_uninit(struct mlx5_core_dev *dev)
 +{
 +      mlx5_pagealloc_cleanup(dev);
 +      mlx5_health_cleanup(dev);
 +      debugfs_remove_recursive(dev->priv.dbg_root);
 +}
 +
 +#define MLX5_IB_MOD "mlx5_ib"
 +static int init_one(struct pci_dev *pdev, const struct pci_device_id *id)
 +{
 +      struct mlx5_core_dev *dev;
 +      struct devlink *devlink;
 +      int err;
 +
 +      devlink = devlink_alloc(&mlx5_devlink_ops, sizeof(*dev));
 +      if (!devlink) {
 +              dev_err(&pdev->dev, "kzalloc failed\n");
 +              return -ENOMEM;
 +      }
 +
 +      dev = devlink_priv(devlink);
 +
 +      err = mlx5_mdev_init(dev, prof_sel, dev_name(&pdev->dev));
 +      if (err)
 +              goto mdev_init_err;
 +
 +      err = mlx5_pci_init(dev, pdev, id);
 +      if (err) {
 +              mlx5_core_err(dev, "mlx5_pci_init failed with error code %d\n",
 +                            err);
 +              goto pci_init_err;
 +      }
 +
 +      err = mlx5_load_one(dev, true);
        if (err) {
 -              dev_err(&pdev->dev, "mlx5_load_one failed with error code %d\n", err);
 +              mlx5_core_err(dev, "mlx5_load_one failed with error code %d\n",
 +                            err);
                goto err_load_one;
        }
  
        return 0;
  
  clean_load:
 -      mlx5_unload_one(dev, priv, true);
 +      mlx5_unload_one(dev, true);
 +
  err_load_one:
 -      mlx5_pagealloc_cleanup(dev);
 -err_pagealloc_init:
 -      mlx5_health_cleanup(dev);
 -close_pci:
 -      mlx5_pci_close(dev, priv);
 -clean_dev:
 +      mlx5_pci_close(dev);
 +pci_init_err:
 +      mlx5_mdev_uninit(dev);
 +mdev_init_err:
        devlink_free(devlink);
  
        return err;
@@@ -1350,18 -1321,20 +1357,18 @@@ static void remove_one(struct pci_dev *
  {
        struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
        struct devlink *devlink = priv_to_devlink(dev);
 -      struct mlx5_priv *priv = &dev->priv;
  
        devlink_unregister(devlink);
        mlx5_unregister_device(dev);
  
 -      if (mlx5_unload_one(dev, priv, true)) {
 -              dev_err(&dev->pdev->dev, "mlx5_unload_one failed\n");
 -              mlx5_health_cleanup(dev);
 +      if (mlx5_unload_one(dev, true)) {
 +              mlx5_core_err(dev, "mlx5_unload_one failed\n");
 +              mlx5_health_flush(dev);
                return;
        }
  
 -      mlx5_pagealloc_cleanup(dev);
 -      mlx5_health_cleanup(dev);
 -      mlx5_pci_close(dev, priv);
 +      mlx5_pci_close(dev);
 +      mlx5_mdev_uninit(dev);
        devlink_free(devlink);
  }
  
@@@ -1369,11 -1342,12 +1376,11 @@@ static pci_ers_result_t mlx5_pci_err_de
                                              pci_channel_state_t state)
  {
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
 -      struct mlx5_priv *priv = &dev->priv;
  
 -      dev_info(&pdev->dev, "%s was called\n", __func__);
 +      mlx5_core_info(dev, "%s was called\n", __func__);
  
        mlx5_enter_error_state(dev, false);
 -      mlx5_unload_one(dev, priv, false);
 +      mlx5_unload_one(dev, false);
        /* In case of kernel call drain the health wq */
        if (state) {
                mlx5_drain_health_wq(dev);
@@@ -1400,9 -1374,7 +1407,9 @@@ static int wait_vital(struct pci_dev *p
                count = ioread32be(health->health_counter);
                if (count && count != 0xffffffff) {
                        if (last_count && last_count != count) {
 -                              dev_info(&pdev->dev, "Counter value 0x%x after %d iterations\n", count, i);
 +                              mlx5_core_info(dev,
 +                                             "wait vital counter value 0x%x after %d iterations\n",
 +                                             count, i);
                                return 0;
                        }
                        last_count = count;
@@@ -1418,12 -1390,12 +1425,12 @@@ static pci_ers_result_t mlx5_pci_slot_r
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
        int err;
  
 -      dev_info(&pdev->dev, "%s was called\n", __func__);
 +      mlx5_core_info(dev, "%s was called\n", __func__);
  
        err = mlx5_pci_enable_device(dev);
        if (err) {
 -              dev_err(&pdev->dev, "%s: mlx5_pci_enable_device failed with error code: %d\n"
 -                      , __func__, err);
 +              mlx5_core_err(dev, "%s: mlx5_pci_enable_device failed with error code: %d\n",
 +                            __func__, err);
                return PCI_ERS_RESULT_DISCONNECT;
        }
  
        pci_save_state(pdev);
  
        if (wait_vital(pdev)) {
 -              dev_err(&pdev->dev, "%s: wait_vital timed out\n", __func__);
 +              mlx5_core_err(dev, "%s: wait_vital timed out\n", __func__);
                return PCI_ERS_RESULT_DISCONNECT;
        }
  
  static void mlx5_pci_resume(struct pci_dev *pdev)
  {
        struct mlx5_core_dev *dev = pci_get_drvdata(pdev);
 -      struct mlx5_priv *priv = &dev->priv;
        int err;
  
 -      dev_info(&pdev->dev, "%s was called\n", __func__);
 +      mlx5_core_info(dev, "%s was called\n", __func__);
  
 -      err = mlx5_load_one(dev, priv, false);
 +      err = mlx5_load_one(dev, false);
        if (err)
 -              dev_err(&pdev->dev, "%s: mlx5_load_one failed with error code: %d\n"
 -                      , __func__, err);
 +              mlx5_core_err(dev, "%s: mlx5_load_one failed with error code: %d\n",
 +                            __func__, err);
        else
 -              dev_info(&pdev->dev, "%s: device recovered\n", __func__);
 +              mlx5_core_info(dev, "%s: device recovered\n", __func__);
  }
  
  static const struct pci_error_handlers mlx5_err_handler = {
@@@ -1513,12 -1486,13 +1520,12 @@@ succeed
  static void shutdown(struct pci_dev *pdev)
  {
        struct mlx5_core_dev *dev  = pci_get_drvdata(pdev);
 -      struct mlx5_priv *priv = &dev->priv;
        int err;
  
 -      dev_info(&pdev->dev, "Shutdown was called\n");
 +      mlx5_core_info(dev, "Shutdown was called\n");
        err = mlx5_try_fast_unload(dev);
        if (err)
 -              mlx5_unload_one(dev, priv, false);
 +              mlx5_unload_one(dev, false);
        mlx5_pci_disable_device(dev);
  }
  
@@@ -1535,6 -1509,8 +1542,8 @@@ static const struct pci_device_id mlx5_
        { PCI_VDEVICE(MELLANOX, 0x101a), MLX5_PCI_DEV_IS_VF},   /* ConnectX-5 Ex VF */
        { PCI_VDEVICE(MELLANOX, 0x101b) },                      /* ConnectX-6 */
        { PCI_VDEVICE(MELLANOX, 0x101c), MLX5_PCI_DEV_IS_VF},   /* ConnectX-6 VF */
+       { PCI_VDEVICE(MELLANOX, 0x101d) },                      /* ConnectX-6 Dx */
+       { PCI_VDEVICE(MELLANOX, 0x101e), MLX5_PCI_DEV_IS_VF},   /* ConnectX Family mlx5Gen Virtual Function */
        { PCI_VDEVICE(MELLANOX, 0xa2d2) },                      /* BlueField integrated ConnectX-5 network controller */
        { PCI_VDEVICE(MELLANOX, 0xa2d3), MLX5_PCI_DEV_IS_VF},   /* BlueField integrated ConnectX-5 network controller VF */
        { 0, }
  extern uint mlx5_core_debug_mask;
  
  #define mlx5_core_dbg(__dev, format, ...)                             \
 -      dev_dbg(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,         \
 +      pr_debug("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,      \
                 __func__, __LINE__, current->pid,                      \
                 ##__VA_ARGS__)
  
  #define mlx5_core_dbg_once(__dev, format, ...)                                \
 -      dev_dbg_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,    \
 +      pr_debug_once("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
                     __func__, __LINE__, current->pid,                  \
                     ##__VA_ARGS__)
  
@@@ -64,37 -64,28 +64,37 @@@ do {                                                                       
  } while (0)
  
  #define mlx5_core_err(__dev, format, ...)                             \
 -      dev_err(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format, \
 +      pr_err("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,        \
                __func__, __LINE__, current->pid,       \
               ##__VA_ARGS__)
  
 -#define mlx5_core_err_rl(__dev, format, ...)                          \
 -      dev_err_ratelimited(&(__dev)->pdev->dev,                        \
 -                         "%s:%d:(pid %d): " format,                   \
 -                         __func__, __LINE__, current->pid,            \
 +#define mlx5_core_err_rl(__dev, format, ...)                               \
 +      pr_err_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
 +                         __func__, __LINE__, current->pid,                 \
                           ##__VA_ARGS__)
  
  #define mlx5_core_warn(__dev, format, ...)                            \
 -      dev_warn(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,        \
 +      pr_warn("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,       \
                 __func__, __LINE__, current->pid,                      \
                ##__VA_ARGS__)
  
  #define mlx5_core_warn_once(__dev, format, ...)                               \
 -      dev_warn_once(&(__dev)->pdev->dev, "%s:%d:(pid %d): " format,   \
 +      pr_warn_once("%s:%s:%d:(pid %d): " format, (__dev)->priv.name,  \
                      __func__, __LINE__, current->pid,                 \
                      ##__VA_ARGS__)
  
 +#define mlx5_core_warn_rl(__dev, format, ...)                               \
 +      pr_warn_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
 +                         __func__, __LINE__, current->pid,                  \
 +                         ##__VA_ARGS__)
 +
  #define mlx5_core_info(__dev, format, ...)                            \
 -      dev_info(&(__dev)->pdev->dev, format, ##__VA_ARGS__)
 +      pr_info("%s " format, (__dev)->priv.name, ##__VA_ARGS__)
 +
 +#define mlx5_core_info_rl(__dev, format, ...)                               \
 +      pr_info_ratelimited("%s:%s:%d:(pid %d): " format, (__dev)->priv.name, \
 +                         __func__, __LINE__, current->pid,                  \
 +                         ##__VA_ARGS__)
  
  enum {
        MLX5_CMD_DATA, /* print command payload only */
@@@ -135,6 -126,7 +135,7 @@@ u64 mlx5_read_internal_timer(struct mlx
                             struct ptp_system_timestamp *sts);
  
  void mlx5_cmd_trigger_completions(struct mlx5_core_dev *dev);
+ void mlx5_cmd_flush(struct mlx5_core_dev *dev);
  int mlx5_cq_debugfs_init(struct mlx5_core_dev *dev);
  void mlx5_cq_debugfs_cleanup(struct mlx5_core_dev *dev);
  
@@@ -79,7 -79,7 +79,7 @@@ static u64 uar2pfn(struct mlx5_core_de
        else
                system_page_index = index;
  
 -      return (pci_resource_start(mdev->pdev, 0) >> PAGE_SHIFT) + system_page_index;
 +      return (mdev->bar_addr >> PAGE_SHIFT) + system_page_index;
  }
  
  static void up_rel_func(struct kref *kref)
@@@ -90,8 -90,8 +90,8 @@@
        iounmap(up->map);
        if (mlx5_cmd_free_uar(up->mdev, up->index))
                mlx5_core_warn(up->mdev, "failed to free uar index %d\n", up->index);
-       kfree(up->reg_bitmap);
-       kfree(up->fp_bitmap);
+       bitmap_free(up->reg_bitmap);
+       bitmap_free(up->fp_bitmap);
        kfree(up);
  }
  
@@@ -110,11 -110,11 +110,11 @@@ static struct mlx5_uars_page *alloc_uar
                return ERR_PTR(err);
  
        up->mdev = mdev;
-       up->reg_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+       up->reg_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL);
        if (!up->reg_bitmap)
                goto error1;
  
-       up->fp_bitmap = kcalloc(BITS_TO_LONGS(bfregs), sizeof(unsigned long), GFP_KERNEL);
+       up->fp_bitmap = bitmap_zalloc(bfregs, GFP_KERNEL);
        if (!up->fp_bitmap)
                goto error1;
  
@@@ -157,8 -157,8 +157,8 @@@ error2
        if (mlx5_cmd_free_uar(mdev, up->index))
                mlx5_core_warn(mdev, "failed to free uar index %d\n", up->index);
  error1:
-       kfree(up->fp_bitmap);
-       kfree(up->reg_bitmap);
+       bitmap_free(up->fp_bitmap);
+       bitmap_free(up->reg_bitmap);
        kfree(up);
        return ERR_PTR(err);
  }
@@@ -133,7 -133,6 +133,7 @@@ enum 
        MLX5_REG_MTRC_CONF       = 0x9041,
        MLX5_REG_MTRC_STDB       = 0x9042,
        MLX5_REG_MTRC_CTRL       = 0x9043,
 +      MLX5_REG_MPEIN           = 0x9050,
        MLX5_REG_MPCNT           = 0x9051,
        MLX5_REG_MTPPS           = 0x9053,
        MLX5_REG_MTPPSE          = 0x9054,
@@@ -196,6 -195,7 +196,7 @@@ struct mlx5_rsc_debug 
  
  enum mlx5_dev_event {
        MLX5_DEV_EVENT_SYS_ERROR = 128, /* 0 - 127 are FW events */
+       MLX5_DEV_EVENT_PORT_AFFINITY = 129,
  };
  
  enum mlx5_port_status {
@@@ -365,6 -365,7 +366,7 @@@ struct mlx5_core_sig_ctx 
  enum {
        MLX5_MKEY_MR = 1,
        MLX5_MKEY_MW,
+       MLX5_MKEY_INDIRECT_DEVX,
  };
  
  struct mlx5_core_mkey {
@@@ -659,7 -660,6 +661,7 @@@ struct mlx5_core_dev 
        u64                     sys_image_guid;
        phys_addr_t             iseg_base;
        struct mlx5_init_seg __iomem *iseg;
 +      phys_addr_t             bar_addr;
        enum mlx5_device_state  state;
        /* sync interface state */
        struct mutex            intf_state_mutex;
@@@ -885,7 -885,6 +887,7 @@@ void mlx5_cmd_mbox_status(void *out, u
  int mlx5_core_get_caps(struct mlx5_core_dev *dev, enum mlx5_cap_type cap_type);
  int mlx5_cmd_alloc_uar(struct mlx5_core_dev *dev, u32 *uarn);
  int mlx5_cmd_free_uar(struct mlx5_core_dev *dev, u32 uarn);
 +void mlx5_health_flush(struct mlx5_core_dev *dev);
  void mlx5_health_cleanup(struct mlx5_core_dev *dev);
  int mlx5_health_init(struct mlx5_core_dev *dev);
  void mlx5_start_health_poll(struct mlx5_core_dev *dev);
@@@ -962,10 -961,6 +964,6 @@@ int mlx5_query_odp_caps(struct mlx5_cor
                        struct mlx5_odp_caps *odp_caps);
  int mlx5_core_query_ib_ppcnt(struct mlx5_core_dev *dev,
                             u8 port_num, void *out, size_t sz);
- #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
- int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 token,
-                               u32 wq_num, u8 type, int error);
- #endif
  
  int mlx5_init_rl_table(struct mlx5_core_dev *dev);
  void mlx5_cleanup_rl_table(struct mlx5_core_dev *dev);
@@@ -1044,6 -1039,7 +1042,7 @@@ int mlx5_cmd_create_vport_lag(struct ml
  int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_roce(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev);
+ bool mlx5_lag_is_multipath(struct mlx5_core_dev *dev);
  bool mlx5_lag_is_active(struct mlx5_core_dev *dev);
  struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev);
  int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
@@@ -1091,6 -1087,11 +1090,11 @@@ static inline bool mlx5_core_is_ecpf_es
        return dev->caps.embedded_cpu && MLX5_CAP_GEN(dev, eswitch_manager);
  }
  
+ static inline bool mlx5_ecpf_vport_exists(struct mlx5_core_dev *dev)
+ {
+       return mlx5_core_is_pf(dev) && MLX5_CAP_ESW(dev, ecpf_vport_exists);
+ }
  #define MLX5_HOST_PF_MAX_VFS  (127u)
  static inline u16 mlx5_core_max_vfs(struct mlx5_core_dev *dev)
  {
@@@ -631,7 -631,8 +631,8 @@@ struct mlx5_ifc_e_switch_cap_bits 
        u8         vport_svlan_insert[0x1];
        u8         vport_cvlan_insert_if_not_exist[0x1];
        u8         vport_cvlan_insert_overwrite[0x1];
-       u8         reserved_at_5[0x17];
+       u8         reserved_at_5[0x16];
+       u8         ecpf_vport_exists[0x1];
        u8         counter_eswitch_affinity[0x1];
        u8         merged_eswitch[0x1];
        u8         nic_vport_node_guid_modify[0x1];
@@@ -8025,52 -8026,6 +8026,52 @@@ struct mlx5_ifc_ppcnt_reg_bits 
        union mlx5_ifc_eth_cntrs_grp_data_layout_auto_bits counter_set;
  };
  
 +struct mlx5_ifc_mpein_reg_bits {
 +      u8         reserved_at_0[0x2];
 +      u8         depth[0x6];
 +      u8         pcie_index[0x8];
 +      u8         node[0x8];
 +      u8         reserved_at_18[0x8];
 +
 +      u8         capability_mask[0x20];
 +
 +      u8         reserved_at_40[0x8];
 +      u8         link_width_enabled[0x8];
 +      u8         link_speed_enabled[0x10];
 +
 +      u8         lane0_physical_position[0x8];
 +      u8         link_width_active[0x8];
 +      u8         link_speed_active[0x10];
 +
 +      u8         num_of_pfs[0x10];
 +      u8         num_of_vfs[0x10];
 +
 +      u8         bdf0[0x10];
 +      u8         reserved_at_b0[0x10];
 +
 +      u8         max_read_request_size[0x4];
 +      u8         max_payload_size[0x4];
 +      u8         reserved_at_c8[0x5];
 +      u8         pwr_status[0x3];
 +      u8         port_type[0x4];
 +      u8         reserved_at_d4[0xb];
 +      u8         lane_reversal[0x1];
 +
 +      u8         reserved_at_e0[0x14];
 +      u8         pci_power[0xc];
 +
 +      u8         reserved_at_100[0x20];
 +
 +      u8         device_status[0x10];
 +      u8         port_state[0x8];
 +      u8         reserved_at_138[0x8];
 +
 +      u8         reserved_at_140[0x10];
 +      u8         receiver_detect_result[0x10];
 +
 +      u8         reserved_at_160[0x20];
 +};
 +
  struct mlx5_ifc_mpcnt_reg_bits {
        u8         reserved_at_0[0x8];
        u8         pcie_index[0x8];
@@@ -8390,9 -8345,7 +8391,9 @@@ struct mlx5_ifc_pcam_reg_bits 
  };
  
  struct mlx5_ifc_mcam_enhanced_features_bits {
 -      u8         reserved_at_0[0x74];
 +      u8         reserved_at_0[0x6e];
 +      u8         pci_status_and_power[0x1];
 +      u8         reserved_at_6f[0x5];
        u8         mark_tx_action_cnp[0x1];
        u8         mark_tx_action_cqe[0x1];
        u8         dynamic_tx_overflow[0x1];
@@@ -8520,9 -8473,17 +8521,17 @@@ struct mlx5_ifc_pamp_reg_bits 
  struct mlx5_ifc_pcmr_reg_bits {
        u8         reserved_at_0[0x8];
        u8         local_port[0x8];
-       u8         reserved_at_10[0x2e];
+       u8         reserved_at_10[0x10];
+       u8         entropy_force_cap[0x1];
+       u8         entropy_calc_cap[0x1];
+       u8         entropy_gre_calc_cap[0x1];
+       u8         reserved_at_23[0x1b];
        u8         fcs_cap[0x1];
-       u8         reserved_at_3f[0x1f];
+       u8         reserved_at_3f[0x1];
+       u8         entropy_force[0x1];
+       u8         entropy_calc[0x1];
+       u8         entropy_gre_calc[0x1];
+       u8         reserved_at_43[0x1b];
        u8         fcs_chk[0x1];
        u8         reserved_at_5f[0x1];
  };
@@@ -8992,7 -8953,6 +9001,7 @@@ union mlx5_ifc_ports_control_registers_
        struct mlx5_ifc_pmtu_reg_bits pmtu_reg;
        struct mlx5_ifc_ppad_reg_bits ppad_reg;
        struct mlx5_ifc_ppcnt_reg_bits ppcnt_reg;
 +      struct mlx5_ifc_mpein_reg_bits mpein_reg;
        struct mlx5_ifc_mpcnt_reg_bits mpcnt_reg;
        struct mlx5_ifc_pplm_reg_bits pplm_reg;
        struct mlx5_ifc_pplr_reg_bits pplr_reg;