Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 9 May 2019 16:02:46 +0000 (09:02 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 9 May 2019 16:02:46 +0000 (09:02 -0700)
Pull rdma updates from Jason Gunthorpe:
 "This has been a smaller cycle than normal. One new driver was
  accepted, which is unusual, and at least one more driver remains in
  review on the list.

  Summary:

   - Driver fixes for hns, hfi1, nes, rxe, i40iw, mlx5, cxgb4,
     vmw_pvrdma

   - Many patches from MatthewW converting radix tree and IDR users to
     use xarray

   - Introduction of tracepoints to the MAD layer

   - Build large SGLs at the start for DMA mapping and get the driver to
     split them

   - Generally clean SGL handling code throughout the subsystem

   - Support for restricting RDMA devices to net namespaces for
     containers

   - Progress to remove object allocation boilerplate code from drivers

   - Change in how the mlx5 driver shows representor ports linked to VFs

   - mlx5 uapi feature to access the on chip SW ICM memory

   - Add a new driver for 'EFA'. This is HW that supports user space
     packet processing through QPs in Amazon's cloud"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rdma/rdma: (186 commits)
  RDMA/ipoib: Allow user space differentiate between valid dev_port
  IB/core, ipoib: Do not overreact to SM LID change event
  RDMA/device: Don't fire uevent before device is fully initialized
  lib/scatterlist: Remove leftover from sg_page_iter comment
  RDMA/efa: Add driver to Kconfig/Makefile
  RDMA/efa: Add the efa module
  RDMA/efa: Add EFA verbs implementation
  RDMA/efa: Add common command handlers
  RDMA/efa: Implement functions that submit and complete admin commands
  RDMA/efa: Add the ABI definitions
  RDMA/efa: Add the com service API definitions
  RDMA/efa: Add the efa_com.h file
  RDMA/efa: Add the efa.h header file
  RDMA/efa: Add EFA device definitions
  RDMA: Add EFA related definitions
  RDMA/umem: Remove hugetlb flag
  RDMA/bnxt_re: Use core helpers to get aligned DMA address
  RDMA/i40iw: Use core helpers to get aligned DMA address within a supported page size
  RDMA/verbs: Add a DMA iterator to return aligned contiguous memory blocks
  RDMA/umem: Add API to find best driver supported page size in an MR
  ...

251 files changed:
MAINTAINERS
drivers/infiniband/Kconfig
drivers/infiniband/core/addr.c
drivers/infiniband/core/cache.c
drivers/infiniband/core/cm.c
drivers/infiniband/core/cm_msgs.h
drivers/infiniband/core/cma.c
drivers/infiniband/core/core_priv.h
drivers/infiniband/core/cq.c
drivers/infiniband/core/device.c
drivers/infiniband/core/iwcm.c
drivers/infiniband/core/mad.c
drivers/infiniband/core/mad_priv.h
drivers/infiniband/core/multicast.c
drivers/infiniband/core/nldev.c
drivers/infiniband/core/rdma_core.c
drivers/infiniband/core/rdma_core.h
drivers/infiniband/core/sa_query.c
drivers/infiniband/core/sysfs.c
drivers/infiniband/core/ucm.c
drivers/infiniband/core/umem.c
drivers/infiniband/core/umem_odp.c
drivers/infiniband/core/user_mad.c
drivers/infiniband/core/uverbs.h
drivers/infiniband/core/uverbs_cmd.c
drivers/infiniband/core/uverbs_ioctl.c
drivers/infiniband/core/uverbs_main.c
drivers/infiniband/core/uverbs_std_types.c
drivers/infiniband/core/uverbs_std_types_counters.c
drivers/infiniband/core/uverbs_std_types_cq.c
drivers/infiniband/core/uverbs_std_types_dm.c
drivers/infiniband/core/uverbs_std_types_flow_action.c
drivers/infiniband/core/uverbs_std_types_mr.c
drivers/infiniband/core/verbs.c
drivers/infiniband/hw/Makefile
drivers/infiniband/hw/bnxt_re/Kconfig
drivers/infiniband/hw/bnxt_re/ib_verbs.c
drivers/infiniband/hw/bnxt_re/ib_verbs.h
drivers/infiniband/hw/bnxt_re/main.c
drivers/infiniband/hw/bnxt_re/qplib_fp.c
drivers/infiniband/hw/bnxt_re/qplib_fp.h
drivers/infiniband/hw/bnxt_re/qplib_rcfw.c
drivers/infiniband/hw/bnxt_re/qplib_res.c
drivers/infiniband/hw/bnxt_re/qplib_res.h
drivers/infiniband/hw/bnxt_re/qplib_sp.c
drivers/infiniband/hw/bnxt_re/qplib_sp.h
drivers/infiniband/hw/cxgb3/cxio_wr.h
drivers/infiniband/hw/cxgb3/iwch.c
drivers/infiniband/hw/cxgb3/iwch.h
drivers/infiniband/hw/cxgb3/iwch_ev.c
drivers/infiniband/hw/cxgb3/iwch_mem.c
drivers/infiniband/hw/cxgb3/iwch_provider.c
drivers/infiniband/hw/cxgb4/cm.c
drivers/infiniband/hw/cxgb4/cq.c
drivers/infiniband/hw/cxgb4/device.c
drivers/infiniband/hw/cxgb4/ev.c
drivers/infiniband/hw/cxgb4/iw_cxgb4.h
drivers/infiniband/hw/cxgb4/mem.c
drivers/infiniband/hw/cxgb4/provider.c
drivers/infiniband/hw/cxgb4/qp.c
drivers/infiniband/hw/efa/Kconfig [new file with mode: 0644]
drivers/infiniband/hw/efa/Makefile [new file with mode: 0644]
drivers/infiniband/hw/efa/efa.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_admin_cmds_defs.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_admin_defs.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_com.c [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_com.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_com_cmd.c [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_com_cmd.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_common_defs.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_main.c [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_regs_defs.h [new file with mode: 0644]
drivers/infiniband/hw/efa/efa_verbs.c [new file with mode: 0644]
drivers/infiniband/hw/hfi1/chip.c
drivers/infiniband/hw/hfi1/chip.h
drivers/infiniband/hw/hfi1/chip_registers.h
drivers/infiniband/hw/hfi1/common.h
drivers/infiniband/hw/hfi1/debugfs.c
drivers/infiniband/hw/hfi1/driver.c
drivers/infiniband/hw/hfi1/exp_rcv.c
drivers/infiniband/hw/hfi1/hfi.h
drivers/infiniband/hw/hfi1/init.c
drivers/infiniband/hw/hfi1/opfn.h
drivers/infiniband/hw/hfi1/qp.c
drivers/infiniband/hw/hfi1/rc.c
drivers/infiniband/hw/hfi1/rc.h
drivers/infiniband/hw/hfi1/ruc.c
drivers/infiniband/hw/hfi1/tid_rdma.c
drivers/infiniband/hw/hfi1/tid_rdma.h
drivers/infiniband/hw/hfi1/trace_dbg.h
drivers/infiniband/hw/hfi1/trace_tid.h
drivers/infiniband/hw/hfi1/verbs.c
drivers/infiniband/hw/hfi1/verbs.h
drivers/infiniband/hw/hfi1/vnic_main.c
drivers/infiniband/hw/hns/Makefile
drivers/infiniband/hw/hns/hns_roce_ah.c
drivers/infiniband/hw/hns/hns_roce_cmd.h
drivers/infiniband/hw/hns/hns_roce_common.h
drivers/infiniband/hw/hns/hns_roce_cq.c
drivers/infiniband/hw/hns/hns_roce_device.h
drivers/infiniband/hw/hns/hns_roce_hw_v1.c
drivers/infiniband/hw/hns/hns_roce_hw_v1.h
drivers/infiniband/hw/hns/hns_roce_hw_v2.c
drivers/infiniband/hw/hns/hns_roce_hw_v2.h
drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c [new file with mode: 0644]
drivers/infiniband/hw/hns/hns_roce_main.c
drivers/infiniband/hw/hns/hns_roce_mr.c
drivers/infiniband/hw/hns/hns_roce_pd.c
drivers/infiniband/hw/hns/hns_roce_qp.c
drivers/infiniband/hw/hns/hns_roce_restrack.c [new file with mode: 0644]
drivers/infiniband/hw/hns/hns_roce_srq.c
drivers/infiniband/hw/i40iw/i40iw.h
drivers/infiniband/hw/i40iw/i40iw_cm.c
drivers/infiniband/hw/i40iw/i40iw_main.c
drivers/infiniband/hw/i40iw/i40iw_verbs.c
drivers/infiniband/hw/i40iw/i40iw_verbs.h
drivers/infiniband/hw/mlx4/ah.c
drivers/infiniband/hw/mlx4/cm.c
drivers/infiniband/hw/mlx4/cq.c
drivers/infiniband/hw/mlx4/doorbell.c
drivers/infiniband/hw/mlx4/mad.c
drivers/infiniband/hw/mlx4/main.c
drivers/infiniband/hw/mlx4/mlx4_ib.h
drivers/infiniband/hw/mlx4/mr.c
drivers/infiniband/hw/mlx4/qp.c
drivers/infiniband/hw/mlx4/srq.c
drivers/infiniband/hw/mlx5/ah.c
drivers/infiniband/hw/mlx5/cmd.c
drivers/infiniband/hw/mlx5/cmd.h
drivers/infiniband/hw/mlx5/cq.c
drivers/infiniband/hw/mlx5/devx.c
drivers/infiniband/hw/mlx5/flow.c
drivers/infiniband/hw/mlx5/ib_rep.c
drivers/infiniband/hw/mlx5/ib_rep.h
drivers/infiniband/hw/mlx5/main.c
drivers/infiniband/hw/mlx5/mlx5_ib.h
drivers/infiniband/hw/mlx5/mr.c
drivers/infiniband/hw/mlx5/odp.c
drivers/infiniband/hw/mlx5/qp.c
drivers/infiniband/hw/mlx5/srq.c
drivers/infiniband/hw/mlx5/srq.h
drivers/infiniband/hw/mlx5/srq_cmd.c
drivers/infiniband/hw/mthca/mthca_cq.c
drivers/infiniband/hw/mthca/mthca_eq.c
drivers/infiniband/hw/mthca/mthca_mr.c
drivers/infiniband/hw/mthca/mthca_provider.c
drivers/infiniband/hw/mthca/mthca_qp.c
drivers/infiniband/hw/nes/nes_cm.c
drivers/infiniband/hw/nes/nes_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_ah.c
drivers/infiniband/hw/ocrdma/ocrdma_ah.h
drivers/infiniband/hw/ocrdma/ocrdma_hw.c
drivers/infiniband/hw/ocrdma/ocrdma_hw.h
drivers/infiniband/hw/ocrdma/ocrdma_main.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
drivers/infiniband/hw/qedr/main.c
drivers/infiniband/hw/qedr/qedr.h
drivers/infiniband/hw/qedr/qedr_iw_cm.c
drivers/infiniband/hw/qedr/qedr_roce_cm.c
drivers/infiniband/hw/qedr/verbs.c
drivers/infiniband/hw/qedr/verbs.h
drivers/infiniband/hw/qib/qib.h
drivers/infiniband/hw/qib/qib_common.h
drivers/infiniband/hw/qib/qib_driver.c
drivers/infiniband/hw/qib/qib_fs.c
drivers/infiniband/hw/qib/qib_iba7322.c
drivers/infiniband/hw/qib/qib_init.c
drivers/infiniband/hw/qib/qib_rc.c
drivers/infiniband/hw/qib/qib_user_sdma.c
drivers/infiniband/hw/qib/qib_verbs.h
drivers/infiniband/hw/usnic/usnic_ib_verbs.c
drivers/infiniband/hw/usnic/usnic_ib_verbs.h
drivers/infiniband/hw/usnic/usnic_uiom.c
drivers/infiniband/hw/usnic/usnic_uiom.h
drivers/infiniband/hw/vmw_pvrdma/pvrdma_cq.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_main.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_mr.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_qp.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_srq.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.c
drivers/infiniband/hw/vmw_pvrdma/pvrdma_verbs.h
drivers/infiniband/sw/rdmavt/ah.c
drivers/infiniband/sw/rdmavt/ah.h
drivers/infiniband/sw/rdmavt/cq.c
drivers/infiniband/sw/rdmavt/cq.h
drivers/infiniband/sw/rdmavt/mmap.c
drivers/infiniband/sw/rdmavt/mmap.h
drivers/infiniband/sw/rdmavt/mr.c
drivers/infiniband/sw/rdmavt/mr.h
drivers/infiniband/sw/rdmavt/pd.c
drivers/infiniband/sw/rdmavt/pd.h
drivers/infiniband/sw/rdmavt/qp.c
drivers/infiniband/sw/rdmavt/qp.h
drivers/infiniband/sw/rdmavt/rc.c
drivers/infiniband/sw/rdmavt/srq.c
drivers/infiniband/sw/rdmavt/srq.h
drivers/infiniband/sw/rdmavt/trace_qp.h
drivers/infiniband/sw/rdmavt/trace_rc.h
drivers/infiniband/sw/rdmavt/trace_tx.h
drivers/infiniband/sw/rdmavt/vt.c
drivers/infiniband/sw/rxe/rxe_cq.c
drivers/infiniband/sw/rxe/rxe_hdr.h
drivers/infiniband/sw/rxe/rxe_loc.h
drivers/infiniband/sw/rxe/rxe_mmap.c
drivers/infiniband/sw/rxe/rxe_mr.c
drivers/infiniband/sw/rxe/rxe_net.c
drivers/infiniband/sw/rxe/rxe_pool.c
drivers/infiniband/sw/rxe/rxe_qp.c
drivers/infiniband/sw/rxe/rxe_queue.c
drivers/infiniband/sw/rxe/rxe_queue.h
drivers/infiniband/sw/rxe/rxe_srq.c
drivers/infiniband/sw/rxe/rxe_verbs.c
drivers/infiniband/sw/rxe/rxe_verbs.h
drivers/infiniband/ulp/ipoib/ipoib_main.c
drivers/infiniband/ulp/ipoib/ipoib_verbs.c
drivers/infiniband/ulp/iser/Kconfig
drivers/infiniband/ulp/iser/iscsi_iser.c
drivers/infiniband/ulp/iser/iscsi_iser.h
drivers/infiniband/ulp/opa_vnic/opa_vnic_vema.c
drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
include/linux/dynamic_debug.h
include/linux/mlx5/driver.h
include/linux/overflow.h
include/linux/scatterlist.h
include/rdma/ib_cache.h
include/rdma/ib_mad.h
include/rdma/ib_smi.h
include/rdma/ib_umem.h
include/rdma/ib_umem_odp.h
include/rdma/ib_verbs.h
include/rdma/iw_cm.h
include/rdma/opa_port_info.h
include/rdma/opa_smi.h
include/rdma/rdma_vt.h
include/rdma/rdmavt_qp.h
include/rdma/uverbs_std_types.h
include/rdma/uverbs_types.h
include/trace/events/ib_mad.h [new file with mode: 0644]
include/trace/events/ib_umad.h [new file with mode: 0644]
include/uapi/rdma/efa-abi.h [new file with mode: 0644]
include/uapi/rdma/mlx5-abi.h
include/uapi/rdma/mlx5_user_ioctl_cmds.h
include/uapi/rdma/mlx5_user_ioctl_verbs.h
include/uapi/rdma/rdma_netlink.h
include/uapi/rdma/rdma_user_ioctl_cmds.h
lib/dynamic_debug.c
net/smc/smc_ib.c
samples/bpf/Makefile
samples/bpf/ibumad_kern.c [new file with mode: 0644]
samples/bpf/ibumad_user.c [new file with mode: 0644]

index a225661..ae4063e 100644 (file)
@@ -745,6 +745,15 @@ S: Supported
 F:     Documentation/networking/device_drivers/amazon/ena.txt
 F:     drivers/net/ethernet/amazon/
 
+AMAZON RDMA EFA DRIVER
+M:     Gal Pressman <galpress@amazon.com>
+R:     Yossi Leybovich <sleybo@amazon.com>
+L:     linux-rdma@vger.kernel.org
+Q:     https://patchwork.kernel.org/project/linux-rdma/list/
+S:     Supported
+F:     drivers/infiniband/hw/efa/
+F:     include/uapi/rdma/efa-abi.h
+
 AMD CRYPTOGRAPHIC COPROCESSOR (CCP) DRIVER
 M:     Tom Lendacky <thomas.lendacky@amd.com>
 M:     Gary Hook <gary.hook@amd.com>
@@ -4279,7 +4288,7 @@ S:        Supported
 F:     drivers/scsi/cxgbi/cxgb3i
 
 CXGB3 IWARP RNIC DRIVER (IW_CXGB3)
-M:     Steve Wise <swise@chelsio.com>
+M:     Potnuri Bharat Teja <bharat@chelsio.com>
 L:     linux-rdma@vger.kernel.org
 W:     http://www.openfabrics.org
 S:     Supported
@@ -4308,7 +4317,7 @@ S:        Supported
 F:     drivers/scsi/cxgbi/cxgb4i
 
 CXGB4 IWARP RNIC DRIVER (IW_CXGB4)
-M:     Steve Wise <swise@chelsio.com>
+M:     Potnuri Bharat Teja <bharat@chelsio.com>
 L:     linux-rdma@vger.kernel.org
 W:     http://www.openfabrics.org
 S:     Supported
@@ -7727,6 +7736,10 @@ F:       drivers/infiniband/
 F:     include/uapi/linux/if_infiniband.h
 F:     include/uapi/rdma/
 F:     include/rdma/
+F:     include/trace/events/ib_mad.h
+F:     include/trace/events/ib_umad.h
+F:     samples/bpf/ibumad_kern.c
+F:     samples/bpf/ibumad_user.c
 
 INGENIC JZ4780 DMA Driver
 M:     Zubair Lutfullah Kakakhel <Zubair.Kakakhel@imgtec.com>
index d318bab..cbfbea4 100644 (file)
@@ -93,6 +93,7 @@ source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
+source "drivers/infiniband/hw/efa/Kconfig"
 source "drivers/infiniband/hw/i40iw/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
index 744b6ec..ba01b90 100644 (file)
@@ -45,6 +45,7 @@
 #include <net/ipv6_stubs.h>
 #include <net/ip6_route.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 #include <rdma/ib_sa.h>
 #include <rdma/ib.h>
 #include <rdma/rdma_netlink.h>
index 43c67e5..18e476b 100644 (file)
@@ -78,11 +78,22 @@ enum gid_table_entry_state {
        GID_TABLE_ENTRY_PENDING_DEL     = 3,
 };
 
+struct roce_gid_ndev_storage {
+       struct rcu_head rcu_head;
+       struct net_device *ndev;
+};
+
 struct ib_gid_table_entry {
        struct kref                     kref;
        struct work_struct              del_work;
        struct ib_gid_attr              attr;
        void                            *context;
+       /* Store the ndev pointer to release reference later on in
+        * call_rcu context because by that time gid_table_entry
+        * and attr might be already freed. So keep a copy of it.
+        * ndev_storage is freed by rcu callback.
+        */
+       struct roce_gid_ndev_storage    *ndev_storage;
        enum gid_table_entry_state      state;
 };
 
@@ -206,6 +217,20 @@ static void schedule_free_gid(struct kref *kref)
        queue_work(ib_wq, &entry->del_work);
 }
 
+static void put_gid_ndev(struct rcu_head *head)
+{
+       struct roce_gid_ndev_storage *storage =
+               container_of(head, struct roce_gid_ndev_storage, rcu_head);
+
+       WARN_ON(!storage->ndev);
+       /* At this point it's safe to release netdev reference,
+        * as all callers working on gid_attr->ndev are done
+        * using this netdev.
+        */
+       dev_put(storage->ndev);
+       kfree(storage);
+}
+
 static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
 {
        struct ib_device *device = entry->attr.device;
@@ -228,8 +253,8 @@ static void free_gid_entry_locked(struct ib_gid_table_entry *entry)
        /* Now this index is ready to be allocated */
        write_unlock_irq(&table->rwlock);
 
-       if (entry->attr.ndev)
-               dev_put(entry->attr.ndev);
+       if (entry->ndev_storage)
+               call_rcu(&entry->ndev_storage->rcu_head, put_gid_ndev);
        kfree(entry);
 }
 
@@ -266,14 +291,25 @@ static struct ib_gid_table_entry *
 alloc_gid_entry(const struct ib_gid_attr *attr)
 {
        struct ib_gid_table_entry *entry;
+       struct net_device *ndev;
 
        entry = kzalloc(sizeof(*entry), GFP_KERNEL);
        if (!entry)
                return NULL;
+
+       ndev = rcu_dereference_protected(attr->ndev, 1);
+       if (ndev) {
+               entry->ndev_storage = kzalloc(sizeof(*entry->ndev_storage),
+                                             GFP_KERNEL);
+               if (!entry->ndev_storage) {
+                       kfree(entry);
+                       return NULL;
+               }
+               dev_hold(ndev);
+               entry->ndev_storage->ndev = ndev;
+       }
        kref_init(&entry->kref);
        memcpy(&entry->attr, attr, sizeof(*attr));
-       if (entry->attr.ndev)
-               dev_hold(entry->attr.ndev);
        INIT_WORK(&entry->del_work, free_gid_work);
        entry->state = GID_TABLE_ENTRY_INVALID;
        return entry;
@@ -343,6 +379,7 @@ static int add_roce_gid(struct ib_gid_table_entry *entry)
 static void del_gid(struct ib_device *ib_dev, u8 port,
                    struct ib_gid_table *table, int ix)
 {
+       struct roce_gid_ndev_storage *ndev_storage;
        struct ib_gid_table_entry *entry;
 
        lockdep_assert_held(&table->lock);
@@ -360,6 +397,13 @@ static void del_gid(struct ib_device *ib_dev, u8 port,
                table->data_vec[ix] = NULL;
        write_unlock_irq(&table->rwlock);
 
+       ndev_storage = entry->ndev_storage;
+       if (ndev_storage) {
+               entry->ndev_storage = NULL;
+               rcu_assign_pointer(entry->attr.ndev, NULL);
+               call_rcu(&ndev_storage->rcu_head, put_gid_ndev);
+       }
+
        if (rdma_cap_roce_gid_table(ib_dev, port))
                ib_dev->ops.del_gid(&entry->attr, &entry->context);
 
@@ -543,30 +587,11 @@ out_unlock:
 int ib_cache_gid_add(struct ib_device *ib_dev, u8 port,
                     union ib_gid *gid, struct ib_gid_attr *attr)
 {
-       struct net_device *idev;
-       unsigned long mask;
-       int ret;
-
-       idev = ib_device_get_netdev(ib_dev, port);
-       if (idev && attr->ndev != idev) {
-               union ib_gid default_gid;
-
-               /* Adding default GIDs is not permitted */
-               make_default_gid(idev, &default_gid);
-               if (!memcmp(gid, &default_gid, sizeof(*gid))) {
-                       dev_put(idev);
-                       return -EPERM;
-               }
-       }
-       if (idev)
-               dev_put(idev);
-
-       mask = GID_ATTR_FIND_MASK_GID |
-              GID_ATTR_FIND_MASK_GID_TYPE |
-              GID_ATTR_FIND_MASK_NETDEV;
+       unsigned long mask = GID_ATTR_FIND_MASK_GID |
+                            GID_ATTR_FIND_MASK_GID_TYPE |
+                            GID_ATTR_FIND_MASK_NETDEV;
 
-       ret = __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
-       return ret;
+       return __ib_cache_gid_add(ib_dev, port, gid, attr, mask, false);
 }
 
 static int
@@ -1263,11 +1288,72 @@ struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr)
 
        read_lock_irqsave(&table->rwlock, flags);
        valid = is_gid_entry_valid(table->data_vec[attr->index]);
-       if (valid && attr->ndev && (READ_ONCE(attr->ndev->flags) & IFF_UP))
-               ndev = attr->ndev;
+       if (valid) {
+               ndev = rcu_dereference(attr->ndev);
+               if (!ndev ||
+                   (ndev && ((READ_ONCE(ndev->flags) & IFF_UP) == 0)))
+                       ndev = ERR_PTR(-ENODEV);
+       }
        read_unlock_irqrestore(&table->rwlock, flags);
        return ndev;
 }
+EXPORT_SYMBOL(rdma_read_gid_attr_ndev_rcu);
+
+static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
+{
+       u16 *vlan_id = data;
+
+       if (is_vlan_dev(lower_dev))
+               *vlan_id = vlan_dev_vlan_id(lower_dev);
+
+       /* We are interested only in first level vlan device, so
+        * always return 1 to stop iterating over next level devices.
+        */
+       return 1;
+}
+
+/**
+ * rdma_read_gid_l2_fields - Read the vlan ID and source MAC address
+ *                          of a GID entry.
+ *
+ * @attr:      GID attribute pointer whose L2 fields are to be read
+ * @vlan_id:   Pointer to vlan id to fill up if the GID entry has
+ *             vlan id. It is optional.
+ * @smac:      Pointer to smac to fill up for a GID entry. It is optional.
+ *
+ * rdma_read_gid_l2_fields() returns 0 on success and returns vlan id
+ * (if gid entry has vlan) and source MAC, or returns error.
+ */
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+                           u16 *vlan_id, u8 *smac)
+{
+       struct net_device *ndev;
+
+       rcu_read_lock();
+       ndev = rcu_dereference(attr->ndev);
+       if (!ndev) {
+               rcu_read_unlock();
+               return -ENODEV;
+       }
+       if (smac)
+               ether_addr_copy(smac, ndev->dev_addr);
+       if (vlan_id) {
+               *vlan_id = 0xffff;
+               if (is_vlan_dev(ndev)) {
+                       *vlan_id = vlan_dev_vlan_id(ndev);
+               } else {
+                       /* If the netdev is an upper device and its lower
+                        * device is a vlan device, consider the vlan id of
+                        * the lower vlan device for this gid entry.
+                        */
+                       netdev_walk_all_lower_dev_rcu(attr->ndev,
+                                       get_lower_dev_vlan, vlan_id);
+               }
+       }
+       rcu_read_unlock();
+       return 0;
+}
+EXPORT_SYMBOL(rdma_read_gid_l2_fields);
 
 static int config_non_roce_gid_cache(struct ib_device *device,
                                     u8 port, int gid_tbl_len)
@@ -1392,7 +1478,6 @@ static void ib_cache_event(struct ib_event_handler *handler,
            event->event == IB_EVENT_PORT_ACTIVE ||
            event->event == IB_EVENT_LID_CHANGE  ||
            event->event == IB_EVENT_PKEY_CHANGE ||
-           event->event == IB_EVENT_SM_CHANGE   ||
            event->event == IB_EVENT_CLIENT_REREGISTER ||
            event->event == IB_EVENT_GID_CHANGE) {
                work = kmalloc(sizeof *work, GFP_ATOMIC);
index b9416a6..da10e6c 100644 (file)
@@ -52,6 +52,7 @@
 #include <rdma/ib_cache.h>
 #include <rdma/ib_cm.h>
 #include "cm_msgs.h"
+#include "core_priv.h"
 
 MODULE_AUTHOR("Sean Hefty");
 MODULE_DESCRIPTION("InfiniBand CM");
@@ -124,7 +125,8 @@ static struct ib_cm {
        struct rb_root remote_qp_table;
        struct rb_root remote_id_table;
        struct rb_root remote_sidr_table;
-       struct idr local_id_table;
+       struct xarray local_id_table;
+       u32 local_id_next;
        __be32 random_id_operand;
        struct list_head timewait_list;
        struct workqueue_struct *wq;
@@ -219,7 +221,6 @@ struct cm_port {
 struct cm_device {
        struct list_head list;
        struct ib_device *ib_device;
-       struct device *device;
        u8 ack_delay;
        int going_down;
        struct cm_port *port[0];
@@ -598,35 +599,31 @@ static int cm_init_av_by_path(struct sa_path_rec *path,
 
 static int cm_alloc_id(struct cm_id_private *cm_id_priv)
 {
-       unsigned long flags;
-       int id;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&cm.lock, flags);
+       int err;
+       u32 id;
 
-       id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
-
-       spin_unlock_irqrestore(&cm.lock, flags);
-       idr_preload_end();
+       err = xa_alloc_cyclic_irq(&cm.local_id_table, &id, cm_id_priv,
+                       xa_limit_32b, &cm.local_id_next, GFP_KERNEL);
 
        cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
-       return id < 0 ? id : 0;
+       return err;
+}
+
+static u32 cm_local_id(__be32 local_id)
+{
+       return (__force u32) (local_id ^ cm.random_id_operand);
 }
 
 static void cm_free_id(__be32 local_id)
 {
-       spin_lock_irq(&cm.lock);
-       idr_remove(&cm.local_id_table,
-                  (__force int) (local_id ^ cm.random_id_operand));
-       spin_unlock_irq(&cm.lock);
+       xa_erase_irq(&cm.local_id_table, cm_local_id(local_id));
 }
 
 static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
 {
        struct cm_id_private *cm_id_priv;
 
-       cm_id_priv = idr_find(&cm.local_id_table,
-                             (__force int) (local_id ^ cm.random_id_operand));
+       cm_id_priv = xa_load(&cm.local_id_table, cm_local_id(local_id));
        if (cm_id_priv) {
                if (cm_id_priv->id.remote_id == remote_id)
                        atomic_inc(&cm_id_priv->refcount);
@@ -1988,11 +1985,12 @@ static int cm_req_handler(struct cm_work *work)
        grh = rdma_ah_read_grh(&cm_id_priv->av.ah_attr);
        gid_attr = grh->sgid_attr;
 
-       if (gid_attr && gid_attr->ndev) {
+       if (gid_attr &&
+           rdma_protocol_roce(work->port->cm_dev->ib_device,
+                              work->port->port_num)) {
                work->path[0].rec_type =
                        sa_conv_gid_to_pathrec_type(gid_attr->gid_type);
        } else {
-               /* If no GID attribute or ndev is null, it is not RoCE. */
                cm_path_set_rec_type(work->port->cm_dev->ib_device,
                                     work->port->port_num,
                                     &work->path[0],
@@ -2824,9 +2822,8 @@ static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
                        spin_unlock_irq(&cm.lock);
                        return NULL;
                }
-               cm_id_priv = idr_find(&cm.local_id_table, (__force int)
-                                     (timewait_info->work.local_id ^
-                                      cm.random_id_operand));
+               cm_id_priv = xa_load(&cm.local_id_table,
+                               cm_local_id(timewait_info->work.local_id));
                if (cm_id_priv) {
                        if (cm_id_priv->id.remote_id == remote_id)
                                atomic_inc(&cm_id_priv->refcount);
@@ -4276,18 +4273,6 @@ static struct kobj_type cm_counter_obj_type = {
        .default_attrs = cm_counter_default_attrs
 };
 
-static void cm_release_port_obj(struct kobject *obj)
-{
-       struct cm_port *cm_port;
-
-       cm_port = container_of(obj, struct cm_port, port_obj);
-       kfree(cm_port);
-}
-
-static struct kobj_type cm_port_obj_type = {
-       .release = cm_release_port_obj
-};
-
 static char *cm_devnode(struct device *dev, umode_t *mode)
 {
        if (mode)
@@ -4306,19 +4291,12 @@ static int cm_create_port_fs(struct cm_port *port)
 {
        int i, ret;
 
-       ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
-                                  &port->cm_dev->device->kobj,
-                                  "%d", port->port_num);
-       if (ret) {
-               kfree(port);
-               return ret;
-       }
-
        for (i = 0; i < CM_COUNTER_GROUPS; i++) {
-               ret = kobject_init_and_add(&port->counter_group[i].obj,
-                                          &cm_counter_obj_type,
-                                          &port->port_obj,
-                                          "%s", counter_group_names[i]);
+               ret = ib_port_register_module_stat(port->cm_dev->ib_device,
+                                                  port->port_num,
+                                                  &port->counter_group[i].obj,
+                                                  &cm_counter_obj_type,
+                                                  counter_group_names[i]);
                if (ret)
                        goto error;
        }
@@ -4327,8 +4305,7 @@ static int cm_create_port_fs(struct cm_port *port)
 
 error:
        while (i--)
-               kobject_put(&port->counter_group[i].obj);
-       kobject_put(&port->port_obj);
+               ib_port_unregister_module_stat(&port->counter_group[i].obj);
        return ret;
 
 }
@@ -4338,9 +4315,8 @@ static void cm_remove_port_fs(struct cm_port *port)
        int i;
 
        for (i = 0; i < CM_COUNTER_GROUPS; i++)
-               kobject_put(&port->counter_group[i].obj);
+               ib_port_unregister_module_stat(&port->counter_group[i].obj);
 
-       kobject_put(&port->port_obj);
 }
 
 static void cm_add_one(struct ib_device *ib_device)
@@ -4367,13 +4343,6 @@ static void cm_add_one(struct ib_device *ib_device)
        cm_dev->ib_device = ib_device;
        cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay;
        cm_dev->going_down = 0;
-       cm_dev->device = device_create(&cm_class, &ib_device->dev,
-                                      MKDEV(0, 0), NULL,
-                                      "%s", dev_name(&ib_device->dev));
-       if (IS_ERR(cm_dev->device)) {
-               kfree(cm_dev);
-               return;
-       }
 
        set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
        for (i = 1; i <= ib_device->phys_port_cnt; i++) {
@@ -4440,7 +4409,6 @@ error1:
                cm_remove_port_fs(port);
        }
 free:
-       device_unregister(cm_dev->device);
        kfree(cm_dev);
 }
 
@@ -4494,7 +4462,6 @@ static void cm_remove_one(struct ib_device *ib_device, void *client_data)
                cm_remove_port_fs(port);
        }
 
-       device_unregister(cm_dev->device);
        kfree(cm_dev);
 }
 
@@ -4502,7 +4469,6 @@ static int __init ib_cm_init(void)
 {
        int ret;
 
-       memset(&cm, 0, sizeof cm);
        INIT_LIST_HEAD(&cm.device_list);
        rwlock_init(&cm.device_lock);
        spin_lock_init(&cm.lock);
@@ -4512,7 +4478,7 @@ static int __init ib_cm_init(void)
        cm.remote_id_table = RB_ROOT;
        cm.remote_qp_table = RB_ROOT;
        cm.remote_sidr_table = RB_ROOT;
-       idr_init(&cm.local_id_table);
+       xa_init_flags(&cm.local_id_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
        get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
        INIT_LIST_HEAD(&cm.timewait_list);
 
@@ -4538,7 +4504,6 @@ error3:
 error2:
        class_unregister(&cm_class);
 error1:
-       idr_destroy(&cm.local_id_table);
        return ret;
 }
 
@@ -4560,9 +4525,8 @@ static void __exit ib_cm_cleanup(void)
        }
 
        class_unregister(&cm_class);
-       idr_destroy(&cm.local_id_table);
+       WARN_ON(!xa_empty(&cm.local_id_table));
 }
 
 module_init(ib_cm_init);
 module_exit(ib_cm_cleanup);
-
index 476d430..3d16d61 100644 (file)
@@ -98,7 +98,7 @@ struct cm_req_msg {
 
        u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
 {
@@ -423,7 +423,7 @@ enum cm_msg_response {
 
        u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
 {
@@ -461,7 +461,7 @@ struct cm_rej_msg {
 
        u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
 {
@@ -506,7 +506,7 @@ struct cm_rep_msg {
 
        u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
 {
@@ -614,7 +614,7 @@ struct cm_rtu_msg {
 
        u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_dreq_msg {
        struct ib_mad_hdr hdr;
@@ -626,7 +626,7 @@ struct cm_dreq_msg {
 
        u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
 {
@@ -647,7 +647,7 @@ struct cm_drep_msg {
 
        u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
 
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_lap_msg {
        struct ib_mad_hdr hdr;
@@ -675,7 +675,7 @@ struct cm_lap_msg {
        u8 offset63;
 
        u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
-} __attribute__  ((packed));
+} __packed;
 
 static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
 {
@@ -784,7 +784,7 @@ struct cm_apr_msg {
        u8 info[IB_CM_APR_INFO_LENGTH];
 
        u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_sidr_req_msg {
        struct ib_mad_hdr hdr;
@@ -795,7 +795,7 @@ struct cm_sidr_req_msg {
        __be64 service_id;
 
        u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
-} __attribute__ ((packed));
+} __packed;
 
 struct cm_sidr_rep_msg {
        struct ib_mad_hdr hdr;
@@ -811,7 +811,7 @@ struct cm_sidr_rep_msg {
        u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
 
        u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
-} __attribute__ ((packed));
+} __packed;
 
 static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
 {
index 68c997b..19f1730 100644 (file)
@@ -39,7 +39,7 @@
 #include <linux/mutex.h>
 #include <linux/random.h>
 #include <linux/igmp.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/inetdevice.h>
 #include <linux/slab.h>
 #include <linux/module.h>
@@ -191,10 +191,10 @@ static struct workqueue_struct *cma_wq;
 static unsigned int cma_pernet_id;
 
 struct cma_pernet {
-       struct idr tcp_ps;
-       struct idr udp_ps;
-       struct idr ipoib_ps;
-       struct idr ib_ps;
+       struct xarray tcp_ps;
+       struct xarray udp_ps;
+       struct xarray ipoib_ps;
+       struct xarray ib_ps;
 };
 
 static struct cma_pernet *cma_pernet(struct net *net)
@@ -202,7 +202,8 @@ static struct cma_pernet *cma_pernet(struct net *net)
        return net_generic(net, cma_pernet_id);
 }
 
-static struct idr *cma_pernet_idr(struct net *net, enum rdma_ucm_port_space ps)
+static
+struct xarray *cma_pernet_xa(struct net *net, enum rdma_ucm_port_space ps)
 {
        struct cma_pernet *pernet = cma_pernet(net);
 
@@ -247,25 +248,25 @@ struct class_port_info_context {
 static int cma_ps_alloc(struct net *net, enum rdma_ucm_port_space ps,
                        struct rdma_bind_list *bind_list, int snum)
 {
-       struct idr *idr = cma_pernet_idr(net, ps);
+       struct xarray *xa = cma_pernet_xa(net, ps);
 
-       return idr_alloc(idr, bind_list, snum, snum + 1, GFP_KERNEL);
+       return xa_insert(xa, snum, bind_list, GFP_KERNEL);
 }
 
 static struct rdma_bind_list *cma_ps_find(struct net *net,
                                          enum rdma_ucm_port_space ps, int snum)
 {
-       struct idr *idr = cma_pernet_idr(net, ps);
+       struct xarray *xa = cma_pernet_xa(net, ps);
 
-       return idr_find(idr, snum);
+       return xa_load(xa, snum);
 }
 
 static void cma_ps_remove(struct net *net, enum rdma_ucm_port_space ps,
                          int snum)
 {
-       struct idr *idr = cma_pernet_idr(net, ps);
+       struct xarray *xa = cma_pernet_xa(net, ps);
 
-       idr_remove(idr, snum);
+       xa_erase(xa, snum);
 }
 
 enum {
@@ -615,6 +616,9 @@ cma_validate_port(struct ib_device *device, u8 port,
        int dev_type = dev_addr->dev_type;
        struct net_device *ndev = NULL;
 
+       if (!rdma_dev_access_netns(device, id_priv->id.route.addr.dev_addr.net))
+               return ERR_PTR(-ENODEV);
+
        if ((dev_type == ARPHRD_INFINIBAND) && !rdma_protocol_ib(device, port))
                return ERR_PTR(-ENODEV);
 
@@ -1173,18 +1177,31 @@ static inline bool cma_any_addr(const struct sockaddr *addr)
        return cma_zero_addr(addr) || cma_loopback_addr(addr);
 }
 
-static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+static int cma_addr_cmp(const struct sockaddr *src, const struct sockaddr *dst)
 {
        if (src->sa_family != dst->sa_family)
                return -1;
 
        switch (src->sa_family) {
        case AF_INET:
-               return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
-                      ((struct sockaddr_in *) dst)->sin_addr.s_addr;
-       case AF_INET6:
-               return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
-                                    &((struct sockaddr_in6 *) dst)->sin6_addr);
+               return ((struct sockaddr_in *)src)->sin_addr.s_addr !=
+                      ((struct sockaddr_in *)dst)->sin_addr.s_addr;
+       case AF_INET6: {
+               struct sockaddr_in6 *src_addr6 = (struct sockaddr_in6 *)src;
+               struct sockaddr_in6 *dst_addr6 = (struct sockaddr_in6 *)dst;
+               bool link_local;
+
+               if (ipv6_addr_cmp(&src_addr6->sin6_addr,
+                                         &dst_addr6->sin6_addr))
+                       return 1;
+               link_local = ipv6_addr_type(&dst_addr6->sin6_addr) &
+                            IPV6_ADDR_LINKLOCAL;
+               /* Link local must match their scope_ids */
+               return link_local ? (src_addr6->sin6_scope_id !=
+                                    dst_addr6->sin6_scope_id) :
+                                   0;
+       }
+
        default:
                return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
                                   &((struct sockaddr_ib *) dst)->sib_addr);
@@ -1469,6 +1486,7 @@ static struct net_device *
 roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
 {
        const struct ib_gid_attr *sgid_attr = NULL;
+       struct net_device *ndev;
 
        if (ib_event->event == IB_CM_REQ_RECEIVED)
                sgid_attr = ib_event->param.req_rcvd.ppath_sgid_attr;
@@ -1477,8 +1495,15 @@ roce_get_net_dev_by_cm_event(const struct ib_cm_event *ib_event)
 
        if (!sgid_attr)
                return NULL;
-       dev_hold(sgid_attr->ndev);
-       return sgid_attr->ndev;
+
+       rcu_read_lock();
+       ndev = rdma_read_gid_attr_ndev_rcu(sgid_attr);
+       if (IS_ERR(ndev))
+               ndev = NULL;
+       else
+               dev_hold(ndev);
+       rcu_read_unlock();
+       return ndev;
 }
 
 static struct net_device *cma_get_net_dev(const struct ib_cm_event *ib_event,
@@ -3247,7 +3272,7 @@ static int cma_alloc_port(enum rdma_ucm_port_space ps,
                goto err;
 
        bind_list->ps = ps;
-       bind_list->port = (unsigned short)ret;
+       bind_list->port = snum;
        cma_bind_port(bind_list, id_priv);
        return 0;
 err:
@@ -4655,10 +4680,10 @@ static int cma_init_net(struct net *net)
 {
        struct cma_pernet *pernet = cma_pernet(net);
 
-       idr_init(&pernet->tcp_ps);
-       idr_init(&pernet->udp_ps);
-       idr_init(&pernet->ipoib_ps);
-       idr_init(&pernet->ib_ps);
+       xa_init(&pernet->tcp_ps);
+       xa_init(&pernet->udp_ps);
+       xa_init(&pernet->ipoib_ps);
+       xa_init(&pernet->ib_ps);
 
        return 0;
 }
@@ -4667,10 +4692,10 @@ static void cma_exit_net(struct net *net)
 {
        struct cma_pernet *pernet = cma_pernet(net);
 
-       idr_destroy(&pernet->tcp_ps);
-       idr_destroy(&pernet->udp_ps);
-       idr_destroy(&pernet->ipoib_ps);
-       idr_destroy(&pernet->ib_ps);
+       WARN_ON(!xa_empty(&pernet->tcp_ps));
+       WARN_ON(!xa_empty(&pernet->udp_ps));
+       WARN_ON(!xa_empty(&pernet->ipoib_ps));
+       WARN_ON(!xa_empty(&pernet->ib_ps));
 }
 
 static struct pernet_operations cma_pernet_operations = {
index 08c6902..ff40a45 100644 (file)
@@ -55,6 +55,7 @@ struct pkey_index_qp_list {
 };
 
 extern const struct attribute_group ib_dev_attr_group;
+extern bool ib_devices_shared_netns;
 
 int ib_device_register_sysfs(struct ib_device *device);
 void ib_device_unregister_sysfs(struct ib_device *device);
@@ -279,7 +280,8 @@ static inline void ib_mad_agent_security_change(void)
 }
 #endif
 
-struct ib_device *ib_device_get_by_index(u32 ifindex);
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index);
+
 /* RDMA device netlink */
 void nldev_init(void);
 void nldev_exit(void);
@@ -302,6 +304,7 @@ static inline struct ib_qp *_ib_create_qp(struct ib_device *dev,
        qp->device = dev;
        qp->pd = pd;
        qp->uobject = uobj;
+       qp->real_qp = qp;
        /*
         * We don't track XRC QPs for now, because they don't have PD
         * and more importantly they are created internaly by driver,
@@ -336,4 +339,17 @@ int roce_resolve_route_from_path(struct sa_path_rec *rec,
                                 const struct ib_gid_attr *attr);
 
 struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
+void ib_free_port_attrs(struct ib_core_device *coredev);
+int ib_setup_port_attrs(struct ib_core_device *coredev);
+
+int rdma_compatdev_set(u8 enable);
+
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+                                struct kobject *kobj, struct kobj_type *ktype,
+                                const char *name);
+void ib_port_unregister_module_stat(struct kobject *kobj);
+
+int ib_device_set_netns_put(struct sk_buff *skb,
+                           struct ib_device *dev, u32 ns_fd);
 #endif /* _CORE_PRIV_H */
index d61e5e1..a4c8199 100644 (file)
@@ -128,15 +128,17 @@ static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private)
  * @comp_vector:       HCA completion vectors for this CQ
  * @poll_ctx:          context to poll the CQ from.
  * @caller:            module owner name.
+ * @udata:             Valid user data or NULL for kernel object
  *
  * This is the proper interface to allocate a CQ for in-kernel users. A
  * CQ allocated with this interface will automatically be polled from the
  * specified context. The ULP must use wr->wr_cqe instead of wr->wr_id
  * to use this CQ abstraction.
  */
-struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
-                           int nr_cqe, int comp_vector,
-                           enum ib_poll_context poll_ctx, const char *caller)
+struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
+                                int nr_cqe, int comp_vector,
+                                enum ib_poll_context poll_ctx,
+                                const char *caller, struct ib_udata *udata)
 {
        struct ib_cq_init_attr cq_attr = {
                .cqe            = nr_cqe,
@@ -145,7 +147,7 @@ struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
        struct ib_cq *cq;
        int ret = -ENOMEM;
 
-       cq = dev->ops.create_cq(dev, &cq_attr, NULL, NULL);
+       cq = dev->ops.create_cq(dev, &cq_attr, NULL);
        if (IS_ERR(cq))
                return cq;
 
@@ -193,16 +195,17 @@ out_free_wc:
        kfree(cq->wc);
        rdma_restrack_del(&cq->res);
 out_destroy_cq:
-       cq->device->ops.destroy_cq(cq);
+       cq->device->ops.destroy_cq(cq, udata);
        return ERR_PTR(ret);
 }
-EXPORT_SYMBOL(__ib_alloc_cq);
+EXPORT_SYMBOL(__ib_alloc_cq_user);
 
 /**
  * ib_free_cq - free a completion queue
  * @cq:                completion queue to free.
+ * @udata:     User data or NULL for kernel object
  */
-void ib_free_cq(struct ib_cq *cq)
+void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
        int ret;
 
@@ -225,7 +228,7 @@ void ib_free_cq(struct ib_cq *cq)
 
        kfree(cq->wc);
        rdma_restrack_del(&cq->res);
-       ret = cq->device->ops.destroy_cq(cq);
+       ret = cq->device->ops.destroy_cq(cq, udata);
        WARN_ON_ONCE(ret);
 }
-EXPORT_SYMBOL(ib_free_cq);
+EXPORT_SYMBOL(ib_free_cq_user);
index 7421ec4..78dc07c 100644 (file)
@@ -38,6 +38,8 @@
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/netdevice.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <linux/security.h>
 #include <linux/notifier.h>
 #include <linux/hashtable.h>
@@ -101,6 +103,54 @@ static DECLARE_RWSEM(clients_rwsem);
  * be registered.
  */
 #define CLIENT_DATA_REGISTERED XA_MARK_1
+
+/**
+ * struct rdma_dev_net - rdma net namespace metadata for a net
+ * @net:       Pointer to owner net namespace
+ * @id:                xarray id to identify the net namespace.
+ */
+struct rdma_dev_net {
+       possible_net_t net;
+       u32 id;
+};
+
+static unsigned int rdma_dev_net_id;
+
+/*
+ * A list of net namespaces is maintained in an xarray. This is necessary
+ * because we can't get the locking right using the existing net ns list. We
+ * would require an init_net callback after the list is updated.
+ */
+static DEFINE_XARRAY_FLAGS(rdma_nets, XA_FLAGS_ALLOC);
+/*
+ * rwsem to protect accessing the rdma_nets xarray entries.
+ */
+static DECLARE_RWSEM(rdma_nets_rwsem);
+
+bool ib_devices_shared_netns = true;
+module_param_named(netns_mode, ib_devices_shared_netns, bool, 0444);
+MODULE_PARM_DESC(netns_mode,
+                "Share device among net namespaces; default=1 (shared)");
+/**
+ * rdma_dev_access_netns() - Return whether a rdma device can be accessed
+ *                          from a specified net namespace or not.
+ * @device:    Pointer to rdma device which needs to be checked
+ * @net:       Pointer to net namespace for which access is to be checked
+ *
+ * rdma_dev_access_netns() - Return whether a rdma device can be accessed
+ *                          from a specified net namespace or not. When
+ *                          rdma device is in shared mode, it ignores the
+ *                          net namespace. When rdma device is exclusive
+ *                          to a net namespace, rdma device net namespace is
+ *                          checked against the specified one.
+ */
+bool rdma_dev_access_netns(const struct ib_device *dev, const struct net *net)
+{
+       return (ib_devices_shared_netns ||
+               net_eq(read_pnet(&dev->coredev.rdma_net), net));
+}
+EXPORT_SYMBOL(rdma_dev_access_netns);
+
 /*
  * xarray has this behavior where it won't iterate over NULL values stored in
  * allocated arrays.  So we need our own iterator to see all values stored in
@@ -147,10 +197,73 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
 static void ib_policy_change_task(struct work_struct *work);
 static DECLARE_WORK(ib_policy_change_work, ib_policy_change_task);
 
+static void __ibdev_printk(const char *level, const struct ib_device *ibdev,
+                          struct va_format *vaf)
+{
+       if (ibdev && ibdev->dev.parent)
+               dev_printk_emit(level[1] - '0',
+                               ibdev->dev.parent,
+                               "%s %s %s: %pV",
+                               dev_driver_string(ibdev->dev.parent),
+                               dev_name(ibdev->dev.parent),
+                               dev_name(&ibdev->dev),
+                               vaf);
+       else if (ibdev)
+               printk("%s%s: %pV",
+                      level, dev_name(&ibdev->dev), vaf);
+       else
+               printk("%s(NULL ib_device): %pV", level, vaf);
+}
+
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+                 const char *format, ...)
+{
+       struct va_format vaf;
+       va_list args;
+
+       va_start(args, format);
+
+       vaf.fmt = format;
+       vaf.va = &args;
+
+       __ibdev_printk(level, ibdev, &vaf);
+
+       va_end(args);
+}
+EXPORT_SYMBOL(ibdev_printk);
+
+#define define_ibdev_printk_level(func, level)                  \
+void func(const struct ib_device *ibdev, const char *fmt, ...)  \
+{                                                               \
+       struct va_format vaf;                                   \
+       va_list args;                                           \
+                                                               \
+       va_start(args, fmt);                                    \
+                                                               \
+       vaf.fmt = fmt;                                          \
+       vaf.va = &args;                                         \
+                                                               \
+       __ibdev_printk(level, ibdev, &vaf);                     \
+                                                               \
+       va_end(args);                                           \
+}                                                               \
+EXPORT_SYMBOL(func);
+
+define_ibdev_printk_level(ibdev_emerg, KERN_EMERG);
+define_ibdev_printk_level(ibdev_alert, KERN_ALERT);
+define_ibdev_printk_level(ibdev_crit, KERN_CRIT);
+define_ibdev_printk_level(ibdev_err, KERN_ERR);
+define_ibdev_printk_level(ibdev_warn, KERN_WARNING);
+define_ibdev_printk_level(ibdev_notice, KERN_NOTICE);
+define_ibdev_printk_level(ibdev_info, KERN_INFO);
+
 static struct notifier_block ibdev_lsm_nb = {
        .notifier_call = ib_security_change,
 };
 
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+                                struct net *net);
+
 /* Pointer to the RCU head at the start of the ib_port_data array */
 struct ib_port_data_rcu {
        struct rcu_head rcu_head;
@@ -200,16 +313,22 @@ static int ib_device_check_mandatory(struct ib_device *device)
  * Caller must perform ib_device_put() to return the device reference count
  * when ib_device_get_by_index() returns valid device pointer.
  */
-struct ib_device *ib_device_get_by_index(u32 index)
+struct ib_device *ib_device_get_by_index(const struct net *net, u32 index)
 {
        struct ib_device *device;
 
        down_read(&devices_rwsem);
        device = xa_load(&devices, index);
        if (device) {
+               if (!rdma_dev_access_netns(device, net)) {
+                       device = NULL;
+                       goto out;
+               }
+
                if (!ib_device_try_get(device))
                        device = NULL;
        }
+out:
        up_read(&devices_rwsem);
        return device;
 }
@@ -268,6 +387,26 @@ struct ib_device *ib_device_get_by_name(const char *name,
 }
 EXPORT_SYMBOL(ib_device_get_by_name);
 
+static int rename_compat_devs(struct ib_device *device)
+{
+       struct ib_core_device *cdev;
+       unsigned long index;
+       int ret = 0;
+
+       mutex_lock(&device->compat_devs_mutex);
+       xa_for_each (&device->compat_devs, index, cdev) {
+               ret = device_rename(&cdev->dev, dev_name(&device->dev));
+               if (ret) {
+                       dev_warn(&cdev->dev,
+                                "Fail to rename compatdev to new name %s\n",
+                                dev_name(&device->dev));
+                       break;
+               }
+       }
+       mutex_unlock(&device->compat_devs_mutex);
+       return ret;
+}
+
 int ib_device_rename(struct ib_device *ibdev, const char *name)
 {
        int ret;
@@ -287,6 +426,7 @@ int ib_device_rename(struct ib_device *ibdev, const char *name)
        if (ret)
                goto out;
        strlcpy(ibdev->name, name, IB_DEVICE_NAME_MAX);
+       ret = rename_compat_devs(ibdev);
 out:
        up_write(&devices_rwsem);
        return ret;
@@ -336,6 +476,7 @@ static void ib_device_release(struct device *device)
        WARN_ON(refcount_read(&dev->refcount));
        ib_cache_release_one(dev);
        ib_security_release_port_pkey_list(dev);
+       xa_destroy(&dev->compat_devs);
        xa_destroy(&dev->client_data);
        if (dev->port_data)
                kfree_rcu(container_of(dev->port_data, struct ib_port_data_rcu,
@@ -357,12 +498,42 @@ static int ib_device_uevent(struct device *device,
        return 0;
 }
 
+static const void *net_namespace(struct device *d)
+{
+       struct ib_core_device *coredev =
+                       container_of(d, struct ib_core_device, dev);
+
+       return read_pnet(&coredev->rdma_net);
+}
+
 static struct class ib_class = {
        .name    = "infiniband",
        .dev_release = ib_device_release,
        .dev_uevent = ib_device_uevent,
+       .ns_type = &net_ns_type_operations,
+       .namespace = net_namespace,
 };
 
+static void rdma_init_coredev(struct ib_core_device *coredev,
+                             struct ib_device *dev, struct net *net)
+{
+       /* This BUILD_BUG_ON is intended to catch layout changes
+        * in the union of ib_core_device and device.
+        * dev must be the first element because ib_core and provider
+        * drivers use it. Adding anything in ib_core_device before
+        * device will break this assumption.
+        */
+       BUILD_BUG_ON(offsetof(struct ib_device, coredev.dev) !=
+                    offsetof(struct ib_device, dev));
+
+       coredev->dev.class = &ib_class;
+       coredev->dev.groups = dev->groups;
+       device_initialize(&coredev->dev);
+       coredev->owner = dev;
+       INIT_LIST_HEAD(&coredev->port_list);
+       write_pnet(&coredev->rdma_net, net);
+}
+
 /**
  * _ib_alloc_device - allocate an IB device struct
  * @size:size of structure to allocate
@@ -389,10 +560,8 @@ struct ib_device *_ib_alloc_device(size_t size)
                return NULL;
        }
 
-       device->dev.class = &ib_class;
        device->groups[0] = &ib_dev_attr_group;
-       device->dev.groups = device->groups;
-       device_initialize(&device->dev);
+       rdma_init_coredev(&device->coredev, device, &init_net);
 
        INIT_LIST_HEAD(&device->event_handler_list);
        spin_lock_init(&device->event_handler_lock);
@@ -403,7 +572,8 @@ struct ib_device *_ib_alloc_device(size_t size)
         */
        xa_init_flags(&device->client_data, XA_FLAGS_ALLOC);
        init_rwsem(&device->client_data_rwsem);
-       INIT_LIST_HEAD(&device->port_list);
+       xa_init_flags(&device->compat_devs, XA_FLAGS_ALLOC);
+       mutex_init(&device->compat_devs_mutex);
        init_completion(&device->unreg_completion);
        INIT_WORK(&device->unregistration_work, ib_unregister_work);
 
@@ -436,6 +606,7 @@ void ib_dealloc_device(struct ib_device *device)
        /* Expedite releasing netdev references */
        free_netdevs(device);
 
+       WARN_ON(!xa_empty(&device->compat_devs));
        WARN_ON(!xa_empty(&device->client_data));
        WARN_ON(refcount_read(&device->refcount));
        rdma_restrack_clean(device);
@@ -644,6 +815,283 @@ static int ib_security_change(struct notifier_block *nb, unsigned long event,
        return NOTIFY_OK;
 }
 
+static void compatdev_release(struct device *dev)
+{
+       struct ib_core_device *cdev =
+               container_of(dev, struct ib_core_device, dev);
+
+       kfree(cdev);
+}
+
+static int add_one_compat_dev(struct ib_device *device,
+                             struct rdma_dev_net *rnet)
+{
+       struct ib_core_device *cdev;
+       int ret;
+
+       lockdep_assert_held(&rdma_nets_rwsem);
+       if (!ib_devices_shared_netns)
+               return 0;
+
+       /*
+        * Create and add compat device in all namespaces other than where it
+        * is currently bound to.
+        */
+       if (net_eq(read_pnet(&rnet->net),
+                  read_pnet(&device->coredev.rdma_net)))
+               return 0;
+
+       /*
+        * The first of init_net() or ib_register_device() to take the
+        * compat_devs_mutex wins and gets to add the device. Others will wait
+        * for completion here.
+        */
+       mutex_lock(&device->compat_devs_mutex);
+       cdev = xa_load(&device->compat_devs, rnet->id);
+       if (cdev) {
+               ret = 0;
+               goto done;
+       }
+       ret = xa_reserve(&device->compat_devs, rnet->id, GFP_KERNEL);
+       if (ret)
+               goto done;
+
+       cdev = kzalloc(sizeof(*cdev), GFP_KERNEL);
+       if (!cdev) {
+               ret = -ENOMEM;
+               goto cdev_err;
+       }
+
+       cdev->dev.parent = device->dev.parent;
+       rdma_init_coredev(cdev, device, read_pnet(&rnet->net));
+       cdev->dev.release = compatdev_release;
+       dev_set_name(&cdev->dev, "%s", dev_name(&device->dev));
+
+       ret = device_add(&cdev->dev);
+       if (ret)
+               goto add_err;
+       ret = ib_setup_port_attrs(cdev);
+       if (ret)
+               goto port_err;
+
+       ret = xa_err(xa_store(&device->compat_devs, rnet->id,
+                             cdev, GFP_KERNEL));
+       if (ret)
+               goto insert_err;
+
+       mutex_unlock(&device->compat_devs_mutex);
+       return 0;
+
+insert_err:
+       ib_free_port_attrs(cdev);
+port_err:
+       device_del(&cdev->dev);
+add_err:
+       put_device(&cdev->dev);
+cdev_err:
+       xa_release(&device->compat_devs, rnet->id);
+done:
+       mutex_unlock(&device->compat_devs_mutex);
+       return ret;
+}
+
+static void remove_one_compat_dev(struct ib_device *device, u32 id)
+{
+       struct ib_core_device *cdev;
+
+       mutex_lock(&device->compat_devs_mutex);
+       cdev = xa_erase(&device->compat_devs, id);
+       mutex_unlock(&device->compat_devs_mutex);
+       if (cdev) {
+               ib_free_port_attrs(cdev);
+               device_del(&cdev->dev);
+               put_device(&cdev->dev);
+       }
+}
+
+static void remove_compat_devs(struct ib_device *device)
+{
+       struct ib_core_device *cdev;
+       unsigned long index;
+
+       xa_for_each (&device->compat_devs, index, cdev)
+               remove_one_compat_dev(device, index);
+}
+
+static int add_compat_devs(struct ib_device *device)
+{
+       struct rdma_dev_net *rnet;
+       unsigned long index;
+       int ret = 0;
+
+       lockdep_assert_held(&devices_rwsem);
+
+       down_read(&rdma_nets_rwsem);
+       xa_for_each (&rdma_nets, index, rnet) {
+               ret = add_one_compat_dev(device, rnet);
+               if (ret)
+                       break;
+       }
+       up_read(&rdma_nets_rwsem);
+       return ret;
+}
+
+static void remove_all_compat_devs(void)
+{
+       struct ib_compat_device *cdev;
+       struct ib_device *dev;
+       unsigned long index;
+
+       down_read(&devices_rwsem);
+       xa_for_each (&devices, index, dev) {
+               unsigned long c_index = 0;
+
+               /* Hold rdma_nets_rwsem so that any other thread modifying this
+                * system param can sync with this thread.
+                */
+               down_read(&rdma_nets_rwsem);
+               xa_for_each (&dev->compat_devs, c_index, cdev)
+                       remove_one_compat_dev(dev, c_index);
+               up_read(&rdma_nets_rwsem);
+       }
+       up_read(&devices_rwsem);
+}
+
+static int add_all_compat_devs(void)
+{
+       struct rdma_dev_net *rnet;
+       struct ib_device *dev;
+       unsigned long index;
+       int ret = 0;
+
+       down_read(&devices_rwsem);
+       xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+               unsigned long net_index = 0;
+
+               /* Hold rdma_nets_rwsem so that any other thread modifying this
+                * system param can sync with this thread.
+                */
+               down_read(&rdma_nets_rwsem);
+               xa_for_each (&rdma_nets, net_index, rnet) {
+                       ret = add_one_compat_dev(dev, rnet);
+                       if (ret)
+                               break;
+               }
+               up_read(&rdma_nets_rwsem);
+       }
+       up_read(&devices_rwsem);
+       if (ret)
+               remove_all_compat_devs();
+       return ret;
+}
+
+int rdma_compatdev_set(u8 enable)
+{
+       struct rdma_dev_net *rnet;
+       unsigned long index;
+       int ret = 0;
+
+       down_write(&rdma_nets_rwsem);
+       if (ib_devices_shared_netns == enable) {
+               up_write(&rdma_nets_rwsem);
+               return 0;
+       }
+
+       /* Enabling/disabling compat devices is not supported
+        * when net namespaces other than the default init_net exist.
+        */
+       xa_for_each (&rdma_nets, index, rnet) {
+               ret++;
+               break;
+       }
+       if (!ret)
+               ib_devices_shared_netns = enable;
+       up_write(&rdma_nets_rwsem);
+       if (ret)
+               return -EBUSY;
+
+       if (enable)
+               ret = add_all_compat_devs();
+       else
+               remove_all_compat_devs();
+       return ret;
+}
+
+static void rdma_dev_exit_net(struct net *net)
+{
+       struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+       struct ib_device *dev;
+       unsigned long index;
+       int ret;
+
+       down_write(&rdma_nets_rwsem);
+       /*
+        * Prevent the ID from being re-used and hide the id from xa_for_each.
+        */
+       ret = xa_err(xa_store(&rdma_nets, rnet->id, NULL, GFP_KERNEL));
+       WARN_ON(ret);
+       up_write(&rdma_nets_rwsem);
+
+       down_read(&devices_rwsem);
+       xa_for_each (&devices, index, dev) {
+               get_device(&dev->dev);
+               /*
+                * Release the devices_rwsem so that a potentially blocking
+                * device_del doesn't hold the devices_rwsem for too long.
+                */
+               up_read(&devices_rwsem);
+
+               remove_one_compat_dev(dev, rnet->id);
+
+               /*
+                * If the real device is in the NS then move it back to init.
+                */
+               rdma_dev_change_netns(dev, net, &init_net);
+
+               put_device(&dev->dev);
+               down_read(&devices_rwsem);
+       }
+       up_read(&devices_rwsem);
+
+       xa_erase(&rdma_nets, rnet->id);
+}
+
+static __net_init int rdma_dev_init_net(struct net *net)
+{
+       struct rdma_dev_net *rnet = net_generic(net, rdma_dev_net_id);
+       unsigned long index;
+       struct ib_device *dev;
+       int ret;
+
+       /* No need to create any compat devices in default init_net. */
+       if (net_eq(net, &init_net))
+               return 0;
+
+       write_pnet(&rnet->net, net);
+
+       ret = xa_alloc(&rdma_nets, &rnet->id, rnet, xa_limit_32b, GFP_KERNEL);
+       if (ret)
+               return ret;
+
+       down_read(&devices_rwsem);
+       xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+               /* Hold rdma_nets_rwsem so that netlink command cannot change
+                * system configuration for device sharing mode.
+                */
+               down_read(&rdma_nets_rwsem);
+               ret = add_one_compat_dev(dev, rnet);
+               up_read(&rdma_nets_rwsem);
+               if (ret)
+                       break;
+       }
+       up_read(&devices_rwsem);
+
+       if (ret)
+               rdma_dev_exit_net(net);
+
+       return ret;
+}
+
 /*
  * Assign the unique string device name and the unique device index. This is
  * undone by ib_dealloc_device.
@@ -711,6 +1159,9 @@ static void setup_dma_device(struct ib_device *device)
                WARN_ON_ONCE(!parent);
                device->dma_device = parent;
        }
+       /* Setup default max segment size for all IB devices */
+       dma_set_max_seg_size(device->dma_device, SZ_2G);
+
 }
 
 /*
@@ -765,8 +1216,12 @@ static void disable_device(struct ib_device *device)
        ib_device_put(device);
        wait_for_completion(&device->unreg_completion);
 
-       /* Expedite removing unregistered pointers from the hash table */
-       free_netdevs(device);
+       /*
+        * compat devices must be removed after device refcount drops to zero.
+        * Otherwise init_net() may add more compatdevs after removing compat
+        * devices and before device is disabled.
+        */
+       remove_compat_devs(device);
 }
 
 /*
@@ -807,7 +1262,8 @@ static int enable_device_and_get(struct ib_device *device)
                        break;
        }
        up_read(&clients_rwsem);
-
+       if (!ret)
+               ret = add_compat_devs(device);
 out:
        up_read(&devices_rwsem);
        return ret;
@@ -847,6 +1303,11 @@ int ib_register_device(struct ib_device *device, const char *name)
 
        ib_device_register_rdmacg(device);
 
+       /*
+        * Ensure that ADD uevent is not fired because it
+        * is too early and the device is not initialized yet.
+        */
+       dev_set_uevent_suppress(&device->dev, true);
        ret = device_add(&device->dev);
        if (ret)
                goto cg_cleanup;
@@ -859,6 +1320,9 @@ int ib_register_device(struct ib_device *device, const char *name)
        }
 
        ret = enable_device_and_get(device);
+       dev_set_uevent_suppress(&device->dev, false);
+       /* Mark for userspace that device is ready */
+       kobject_uevent(&device->dev.kobj, KOBJ_ADD);
        if (ret) {
                void (*dealloc_fn)(struct ib_device *);
 
@@ -887,6 +1351,7 @@ int ib_register_device(struct ib_device *device, const char *name)
 dev_cleanup:
        device_del(&device->dev);
 cg_cleanup:
+       dev_set_uevent_suppress(&device->dev, false);
        ib_device_unregister_rdmacg(device);
        ib_cache_cleanup_one(device);
        return ret;
@@ -908,6 +1373,10 @@ static void __ib_unregister_device(struct ib_device *ib_dev)
                goto out;
 
        disable_device(ib_dev);
+
+       /* Expedite removing unregistered pointers from the hash table */
+       free_netdevs(ib_dev);
+
        ib_device_unregister_sysfs(ib_dev);
        device_del(&ib_dev->dev);
        ib_device_unregister_rdmacg(ib_dev);
@@ -1038,6 +1507,126 @@ void ib_unregister_device_queued(struct ib_device *ib_dev)
 }
 EXPORT_SYMBOL(ib_unregister_device_queued);
 
+/*
+ * Move device from cur_net into net.
+ *
+ * The caller must pass in a device that has the kref held and the refcount
+ * released. If the device is in cur_net and still registered then it is moved
+ * into net.
+ *
+ * Returns -ENODEV when the device refcount is already zero or the device is
+ * no longer in cur_net. Otherwise returns the device_rename() error if the
+ * rename failed, else the result of enable_device_and_get().
+ */
+static int rdma_dev_change_netns(struct ib_device *device, struct net *cur_net,
+                                struct net *net)
+{
+       int ret2 = -EINVAL;
+       int ret;
+
+       mutex_lock(&device->unregistration_lock);
+
+       /*
+        * If a device is not under ib_device_get() or if the
+        * unregistration_lock is not held, the namespace can be changed, or
+        * it can be unregistered. Check again under the lock.
+        */
+       if (refcount_read(&device->refcount) == 0 ||
+           !net_eq(cur_net, read_pnet(&device->coredev.rdma_net))) {
+               ret = -ENODEV;
+               goto out;
+       }
+
+       kobject_uevent(&device->dev.kobj, KOBJ_REMOVE);
+       disable_device(device);
+
+       /*
+        * At this point no one can be using the device, so it is safe to
+        * change the namespace.
+        */
+       write_pnet(&device->coredev.rdma_net, net);
+
+       down_read(&devices_rwsem);
+       /*
+        * Currently rdma devices are system wide unique. So the device name
+        * is guaranteed free in the new namespace. Publish the new namespace
+        * at the sysfs level.
+        */
+       ret = device_rename(&device->dev, dev_name(&device->dev));
+       up_read(&devices_rwsem);
+       if (ret) {
+               dev_warn(&device->dev,
+                        "%s: Couldn't rename device after namespace change\n",
+                        __func__);
+               /* Try and put things back and re-enable the device */
+               write_pnet(&device->coredev.rdma_net, cur_net);
+       }
+
+       ret2 = enable_device_and_get(device);
+       if (ret2) {
+               /*
+                * This shouldn't really happen, but if it does, let the user
+                * retry at later point. So don't disable the device.
+                */
+               dev_warn(&device->dev,
+                        "%s: Couldn't re-enable device after namespace change\n",
+                        __func__);
+       }
+       kobject_uevent(&device->dev.kobj, KOBJ_ADD);
+
+       ib_device_put(device);
+out:
+       mutex_unlock(&device->unregistration_lock);
+       /* A rename failure takes precedence over a re-enable failure */
+       if (ret)
+               return ret;
+       return ret2;
+}
+
+/*
+ * Move dev into the net namespace referred to by the file descriptor ns_fd.
+ *
+ * The ib_device reference held by the caller is dropped on every path,
+ * success or failure, so the caller must not put it again.
+ *
+ * Returns the get_net_ns_by_fd() error if ns_fd cannot be resolved, -EPERM
+ * if the netlink sender lacks CAP_NET_ADMIN in the target namespace's user
+ * namespace, -EOPNOTSUPP if the device or current mode does not allow the
+ * move, otherwise the result of rdma_dev_change_netns().
+ */
+int ib_device_set_netns_put(struct sk_buff *skb,
+                           struct ib_device *dev, u32 ns_fd)
+{
+       struct net *net;
+       int ret;
+
+       net = get_net_ns_by_fd(ns_fd);
+       if (IS_ERR(net)) {
+               ret = PTR_ERR(net);
+               goto net_err;
+       }
+
+       if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) {
+               ret = -EPERM;
+               goto ns_err;
+       }
+
+       /*
+        * Currently supported only for those providers which support
+        * disassociation and don't do port specific sysfs init. Once a
+        * port_cleanup infrastructure is implemented, this limitation will be
+        * removed. Moving is also refused while ib_devices_shared_netns is
+        * set.
+        */
+       if (!dev->ops.disassociate_ucontext || dev->ops.init_port ||
+           ib_devices_shared_netns) {
+               ret = -EOPNOTSUPP;
+               goto ns_err;
+       }
+
+       /*
+        * rdma_dev_change_netns() requires the kref held and the refcount
+        * released: take the struct device reference before dropping the
+        * ib_device reference.
+        */
+       get_device(&dev->dev);
+       ib_device_put(dev);
+       ret = rdma_dev_change_netns(dev, current->nsproxy->net_ns, net);
+       put_device(&dev->dev);
+
+       put_net(net);
+       return ret;
+
+ns_err:
+       put_net(net);
+net_err:
+       ib_device_put(dev);
+       return ret;
+}
+
+/*
+ * Per net namespace hooks for RDMA devices. Registered from ib_core_init()
+ * with register_pernet_device() and removed again in ib_core_cleanup().
+ */
+static struct pernet_operations rdma_dev_net_ops = {
+       .init = rdma_dev_init_net,
+       .exit = rdma_dev_exit_net,
+       .id = &rdma_dev_net_id,
+       .size = sizeof(struct rdma_dev_net),
+};
+
 static int assign_client_id(struct ib_client *client)
 {
        int ret;
@@ -1515,6 +2104,9 @@ int ib_enum_all_devs(nldev_callback nldev_cb, struct sk_buff *skb,
 
        down_read(&devices_rwsem);
        xa_for_each_marked (&devices, index, dev, DEVICE_REGISTERED) {
+               if (!rdma_dev_access_netns(dev, sock_net(skb->sk)))
+                       continue;
+
                ret = nldev_cb(dev, skb, cb, idx);
                if (ret)
                        break;
@@ -1787,6 +2379,14 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, get_vf_config);
        SET_DEVICE_OP(dev_ops, get_vf_stats);
        SET_DEVICE_OP(dev_ops, init_port);
+       SET_DEVICE_OP(dev_ops, iw_accept);
+       SET_DEVICE_OP(dev_ops, iw_add_ref);
+       SET_DEVICE_OP(dev_ops, iw_connect);
+       SET_DEVICE_OP(dev_ops, iw_create_listen);
+       SET_DEVICE_OP(dev_ops, iw_destroy_listen);
+       SET_DEVICE_OP(dev_ops, iw_get_qp);
+       SET_DEVICE_OP(dev_ops, iw_reject);
+       SET_DEVICE_OP(dev_ops, iw_rem_ref);
        SET_DEVICE_OP(dev_ops, map_mr_sg);
        SET_DEVICE_OP(dev_ops, map_phys_fmr);
        SET_DEVICE_OP(dev_ops, mmap);
@@ -1823,7 +2423,9 @@ void ib_set_device_ops(struct ib_device *dev, const struct ib_device_ops *ops)
        SET_DEVICE_OP(dev_ops, set_vf_link_state);
        SET_DEVICE_OP(dev_ops, unmap_fmr);
 
+       SET_OBJ_SIZE(dev_ops, ib_ah);
        SET_OBJ_SIZE(dev_ops, ib_pd);
+       SET_OBJ_SIZE(dev_ops, ib_srq);
        SET_OBJ_SIZE(dev_ops, ib_ucontext);
 }
 EXPORT_SYMBOL(ib_set_device_ops);
@@ -1903,12 +2505,20 @@ static int __init ib_core_init(void)
                goto err_sa;
        }
 
+       ret = register_pernet_device(&rdma_dev_net_ops);
+       if (ret) {
+               pr_warn("Couldn't init compat dev. ret %d\n", ret);
+               goto err_compat;
+       }
+
        nldev_init();
        rdma_nl_register(RDMA_NL_LS, ibnl_ls_cb_table);
        roce_gid_mgmt_init();
 
        return 0;
 
+err_compat:
+       unregister_lsm_notifier(&ibdev_lsm_nb);
 err_sa:
        ib_sa_cleanup();
 err_mad:
@@ -1933,6 +2543,7 @@ static void __exit ib_core_cleanup(void)
        roce_gid_mgmt_cleanup();
        nldev_exit();
        rdma_nl_unregister(RDMA_NL_LS);
+       unregister_pernet_device(&rdma_dev_net_ops);
        unregister_lsm_notifier(&ibdev_lsm_nb);
        ib_sa_cleanup();
        ib_mad_cleanup();
@@ -1950,5 +2561,8 @@ static void __exit ib_core_cleanup(void)
 
 MODULE_ALIAS_RDMA_NETLINK(RDMA_NL_LS, 4);
 
-subsys_initcall(ib_core_init);
+/* ib core relies on netdev stack to first register net_ns_type_operations
+ * ns kobject type before ib_core initialization.
+ */
+fs_initcall(ib_core_init);
 module_exit(ib_core_cleanup);
index 732637c..72141c5 100644 (file)
@@ -394,7 +394,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
                cm_id_priv->state = IW_CM_STATE_DESTROYING;
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
                /* destroy the listening endpoint */
-               cm_id->device->iwcm->destroy_listen(cm_id);
+               cm_id->device->ops.iw_destroy_listen(cm_id);
                spin_lock_irqsave(&cm_id_priv->lock, flags);
                break;
        case IW_CM_STATE_ESTABLISHED:
@@ -417,7 +417,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
                 */
                cm_id_priv->state = IW_CM_STATE_DESTROYING;
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
-               cm_id->device->iwcm->reject(cm_id, NULL, 0);
+               cm_id->device->ops.iw_reject(cm_id, NULL, 0);
                spin_lock_irqsave(&cm_id_priv->lock, flags);
                break;
        case IW_CM_STATE_CONN_SENT:
@@ -427,7 +427,7 @@ static void destroy_cm_id(struct iw_cm_id *cm_id)
                break;
        }
        if (cm_id_priv->qp) {
-               cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
                cm_id_priv->qp = NULL;
        }
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -504,7 +504,7 @@ static void iw_cm_check_wildcard(struct sockaddr_storage *pm_addr,
 static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
 {
        const char *devname = dev_name(&cm_id->device->dev);
-       const char *ifname = cm_id->device->iwcm->ifname;
+       const char *ifname = cm_id->device->iw_ifname;
        struct iwpm_dev_data pm_reg_msg = {};
        struct iwpm_sa_data pm_msg;
        int status;
@@ -526,7 +526,7 @@ static int iw_cm_map(struct iw_cm_id *cm_id, bool active)
        cm_id->mapped = true;
        pm_msg.loc_addr = cm_id->local_addr;
        pm_msg.rem_addr = cm_id->remote_addr;
-       pm_msg.flags = (cm_id->device->iwcm->driver_flags & IW_F_NO_PORT_MAP) ?
+       pm_msg.flags = (cm_id->device->iw_driver_flags & IW_F_NO_PORT_MAP) ?
                       IWPM_FLAGS_NO_PORT_MAP : 0;
        if (active)
                status = iwpm_add_and_query_mapping(&pm_msg,
@@ -577,7 +577,8 @@ int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
                ret = iw_cm_map(cm_id, false);
                if (!ret)
-                       ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+                       ret = cm_id->device->ops.iw_create_listen(cm_id,
+                                                                 backlog);
                if (ret)
                        cm_id_priv->state = IW_CM_STATE_IDLE;
                spin_lock_irqsave(&cm_id_priv->lock, flags);
@@ -617,7 +618,7 @@ int iw_cm_reject(struct iw_cm_id *cm_id,
        cm_id_priv->state = IW_CM_STATE_IDLE;
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
-       ret = cm_id->device->iwcm->reject(cm_id, private_data,
+       ret = cm_id->device->ops.iw_reject(cm_id, private_data,
                                          private_data_len);
 
        clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
@@ -653,25 +654,25 @@ int iw_cm_accept(struct iw_cm_id *cm_id,
                return -EINVAL;
        }
        /* Get the ib_qp given the QPN */
-       qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+       qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
        if (!qp) {
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
                clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
                wake_up_all(&cm_id_priv->connect_wait);
                return -EINVAL;
        }
-       cm_id->device->iwcm->add_ref(qp);
+       cm_id->device->ops.iw_add_ref(qp);
        cm_id_priv->qp = qp;
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
-       ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+       ret = cm_id->device->ops.iw_accept(cm_id, iw_param);
        if (ret) {
                /* An error on accept precludes provider events */
                BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
                cm_id_priv->state = IW_CM_STATE_IDLE;
                spin_lock_irqsave(&cm_id_priv->lock, flags);
                if (cm_id_priv->qp) {
-                       cm_id->device->iwcm->rem_ref(qp);
+                       cm_id->device->ops.iw_rem_ref(qp);
                        cm_id_priv->qp = NULL;
                }
                spin_unlock_irqrestore(&cm_id_priv->lock, flags);
@@ -712,25 +713,25 @@ int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
        }
 
        /* Get the ib_qp given the QPN */
-       qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+       qp = cm_id->device->ops.iw_get_qp(cm_id->device, iw_param->qpn);
        if (!qp) {
                ret = -EINVAL;
                goto err;
        }
-       cm_id->device->iwcm->add_ref(qp);
+       cm_id->device->ops.iw_add_ref(qp);
        cm_id_priv->qp = qp;
        cm_id_priv->state = IW_CM_STATE_CONN_SENT;
        spin_unlock_irqrestore(&cm_id_priv->lock, flags);
 
        ret = iw_cm_map(cm_id, true);
        if (!ret)
-               ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+               ret = cm_id->device->ops.iw_connect(cm_id, iw_param);
        if (!ret)
                return 0;       /* success */
 
        spin_lock_irqsave(&cm_id_priv->lock, flags);
        if (cm_id_priv->qp) {
-               cm_id->device->iwcm->rem_ref(qp);
+               cm_id->device->ops.iw_rem_ref(qp);
                cm_id_priv->qp = NULL;
        }
        cm_id_priv->state = IW_CM_STATE_IDLE;
@@ -895,7 +896,7 @@ static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
                cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
        } else {
                /* REJECTED or RESET */
-               cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
                cm_id_priv->qp = NULL;
                cm_id_priv->state = IW_CM_STATE_IDLE;
        }
@@ -946,7 +947,7 @@ static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
        spin_lock_irqsave(&cm_id_priv->lock, flags);
 
        if (cm_id_priv->qp) {
-               cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+               cm_id_priv->id.device->ops.iw_rem_ref(cm_id_priv->qp);
                cm_id_priv->qp = NULL;
        }
        switch (cm_id_priv->state) {
index e742a6a..cc99479 100644 (file)
@@ -3,7 +3,7 @@
  * Copyright (c) 2005 Intel Corporation.  All rights reserved.
  * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
  * Copyright (c) 2009 HNR Consulting. All rights reserved.
- * Copyright (c) 2014 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2014,2018 Intel Corporation.  All rights reserved.
  *
  * This software is available to you under a choice of one of two
  * licenses.  You may choose to be licensed under the terms of the GNU
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
 #include <linux/dma-mapping.h>
-#include <linux/idr.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/security.h>
+#include <linux/xarray.h>
 #include <rdma/ib_cache.h>
 
 #include "mad_priv.h"
 #include "opa_smi.h"
 #include "agent.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_mad.h>
+
+#ifdef CONFIG_TRACEPOINTS
+/*
+ * Fill the address fields of a MAD send trace entry (sl, pkey, rqpn, rqkey
+ * and dlid) from the send work request and its address handle.
+ *
+ * Lookup failures are not reported to the caller. The AH attributes start
+ * zeroed, and pkey is forced to 0 when ib_query_pkey() fails so that
+ * uninitialized stack data is never copied into the trace entry.
+ */
+static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
+                         struct ib_mad_qp_info *qp_info,
+                         struct trace_event_raw_ib_mad_send_template *entry)
+{
+       u16 pkey;
+       struct ib_device *dev = qp_info->port_priv->device;
+       u8 pnum = qp_info->port_priv->port_num;
+       struct ib_ud_wr *wr = &mad_send_wr->send_wr;
+       struct rdma_ah_attr attr = {};
+
+       rdma_query_ah(wr->ah, &attr);
+
+       /* These are common */
+       entry->sl = attr.sl;
+       /* Do not record an indeterminate value if the lookup fails */
+       if (ib_query_pkey(dev, pnum, wr->pkey_index, &pkey))
+               pkey = 0;
+       entry->pkey = pkey;
+       entry->rqpn = wr->remote_qpn;
+       entry->rqkey = wr->remote_qkey;
+       entry->dlid = rdma_ah_get_dlid(&attr);
+}
+#endif
+
 static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
 static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
 
@@ -59,12 +85,9 @@ MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests
 module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
 MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
 
-/*
- * The mlx4 driver uses the top byte to distinguish which virtual function
- * generated the MAD, so we must avoid using it.
- */
-#define AGENT_ID_LIMIT         (1 << 24)
-static DEFINE_IDR(ib_mad_clients);
+/* Client ID 0 is used for snoop-only clients */
+static DEFINE_XARRAY_ALLOC1(ib_mad_clients);
+static u32 ib_mad_client_next;
 static struct list_head ib_mad_port_list;
 
 /* Port list lock */
@@ -389,18 +412,17 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
                goto error4;
        }
 
-       idr_preload(GFP_KERNEL);
-       idr_lock(&ib_mad_clients);
-       ret2 = idr_alloc_cyclic(&ib_mad_clients, mad_agent_priv, 0,
-                       AGENT_ID_LIMIT, GFP_ATOMIC);
-       idr_unlock(&ib_mad_clients);
-       idr_preload_end();
-
+       /*
+        * The mlx4 driver uses the top byte to distinguish which virtual
+        * function generated the MAD, so we must avoid using it.
+        */
+       ret2 = xa_alloc_cyclic(&ib_mad_clients, &mad_agent_priv->agent.hi_tid,
+                       mad_agent_priv, XA_LIMIT(0, (1 << 24) - 1),
+                       &ib_mad_client_next, GFP_KERNEL);
        if (ret2 < 0) {
                ret = ERR_PTR(ret2);
                goto error5;
        }
-       mad_agent_priv->agent.hi_tid = ret2;
 
        /*
         * Make sure MAD registration (if supplied)
@@ -445,12 +467,11 @@ struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
        }
        spin_unlock_irq(&port_priv->reg_lock);
 
+       trace_ib_mad_create_agent(mad_agent_priv);
        return &mad_agent_priv->agent;
 error6:
        spin_unlock_irq(&port_priv->reg_lock);
-       idr_lock(&ib_mad_clients);
-       idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
-       idr_unlock(&ib_mad_clients);
+       xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
 error5:
        ib_mad_agent_security_cleanup(&mad_agent_priv->agent);
 error4:
@@ -602,6 +623,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
        struct ib_mad_port_private *port_priv;
 
        /* Note that we could still be handling received MADs */
+       trace_ib_mad_unregister_agent(mad_agent_priv);
 
        /*
         * Canceling all sends results in dropping received response
@@ -614,9 +636,7 @@ static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
        spin_lock_irq(&port_priv->reg_lock);
        remove_mad_reg_req(mad_agent_priv);
        spin_unlock_irq(&port_priv->reg_lock);
-       idr_lock(&ib_mad_clients);
-       idr_remove(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
-       idr_unlock(&ib_mad_clients);
+       xa_erase(&ib_mad_clients, mad_agent_priv->agent.hi_tid);
 
        flush_workqueue(port_priv->wq);
        ib_cancel_rmpp_recvs(mad_agent_priv);
@@ -821,6 +841,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
        if (opa && smp->class_version == OPA_SM_CLASS_VERSION) {
                u32 opa_drslid;
 
+               trace_ib_mad_handle_out_opa_smi(opa_smp);
+
                if ((opa_get_smp_direction(opa_smp)
                     ? opa_smp->route.dr.dr_dlid : opa_smp->route.dr.dr_slid) ==
                     OPA_LID_PERMISSIVE &&
@@ -846,6 +868,8 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
                    opa_smi_check_local_returning_smp(opa_smp, device) == IB_SMI_DISCARD)
                        goto out;
        } else {
+               trace_ib_mad_handle_out_ib_smi(smp);
+
                if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
                     IB_LID_PERMISSIVE &&
                     smi_handle_dr_smp_send(smp, rdma_cap_ib_switch(device), port_num) ==
@@ -1223,6 +1247,7 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
 
        spin_lock_irqsave(&qp_info->send_queue.lock, flags);
        if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+               trace_ib_mad_ib_send_mad(mad_send_wr, qp_info);
                ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr.wr,
                                   NULL);
                list = &qp_info->send_queue.list;
@@ -1756,7 +1781,7 @@ find_mad_agent(struct ib_mad_port_private *port_priv,
                 */
                hi_tid = be64_to_cpu(mad_hdr->tid) >> 32;
                rcu_read_lock();
-               mad_agent = idr_find(&ib_mad_clients, hi_tid);
+               mad_agent = xa_load(&ib_mad_clients, hi_tid);
                if (mad_agent && !atomic_inc_not_zero(&mad_agent->refcount))
                        mad_agent = NULL;
                rcu_read_unlock();
@@ -2077,6 +2102,8 @@ static enum smi_action handle_ib_smi(const struct ib_mad_port_private *port_priv
        enum smi_forward_action retsmi;
        struct ib_smp *smp = (struct ib_smp *)recv->mad;
 
+       trace_ib_mad_handle_ib_smi(smp);
+
        if (smi_handle_dr_smp_recv(smp,
                                   rdma_cap_ib_switch(port_priv->device),
                                   port_num,
@@ -2162,6 +2189,8 @@ handle_opa_smi(struct ib_mad_port_private *port_priv,
        enum smi_forward_action retsmi;
        struct opa_smp *smp = (struct opa_smp *)recv->mad;
 
+       trace_ib_mad_handle_opa_smi(smp);
+
        if (opa_smi_handle_dr_smp_recv(smp,
                                   rdma_cap_ib_switch(port_priv->device),
                                   port_num,
@@ -2286,6 +2315,9 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
        if (!validate_mad((const struct ib_mad_hdr *)recv->mad, qp_info, opa))
                goto out;
 
+       trace_ib_mad_recv_done_handler(qp_info, wc,
+                                      (struct ib_mad_hdr *)recv->mad);
+
        mad_size = recv->mad_size;
        response = alloc_mad_private(mad_size, GFP_KERNEL);
        if (!response)
@@ -2332,6 +2364,7 @@ static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc)
 
        mad_agent = find_mad_agent(port_priv, (const struct ib_mad_hdr *)recv->mad);
        if (mad_agent) {
+               trace_ib_mad_recv_done_agent(mad_agent);
                ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
                /*
                 * recv is freed up in error cases in ib_mad_complete_recv
@@ -2496,6 +2529,9 @@ static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc)
        send_queue = mad_list->mad_queue;
        qp_info = send_queue->qp_info;
 
+       trace_ib_mad_send_done_agent(mad_send_wr->mad_agent_priv);
+       trace_ib_mad_send_done_handler(mad_send_wr, wc);
+
 retry:
        ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
                            mad_send_wr->header_mapping,
@@ -2527,6 +2563,7 @@ retry:
        ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
 
        if (queued_send_wr) {
+               trace_ib_mad_send_done_resend(queued_send_wr, qp_info);
                ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr.wr,
                                   NULL);
                if (ret) {
@@ -2574,6 +2611,7 @@ static bool ib_mad_send_error(struct ib_mad_port_private *port_priv,
                if (mad_send_wr->retry) {
                        /* Repost send */
                        mad_send_wr->retry = 0;
+                       trace_ib_mad_error_handler(mad_send_wr, qp_info);
                        ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr,
                                           NULL);
                        if (!ret)
@@ -3356,9 +3394,6 @@ int ib_mad_init(void)
 
        INIT_LIST_HEAD(&ib_mad_port_list);
 
-       /* Client ID 0 is used for snoop-only clients */
-       idr_alloc(&ib_mad_clients, NULL, 0, 0, GFP_KERNEL);
-
        if (ib_register_client(&mad_client)) {
                pr_err("Couldn't register ib_mad client\n");
                return -EINVAL;
index 2165090..956b3a7 100644 (file)
@@ -73,14 +73,14 @@ struct ib_mad_private_header {
        struct ib_mad_recv_wc recv_wc;
        struct ib_wc wc;
        u64 mapping;
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_mad_private {
        struct ib_mad_private_header header;
        size_t mad_size;
        struct ib_grh grh;
        u8 mad[0];
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_rmpp_segment {
        struct list_head list;
index d50ff70..cd338dd 100644 (file)
@@ -804,7 +804,6 @@ static void mcast_event_handler(struct ib_event_handler *handler,
        switch (event->event) {
        case IB_EVENT_PORT_ERR:
        case IB_EVENT_LID_CHANGE:
-       case IB_EVENT_SM_CHANGE:
        case IB_EVENT_CLIENT_REREGISTER:
                mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
                break;
index 8532401..98eadd3 100644 (file)
@@ -116,6 +116,10 @@ static const struct nla_policy nldev_policy[RDMA_NLDEV_ATTR_MAX] = {
        [RDMA_NLDEV_ATTR_RES_CTXN]              = { .type = NLA_U32 },
        [RDMA_NLDEV_ATTR_LINK_TYPE]             = { .type = NLA_NUL_STRING,
                                    .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+       [RDMA_NLDEV_SYS_ATTR_NETNS_MODE]        = { .type = NLA_U8 },
+       [RDMA_NLDEV_ATTR_DEV_PROTOCOL]          = { .type = NLA_NUL_STRING,
+                                   .len = RDMA_NLDEV_ATTR_ENTRY_STRLEN },
+       [RDMA_NLDEV_NET_NS_FD]                  = { .type = NLA_U32 },
 };
 
 static int put_driver_name_print_type(struct sk_buff *msg, const char *name,
@@ -198,6 +202,8 @@ static int fill_nldev_handle(struct sk_buff *msg, struct ib_device *device)
 static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
 {
        char fw[IB_FW_VERSION_NAME_MAX];
+       int ret = 0;
+       u8 port;
 
        if (fill_nldev_handle(msg, device))
                return -EMSGSIZE;
@@ -226,7 +232,25 @@ static int fill_dev_info(struct sk_buff *msg, struct ib_device *device)
                return -EMSGSIZE;
        if (nla_put_u8(msg, RDMA_NLDEV_ATTR_DEV_NODE_TYPE, device->node_type))
                return -EMSGSIZE;
-       return 0;
+
+       /*
+        * The link type is taken from the first port. An mlx4 device can
+        * potentially have two different link types on the same IB device;
+        * that configuration is considered better avoided in the future.
+        */
+       port = rdma_start_port(device);
+       if (rdma_cap_opa_mad(device, port))
+               ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "opa");
+       else if (rdma_protocol_ib(device, port))
+               ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "ib");
+       else if (rdma_protocol_iwarp(device, port))
+               ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "iw");
+       else if (rdma_protocol_roce(device, port))
+               ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL, "roce");
+       else if (rdma_protocol_usnic(device, port))
+               ret = nla_put_string(msg, RDMA_NLDEV_ATTR_DEV_PROTOCOL,
+                                    "usnic");
+       return ret;
 }
 
 static int fill_port_info(struct sk_buff *msg,
@@ -615,7 +639,7 @@ static int nldev_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
 
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -659,7 +683,7 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -669,9 +693,20 @@ static int nldev_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                nla_strlcpy(name, tb[RDMA_NLDEV_ATTR_DEV_NAME],
                            IB_DEVICE_NAME_MAX);
                err = ib_device_rename(device, name);
+               goto done;
        }
 
+       if (tb[RDMA_NLDEV_NET_NS_FD]) {
+               u32 ns_fd;
+
+               ns_fd = nla_get_u32(tb[RDMA_NLDEV_NET_NS_FD]);
+               err = ib_device_set_netns_put(skb, device, ns_fd);
+               goto put_done;
+       }
+
+done:
        ib_device_put(device);
+put_done:
        return err;
 }
 
@@ -707,7 +742,7 @@ static int nldev_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
 {
        /*
         * There is no need to take lock, because
-        * we are relying on ib_core's lists_rwsem
+        * we are relying on ib_core's locking.
         */
        return ib_enum_all_devs(_nldev_get_dumpit, skb, cb);
 }
@@ -730,7 +765,7 @@ static int nldev_port_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -784,7 +819,7 @@ static int nldev_port_get_dumpit(struct sk_buff *skb,
                return -EINVAL;
 
        ifindex = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(ifindex);
+       device = ib_device_get_by_index(sock_net(skb->sk), ifindex);
        if (!device)
                return -EINVAL;
 
@@ -839,7 +874,7 @@ static int nldev_res_get_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -887,7 +922,6 @@ static int _nldev_res_get_dumpit(struct ib_device *device,
                nlmsg_cancel(skb, nlh);
                goto out;
        }
-
        nlmsg_end(skb, nlh);
 
        idx++;
@@ -988,7 +1022,7 @@ static int res_get_common_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -1085,7 +1119,7 @@ static int res_get_common_dumpit(struct sk_buff *skb,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -1300,7 +1334,7 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
                return -EINVAL;
 
        index = nla_get_u32(tb[RDMA_NLDEV_ATTR_DEV_INDEX]);
-       device = ib_device_get_by_index(index);
+       device = ib_device_get_by_index(sock_net(skb->sk), index);
        if (!device)
                return -EINVAL;
 
@@ -1313,6 +1347,55 @@ static int nldev_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
        return 0;
 }
 
+static int nldev_get_sys_get_dumpit(struct sk_buff *skb,
+                                   struct netlink_callback *cb)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       struct nlmsghdr *nlh;
+       int err;
+
+       err = nlmsg_parse(cb->nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, NULL);
+       if (err)
+               return err;
+
+       nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+                       RDMA_NL_GET_TYPE(RDMA_NL_NLDEV,
+                                        RDMA_NLDEV_CMD_SYS_GET),
+                       0, 0);
+
+       err = nla_put_u8(skb, RDMA_NLDEV_SYS_ATTR_NETNS_MODE,
+                        (u8)ib_devices_shared_netns);
+       if (err) {
+               nlmsg_cancel(skb, nlh);
+               return err;
+       }
+
+       nlmsg_end(skb, nlh);
+       return skb->len;
+}
+
+static int nldev_set_sys_set_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
+                                 struct netlink_ext_ack *extack)
+{
+       struct nlattr *tb[RDMA_NLDEV_ATTR_MAX];
+       u8 enable;
+       int err;
+
+       err = nlmsg_parse(nlh, 0, tb, RDMA_NLDEV_ATTR_MAX - 1,
+                         nldev_policy, extack);
+       if (err || !tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE])
+               return -EINVAL;
+
+       enable = nla_get_u8(tb[RDMA_NLDEV_SYS_ATTR_NETNS_MODE]);
+       /* Only 0 and 1 are supported */
+       if (enable > 1)
+               return -EINVAL;
+
+       err = rdma_compatdev_set(enable);
+       return err;
+}
+
 static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
        [RDMA_NLDEV_CMD_GET] = {
                .doit = nldev_get_doit,
@@ -1358,6 +1441,13 @@ static const struct rdma_nl_cbs nldev_cb_table[RDMA_NLDEV_NUM_OPS] = {
                .doit = nldev_res_get_pd_doit,
                .dump = nldev_res_get_pd_dumpit,
        },
+       [RDMA_NLDEV_CMD_SYS_GET] = {
+               .dump = nldev_get_sys_get_dumpit,
+       },
+       [RDMA_NLDEV_CMD_SYS_SET] = {
+               .doit = nldev_set_sys_set_doit,
+               .flags = RDMA_NL_ADMIN_PERM,
+       },
 };
 
 void __init nldev_init(void)
index 778375f..ccf4d06 100644 (file)
@@ -125,9 +125,10 @@ static void assert_uverbs_usecnt(struct ib_uobject *uobj,
  * and consumes the kref on the uobj.
  */
 static int uverbs_destroy_uobject(struct ib_uobject *uobj,
-                                 enum rdma_remove_reason reason)
+                                 enum rdma_remove_reason reason,
+                                 struct uverbs_attr_bundle *attrs)
 {
-       struct ib_uverbs_file *ufile = uobj->ufile;
+       struct ib_uverbs_file *ufile = attrs->ufile;
        unsigned long flags;
        int ret;
 
@@ -135,7 +136,8 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,
        assert_uverbs_usecnt(uobj, UVERBS_LOOKUP_WRITE);
 
        if (uobj->object) {
-               ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason);
+               ret = uobj->uapi_object->type_class->destroy_hw(uobj, reason,
+                                                               attrs);
                if (ret) {
                        if (ib_is_destroy_retryable(ret, reason, uobj))
                                return ret;
@@ -196,9 +198,9 @@ static int uverbs_destroy_uobject(struct ib_uobject *uobj,
  * version requires the caller to have already obtained an
  * LOOKUP_DESTROY uobject kref.
  */
-int uobj_destroy(struct ib_uobject *uobj)
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)
 {
-       struct ib_uverbs_file *ufile = uobj->ufile;
+       struct ib_uverbs_file *ufile = attrs->ufile;
        int ret;
 
        down_read(&ufile->hw_destroy_rwsem);
@@ -207,7 +209,7 @@ int uobj_destroy(struct ib_uobject *uobj)
        if (ret)
                goto out_unlock;
 
-       ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY);
+       ret = uverbs_destroy_uobject(uobj, RDMA_REMOVE_DESTROY, attrs);
        if (ret) {
                atomic_set(&uobj->usecnt, 0);
                goto out_unlock;
@@ -224,18 +226,17 @@ out_unlock:
  * uverbs_put_destroy.
  */
 struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
-                                     u32 id,
-                                     const struct uverbs_attr_bundle *attrs)
+                                     u32 id, struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *uobj;
        int ret;
 
        uobj = rdma_lookup_get_uobject(obj, attrs->ufile, id,
-                                      UVERBS_LOOKUP_DESTROY);
+                                      UVERBS_LOOKUP_DESTROY, attrs);
        if (IS_ERR(uobj))
                return uobj;
 
-       ret = uobj_destroy(uobj);
+       ret = uobj_destroy(uobj, attrs);
        if (ret) {
                rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_DESTROY);
                return ERR_PTR(ret);
@@ -249,7 +250,7 @@ struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
  * (negative errno on failure). For use by callers that do not need the uobj.
  */
 int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
-                          const struct uverbs_attr_bundle *attrs)
+                          struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *uobj;
 
@@ -296,25 +297,13 @@ static struct ib_uobject *alloc_uobj(struct ib_uverbs_file *ufile,
 
 static int idr_add_uobj(struct ib_uobject *uobj)
 {
-       int ret;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock(&uobj->ufile->idr_lock);
-
-       /*
-        * We start with allocating an idr pointing to NULL. This represents an
-        * object which isn't initialized yet. We'll replace it later on with
-        * the real object once we commit.
-        */
-       ret = idr_alloc(&uobj->ufile->idr, NULL, 0,
-                       min_t(unsigned long, U32_MAX - 1, INT_MAX), GFP_NOWAIT);
-       if (ret >= 0)
-               uobj->id = ret;
-
-       spin_unlock(&uobj->ufile->idr_lock);
-       idr_preload_end();
-
-       return ret < 0 ? ret : 0;
+       /*
+        * We start with allocating an idr pointing to NULL. This represents an
+        * object which isn't initialized yet. We'll replace it later on with
+        * the real object once we commit.
+        */
+       return xa_alloc(&uobj->ufile->idr, &uobj->id, NULL, xa_limit_32b,
+                       GFP_KERNEL);
 }
 
 /* Returns the ib_uobject or an error. The caller should check for IS_ERR. */
@@ -324,29 +313,20 @@ lookup_get_idr_uobject(const struct uverbs_api_object *obj,
                       enum rdma_lookup_mode mode)
 {
        struct ib_uobject *uobj;
-       unsigned long idrno = id;
 
        if (id < 0 || id > ULONG_MAX)
                return ERR_PTR(-EINVAL);
 
        rcu_read_lock();
-       /* object won't be released as we're protected in rcu */
-       uobj = idr_find(&ufile->idr, idrno);
-       if (!uobj) {
-               uobj = ERR_PTR(-ENOENT);
-               goto free;
-       }
-
        /*
         * The idr_find is guaranteed to return a pointer to something that
         * isn't freed yet, or NULL, as the free after idr_remove goes through
         * kfree_rcu(). However the object may still have been released and
         * kfree() could be called at any time.
         */
-       if (!kref_get_unless_zero(&uobj->ref))
+       uobj = xa_load(&ufile->idr, id);
+       if (!uobj || !kref_get_unless_zero(&uobj->ref))
                uobj = ERR_PTR(-ENOENT);
-
-free:
        rcu_read_unlock();
        return uobj;
 }
@@ -393,12 +373,13 @@ lookup_get_fd_uobject(const struct uverbs_api_object *obj,
 
 struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
                                           struct ib_uverbs_file *ufile, s64 id,
-                                          enum rdma_lookup_mode mode)
+                                          enum rdma_lookup_mode mode,
+                                          struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *uobj;
        int ret;
 
-       if (IS_ERR(obj) && PTR_ERR(obj) == -ENOMSG) {
+       if (obj == ERR_PTR(-ENOMSG)) {
                /* must be UVERBS_IDR_ANY_OBJECT, see uapi_get_object() */
                uobj = lookup_get_idr_uobject(NULL, ufile, id, mode);
                if (IS_ERR(uobj))
@@ -431,6 +412,8 @@ struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
        ret = uverbs_try_lock_object(uobj, mode);
        if (ret)
                goto free;
+       if (attrs)
+               attrs->context = uobj->context;
 
        return uobj;
 free:
@@ -438,38 +421,6 @@ free:
        uverbs_uobject_put(uobj);
        return ERR_PTR(ret);
 }
-struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
-                                 u32 object_id,
-                                 struct uverbs_attr_bundle *attrs)
-{
-       struct ib_uobject *uobj;
-
-       uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
-                                      object_id, UVERBS_LOOKUP_READ);
-       if (IS_ERR(uobj))
-               return uobj;
-
-       attrs->context = uobj->context;
-
-       return uobj;
-}
-
-struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
-                                  u32 object_id,
-                                  struct uverbs_attr_bundle *attrs)
-{
-       struct ib_uobject *uobj;
-
-       uobj = rdma_lookup_get_uobject(uobj_get_type(attrs, type), attrs->ufile,
-                                      object_id, UVERBS_LOOKUP_WRITE);
-
-       if (IS_ERR(uobj))
-               return uobj;
-
-       attrs->context = uobj->context;
-
-       return uobj;
-}
 
 static struct ib_uobject *
 alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
@@ -489,14 +440,12 @@ alloc_begin_idr_uobject(const struct uverbs_api_object *obj,
        ret = ib_rdmacg_try_charge(&uobj->cg_obj, uobj->context->device,
                                   RDMACG_RESOURCE_HCA_OBJECT);
        if (ret)
-               goto idr_remove;
+               goto remove;
 
        return uobj;
 
-idr_remove:
-       spin_lock(&ufile->idr_lock);
-       idr_remove(&ufile->idr, uobj->id);
-       spin_unlock(&ufile->idr_lock);
+remove:
+       xa_erase(&ufile->idr, uobj->id);
 uobj_put:
        uverbs_uobject_put(uobj);
        return ERR_PTR(ret);
@@ -526,7 +475,8 @@ alloc_begin_fd_uobject(const struct uverbs_api_object *obj,
 }
 
 struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
-                                           struct ib_uverbs_file *ufile)
+                                           struct ib_uverbs_file *ufile,
+                                           struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *ret;
 
@@ -546,6 +496,8 @@ struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
                up_read(&ufile->hw_destroy_rwsem);
                return ret;
        }
+       if (attrs)
+               attrs->context = ret->context;
        return ret;
 }
 
@@ -554,18 +506,17 @@ static void alloc_abort_idr_uobject(struct ib_uobject *uobj)
        ib_rdmacg_uncharge(&uobj->cg_obj, uobj->context->device,
                           RDMACG_RESOURCE_HCA_OBJECT);
 
-       spin_lock(&uobj->ufile->idr_lock);
-       idr_remove(&uobj->ufile->idr, uobj->id);
-       spin_unlock(&uobj->ufile->idr_lock);
+       xa_erase(&uobj->ufile->idr, uobj->id);
 }
 
 static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
-                                              enum rdma_remove_reason why)
+                                              enum rdma_remove_reason why,
+                                              struct uverbs_attr_bundle *attrs)
 {
        const struct uverbs_obj_idr_type *idr_type =
                container_of(uobj->uapi_object->type_attrs,
                             struct uverbs_obj_idr_type, type);
-       int ret = idr_type->destroy_object(uobj, why);
+       int ret = idr_type->destroy_object(uobj, why, attrs);
 
        /*
         * We can only fail gracefully if the user requested to destroy the
@@ -586,9 +537,7 @@ static int __must_check destroy_hw_idr_uobject(struct ib_uobject *uobj,
 
 static void remove_handle_idr_uobject(struct ib_uobject *uobj)
 {
-       spin_lock(&uobj->ufile->idr_lock);
-       idr_remove(&uobj->ufile->idr, uobj->id);
-       spin_unlock(&uobj->ufile->idr_lock);
+       xa_erase(&uobj->ufile->idr, uobj->id);
        /* Matches the kref in alloc_commit_idr_uobject */
        uverbs_uobject_put(uobj);
 }
@@ -599,7 +548,8 @@ static void alloc_abort_fd_uobject(struct ib_uobject *uobj)
 }
 
 static int __must_check destroy_hw_fd_uobject(struct ib_uobject *uobj,
-                                             enum rdma_remove_reason why)
+                                             enum rdma_remove_reason why,
+                                             struct uverbs_attr_bundle *attrs)
 {
        const struct uverbs_obj_fd_type *fd_type = container_of(
                uobj->uapi_object->type_attrs, struct uverbs_obj_fd_type, type);
@@ -618,17 +568,17 @@ static void remove_handle_fd_uobject(struct ib_uobject *uobj)
 static int alloc_commit_idr_uobject(struct ib_uobject *uobj)
 {
        struct ib_uverbs_file *ufile = uobj->ufile;
+       void *old;
 
-       spin_lock(&ufile->idr_lock);
        /*
         * We already allocated this IDR with a NULL object, so
         * this shouldn't fail.
         *
-        * NOTE: Once we set the IDR we loose ownership of our kref on uobj.
+        * NOTE: Storing the uobj transfers our kref on uobj to the XArray.
         * It will be put by remove_commit_idr_uobject()
         */
-       WARN_ON(idr_replace(&ufile->idr, uobj, uobj->id));
-       spin_unlock(&ufile->idr_lock);
+       old = xa_store(&ufile->idr, uobj->id, uobj, GFP_KERNEL);
+       WARN_ON(old != NULL);
 
        return 0;
 }
@@ -675,15 +625,16 @@ static int alloc_commit_fd_uobject(struct ib_uobject *uobj)
  * caller can no longer assume uobj is valid. If this function fails it
  * destroys the uboject, including the attached HW object.
  */
-int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
+int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+                                          struct uverbs_attr_bundle *attrs)
 {
-       struct ib_uverbs_file *ufile = uobj->ufile;
+       struct ib_uverbs_file *ufile = attrs->ufile;
        int ret;
 
        /* alloc_commit consumes the uobj kref */
        ret = uobj->uapi_object->type_class->alloc_commit(uobj);
        if (ret) {
-               uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+               uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
                up_read(&ufile->hw_destroy_rwsem);
                return ret;
        }
@@ -707,12 +658,13 @@ int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj)
  * This consumes the kref for uobj. It is up to the caller to unwind the HW
  * object and anything else connected to uobj before calling this.
  */
-void rdma_alloc_abort_uobject(struct ib_uobject *uobj)
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+                             struct uverbs_attr_bundle *attrs)
 {
        struct ib_uverbs_file *ufile = uobj->ufile;
 
        uobj->object = NULL;
-       uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT);
+       uverbs_destroy_uobject(uobj, RDMA_REMOVE_ABORT, attrs);
 
        /* Matches the down_read in rdma_alloc_begin_uobject */
        up_read(&ufile->hw_destroy_rwsem);
@@ -760,29 +712,28 @@ void rdma_lookup_put_uobject(struct ib_uobject *uobj,
 
 void setup_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
-       spin_lock_init(&ufile->idr_lock);
-       idr_init(&ufile->idr);
+       xa_init_flags(&ufile->idr, XA_FLAGS_ALLOC);
 }
 
 void release_ufile_idr_uobject(struct ib_uverbs_file *ufile)
 {
        struct ib_uobject *entry;
-       int id;
+       unsigned long id;
 
        /*
         * At this point uverbs_cleanup_ufile() is guaranteed to have run, and
-        * there are no HW objects left, however the IDR is still populated
+        * there are no HW objects left, however the xarray is still populated
         * with anything that has not been cleaned up by userspace. Since the
         * kref on ufile is 0, nothing is allowed to call lookup_get.
         *
         * This is an optimized equivalent to remove_handle_idr_uobject
         */
-       idr_for_each_entry(&ufile->idr, entry, id) {
+       xa_for_each(&ufile->idr, id, entry) {
                WARN_ON(entry->object);
                uverbs_uobject_put(entry);
        }
 
-       idr_destroy(&ufile->idr);
+       xa_destroy(&ufile->idr);
 }
 
 const struct uverbs_obj_type_class uverbs_idr_class = {
@@ -814,6 +765,10 @@ void uverbs_close_fd(struct file *f)
 {
        struct ib_uobject *uobj = f->private_data;
        struct ib_uverbs_file *ufile = uobj->ufile;
+       struct uverbs_attr_bundle attrs = {
+               .context = uobj->context,
+               .ufile = ufile,
+       };
 
        if (down_read_trylock(&ufile->hw_destroy_rwsem)) {
                /*
@@ -823,7 +778,7 @@ void uverbs_close_fd(struct file *f)
                 * write lock here, or we have a kernel bug.
                 */
                WARN_ON(uverbs_try_lock_object(uobj, UVERBS_LOOKUP_WRITE));
-               uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE);
+               uverbs_destroy_uobject(uobj, RDMA_REMOVE_CLOSE, &attrs);
                up_read(&ufile->hw_destroy_rwsem);
        }
 
@@ -872,6 +827,7 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
 {
        struct ib_uobject *obj, *next_obj;
        int ret = -EINVAL;
+       struct uverbs_attr_bundle attrs = { .ufile = ufile };
 
        /*
         * This shouldn't run while executing other commands on this
@@ -883,12 +839,13 @@ static int __uverbs_cleanup_ufile(struct ib_uverbs_file *ufile,
         * other threads (which might still use the FDs) chance to run.
         */
        list_for_each_entry_safe(obj, next_obj, &ufile->uobjects, list) {
+               attrs.context = obj->context;
                /*
                 * if we hit this WARN_ON, that means we are
                 * racing with a lookup_get.
                 */
                WARN_ON(uverbs_try_lock_object(obj, UVERBS_LOOKUP_WRITE));
-               if (!uverbs_destroy_uobject(obj, reason))
+               if (!uverbs_destroy_uobject(obj, reason, &attrs))
                        ret = 0;
                else
                        atomic_set(&obj->usecnt, 0);
@@ -967,26 +924,25 @@ const struct uverbs_obj_type_class uverbs_fd_class = {
 EXPORT_SYMBOL(uverbs_fd_class);
 
 struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
-                            struct ib_uverbs_file *ufile,
-                            enum uverbs_obj_access access, s64 id)
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+                            s64 id, struct uverbs_attr_bundle *attrs)
 {
        const struct uverbs_api_object *obj =
-               uapi_get_object(ufile->device->uapi, object_id);
+               uapi_get_object(attrs->ufile->device->uapi, object_id);
 
        switch (access) {
        case UVERBS_ACCESS_READ:
-               return rdma_lookup_get_uobject(obj, ufile, id,
-                                              UVERBS_LOOKUP_READ);
+               return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+                                              UVERBS_LOOKUP_READ, attrs);
        case UVERBS_ACCESS_DESTROY:
                /* Actual destruction is done inside uverbs_handle_method */
-               return rdma_lookup_get_uobject(obj, ufile, id,
-                                              UVERBS_LOOKUP_DESTROY);
+               return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+                                              UVERBS_LOOKUP_DESTROY, attrs);
        case UVERBS_ACCESS_WRITE:
-               return rdma_lookup_get_uobject(obj, ufile, id,
-                                              UVERBS_LOOKUP_WRITE);
+               return rdma_lookup_get_uobject(obj, attrs->ufile, id,
+                                              UVERBS_LOOKUP_WRITE, attrs);
        case UVERBS_ACCESS_NEW:
-               return rdma_alloc_begin_uobject(obj, ufile);
+               return rdma_alloc_begin_uobject(obj, attrs->ufile, attrs);
        default:
                WARN_ON(true);
                return ERR_PTR(-EOPNOTSUPP);
@@ -994,8 +950,8 @@ uverbs_get_uobject_from_file(u16 object_id,
 }
 
 int uverbs_finalize_object(struct ib_uobject *uobj,
-                          enum uverbs_obj_access access,
-                          bool commit)
+                          enum uverbs_obj_access access, bool commit,
+                          struct uverbs_attr_bundle *attrs)
 {
        int ret = 0;
 
@@ -1018,9 +974,9 @@ int uverbs_finalize_object(struct ib_uobject *uobj,
                break;
        case UVERBS_ACCESS_NEW:
                if (commit)
-                       ret = rdma_alloc_commit_uobject(uobj);
+                       ret = rdma_alloc_commit_uobject(uobj, attrs);
                else
-                       rdma_alloc_abort_uobject(uobj);
+                       rdma_alloc_abort_uobject(uobj, attrs);
                break;
        default:
                WARN_ON(true);
index 69f8db6..5445323 100644 (file)
@@ -48,7 +48,7 @@ struct ib_uverbs_device;
 void uverbs_destroy_ufile_hw(struct ib_uverbs_file *ufile,
                             enum rdma_remove_reason reason);
 
-int uobj_destroy(struct ib_uobject *uobj);
+int uobj_destroy(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs);
 
 /*
  * uverbs_uobject_get is called in order to increase the reference count on
@@ -83,9 +83,8 @@ void uverbs_close_fd(struct file *f);
  * uverbs_finalize_objects are called.
  */
 struct ib_uobject *
-uverbs_get_uobject_from_file(u16 object_id,
-                            struct ib_uverbs_file *ufile,
-                            enum uverbs_obj_access access, s64 id);
+uverbs_get_uobject_from_file(u16 object_id, enum uverbs_obj_access access,
+                            s64 id, struct uverbs_attr_bundle *attrs);
 
 /*
  * Note that certain finalize stages could return a status:
@@ -103,8 +102,8 @@ uverbs_get_uobject_from_file(u16 object_id,
  * object.
  */
 int uverbs_finalize_object(struct ib_uobject *uobj,
-                          enum uverbs_obj_access access,
-                          bool commit);
+                          enum uverbs_obj_access access, bool commit,
+                          struct uverbs_attr_bundle *attrs);
 
 int uverbs_output_written(const struct uverbs_attr_bundle *bundle, size_t idx);
 
index bb53495..7d8071c 100644 (file)
@@ -40,7 +40,7 @@
 #include <linux/slab.h>
 #include <linux/dma-mapping.h>
 #include <linux/kref.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 #include <uapi/linux/if_ether.h>
 #include <rdma/ib_pack.h>
@@ -183,8 +183,7 @@ static struct ib_client sa_client = {
        .remove = ib_sa_remove_one
 };
 
-static DEFINE_SPINLOCK(idr_lock);
-static DEFINE_IDR(query_idr);
+static DEFINE_XARRAY_FLAGS(queries, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 
 static DEFINE_SPINLOCK(tid_lock);
 static u32 tid;
@@ -1180,14 +1179,14 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query)
        struct ib_mad_agent *agent;
        struct ib_mad_send_buf *mad_buf;
 
-       spin_lock_irqsave(&idr_lock, flags);
-       if (idr_find(&query_idr, id) != query) {
-               spin_unlock_irqrestore(&idr_lock, flags);
+       xa_lock_irqsave(&queries, flags);
+       if (xa_load(&queries, id) != query) {
+               xa_unlock_irqrestore(&queries, flags);
                return;
        }
        agent = query->port->agent;
        mad_buf = query->mad_buf;
-       spin_unlock_irqrestore(&idr_lock, flags);
+       xa_unlock_irqrestore(&queries, flags);
 
        /*
         * If the query is still on the netlink request list, schedule
@@ -1363,21 +1362,14 @@ static void init_mad(struct ib_sa_query *query, struct ib_mad_agent *agent)
 static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
                    gfp_t gfp_mask)
 {
-       bool preload = gfpflags_allow_blocking(gfp_mask);
        unsigned long flags;
        int ret, id;
 
-       if (preload)
-               idr_preload(gfp_mask);
-       spin_lock_irqsave(&idr_lock, flags);
-
-       id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
-
-       spin_unlock_irqrestore(&idr_lock, flags);
-       if (preload)
-               idr_preload_end();
-       if (id < 0)
-               return id;
+       xa_lock_irqsave(&queries, flags);
+       ret = __xa_alloc(&queries, &id, query, xa_limit_32b, gfp_mask);
+       xa_unlock_irqrestore(&queries, flags);
+       if (ret < 0)
+               return ret;
 
        query->mad_buf->timeout_ms  = timeout_ms;
        query->mad_buf->context[0] = query;
@@ -1394,9 +1386,9 @@ static int send_mad(struct ib_sa_query *query, unsigned long timeout_ms,
 
        ret = ib_post_send_mad(query->mad_buf, NULL);
        if (ret) {
-               spin_lock_irqsave(&idr_lock, flags);
-               idr_remove(&query_idr, id);
-               spin_unlock_irqrestore(&idr_lock, flags);
+               xa_lock_irqsave(&queries, flags);
+               __xa_erase(&queries, id);
+               xa_unlock_irqrestore(&queries, flags);
        }
 
        /*
@@ -2188,9 +2180,9 @@ static void send_handler(struct ib_mad_agent *agent,
                        break;
                }
 
-       spin_lock_irqsave(&idr_lock, flags);
-       idr_remove(&query_idr, query->id);
-       spin_unlock_irqrestore(&idr_lock, flags);
+       xa_lock_irqsave(&queries, flags);
+       __xa_erase(&queries, query->id);
+       xa_unlock_irqrestore(&queries, flags);
 
        free_mad(query);
        if (query->client)
@@ -2475,5 +2467,5 @@ void ib_sa_cleanup(void)
        destroy_workqueue(ib_nl_wq);
        mcast_cleanup();
        ib_unregister_client(&sa_client);
-       idr_destroy(&query_idr);
+       WARN_ON(!xa_empty(&queries));
 }
index 9b6a065..c78d0c9 100644 (file)
@@ -349,10 +349,15 @@ static struct attribute *port_default_attrs[] = {
 
 static size_t print_ndev(const struct ib_gid_attr *gid_attr, char *buf)
 {
-       if (!gid_attr->ndev)
-               return -EINVAL;
-
-       return sprintf(buf, "%s\n", gid_attr->ndev->name);
+       struct net_device *ndev;
+       size_t ret = -EINVAL;
+
+       rcu_read_lock();
+       ndev = rcu_dereference(gid_attr->ndev);
+       if (ndev)
+               ret = sprintf(buf, "%s\n", ndev->name);
+       rcu_read_unlock();
+       return ret;
 }
 
 static size_t print_gid_type(const struct ib_gid_attr *gid_attr, char *buf)
@@ -1015,8 +1020,10 @@ err_free_stats:
        return;
 }
 
-static int add_port(struct ib_device *device, int port_num)
+static int add_port(struct ib_core_device *coredev, int port_num)
 {
+       struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
+       bool is_full_dev = &device->coredev == coredev;
        struct ib_port *p;
        struct ib_port_attr attr;
        int i;
@@ -1034,7 +1041,7 @@ static int add_port(struct ib_device *device, int port_num)
        p->port_num   = port_num;
 
        ret = kobject_init_and_add(&p->kobj, &port_type,
-                                  device->ports_kobj,
+                                  coredev->ports_kobj,
                                   "%d", port_num);
        if (ret) {
                kfree(p);
@@ -1055,7 +1062,7 @@ static int add_port(struct ib_device *device, int port_num)
                goto err_put;
        }
 
-       if (device->ops.process_mad) {
+       if (device->ops.process_mad && is_full_dev) {
                p->pma_table = get_counter_table(device, port_num);
                ret = sysfs_create_group(&p->kobj, p->pma_table);
                if (ret)
@@ -1111,7 +1118,7 @@ static int add_port(struct ib_device *device, int port_num)
        if (ret)
                goto err_free_pkey;
 
-       if (device->ops.init_port) {
+       if (device->ops.init_port && is_full_dev) {
                ret = device->ops.init_port(device, port_num, &p->kobj);
                if (ret)
                        goto err_remove_pkey;
@@ -1122,10 +1129,10 @@ static int add_port(struct ib_device *device, int port_num)
         * port, so holder should be device. Therefore skip per port conunter
         * initialization.
         */
-       if (device->ops.alloc_hw_stats && port_num)
+       if (device->ops.alloc_hw_stats && port_num && is_full_dev)
                setup_hw_stats(device, p, port_num);
 
-       list_add_tail(&p->kobj.entry, &device->port_list);
+       list_add_tail(&p->kobj.entry, &coredev->port_list);
 
        kobject_uevent(&p->kobj, KOBJ_ADD);
        return 0;
@@ -1194,6 +1201,7 @@ static ssize_t node_type_show(struct device *device,
        case RDMA_NODE_RNIC:      return sprintf(buf, "%d: RNIC\n", dev->node_type);
        case RDMA_NODE_USNIC:     return sprintf(buf, "%d: usNIC\n", dev->node_type);
        case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+       case RDMA_NODE_UNSPECIFIED: return sprintf(buf, "%d: unspecified\n", dev->node_type);
        case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
        case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
        default:                  return sprintf(buf, "%d: <unknown>\n", dev->node_type);
@@ -1279,11 +1287,11 @@ const struct attribute_group ib_dev_attr_group = {
        .attrs = ib_dev_attrs,
 };
 
-static void ib_free_port_attrs(struct ib_device *device)
+void ib_free_port_attrs(struct ib_core_device *coredev)
 {
        struct kobject *p, *t;
 
-       list_for_each_entry_safe(p, t, &device->port_list, entry) {
+       list_for_each_entry_safe(p, t, &coredev->port_list, entry) {
                struct ib_port *port = container_of(p, struct ib_port, kobj);
 
                list_del(&p->entry);
@@ -1303,20 +1311,22 @@ static void ib_free_port_attrs(struct ib_device *device)
                kobject_put(p);
        }
 
-       kobject_put(device->ports_kobj);
+       kobject_put(coredev->ports_kobj);
 }
 
-static int ib_setup_port_attrs(struct ib_device *device)
+int ib_setup_port_attrs(struct ib_core_device *coredev)
 {
+       struct ib_device *device = rdma_device_to_ibdev(&coredev->dev);
        unsigned int port;
        int ret;
 
-       device->ports_kobj = kobject_create_and_add("ports", &device->dev.kobj);
-       if (!device->ports_kobj)
+       coredev->ports_kobj = kobject_create_and_add("ports",
+                                                    &coredev->dev.kobj);
+       if (!coredev->ports_kobj)
                return -ENOMEM;
 
        rdma_for_each_port (device, port) {
-               ret = add_port(device, port);
+               ret = add_port(coredev, port);
                if (ret)
                        goto err_put;
        }
@@ -1324,7 +1334,7 @@ static int ib_setup_port_attrs(struct ib_device *device)
        return 0;
 
 err_put:
-       ib_free_port_attrs(device);
+       ib_free_port_attrs(coredev);
        return ret;
 }
 
@@ -1332,7 +1342,7 @@ int ib_device_register_sysfs(struct ib_device *device)
 {
        int ret;
 
-       ret = ib_setup_port_attrs(device);
+       ret = ib_setup_port_attrs(&device->coredev);
        if (ret)
                return ret;
 
@@ -1348,5 +1358,48 @@ void ib_device_unregister_sysfs(struct ib_device *device)
                free_hsag(&device->dev.kobj, device->hw_stats_ag);
        kfree(device->hw_stats);
 
-       ib_free_port_attrs(device);
+       ib_free_port_attrs(&device->coredev);
+}
+
+/**
+ * ib_port_register_module_stat - add module counters under relevant port
+ *  of IB device.
+ *
+ * @device: IB device to add counters
+ * @port_num: valid port number
+ * @kobj: pointer to the kobject to initialize
+ * @ktype: pointer to the ktype for this kobject.
+ * @name: the name of the kobject
+ *
+ * Walks the port list of @device and, for the port whose number equals
+ * @port_num, initializes @kobj with @ktype and adds it as a child of that
+ * port's kobject under @name.
+ *
+ * Return: 0 on success, or when no port matches @port_num (in that case
+ * @kobj is left untouched). Otherwise the error from kobject_init_and_add()
+ * is returned; kobject_put() has then already been called on @kobj, as the
+ * kobject API requires after a failed kobject_init_and_add(), so the caller
+ * must not put @kobj again.
+ */
+int ib_port_register_module_stat(struct ib_device *device, u8 port_num,
+                                struct kobject *kobj, struct kobj_type *ktype,
+                                const char *name)
+{
+       struct kobject *p;
+       int ret;
+
+       /* Nothing is unlinked during the walk, the plain iterator is enough */
+       list_for_each_entry(p, &device->coredev.port_list, entry) {
+               struct ib_port *port = container_of(p, struct ib_port, kobj);
+
+               if (port->port_num != port_num)
+                       continue;
+
+               ret = kobject_init_and_add(kobj, ktype, &port->kobj, "%s",
+                                          name);
+               /* Even a failed kobject_init_and_add() must be undone with
+                * kobject_put(), otherwise the name and reference leak.
+                */
+               if (ret)
+                       kobject_put(kobj);
+
+               /* At most one port is expected to match, stop scanning */
+               return ret;
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(ib_port_register_module_stat);
+
+/**
+ * ib_port_unregister_module_stat - release module counters
+ * @kobj: pointer to the kobject to release
+ *
+ * Drops one reference on @kobj through kobject_put(); nothing else is done
+ * here. Presumably @kobj is the kobject that was earlier handed to
+ * ib_port_register_module_stat() - confirm against callers.
+ */
+void ib_port_unregister_module_stat(struct kobject *kobj)
+{
+       kobject_put(kobj);
 }
+EXPORT_SYMBOL(ib_port_unregister_module_stat);
index 65c3230..8e7da2d 100644 (file)
@@ -42,7 +42,7 @@
 #include <linux/file.h>
 #include <linux/mount.h>
 #include <linux/cdev.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
 
@@ -125,23 +125,22 @@ static struct ib_client ucm_client = {
        .remove = ib_ucm_remove_one
 };
 
-static DEFINE_MUTEX(ctx_id_mutex);
-static DEFINE_IDR(ctx_id_table);
+static DEFINE_XARRAY_ALLOC(ctx_id_table);
 static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
 
 static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
 {
        struct ib_ucm_context *ctx;
 
-       mutex_lock(&ctx_id_mutex);
-       ctx = idr_find(&ctx_id_table, id);
+       xa_lock(&ctx_id_table);
+       ctx = xa_load(&ctx_id_table, id);
        if (!ctx)
                ctx = ERR_PTR(-ENOENT);
        else if (ctx->file != file)
                ctx = ERR_PTR(-EINVAL);
        else
                atomic_inc(&ctx->ref);
-       mutex_unlock(&ctx_id_mutex);
+       xa_unlock(&ctx_id_table);
 
        return ctx;
 }
@@ -194,10 +193,7 @@ static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
        ctx->file = file;
        INIT_LIST_HEAD(&ctx->events);
 
-       mutex_lock(&ctx_id_mutex);
-       ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
-       mutex_unlock(&ctx_id_mutex);
-       if (ctx->id < 0)
+       if (xa_alloc(&ctx_id_table, &ctx->id, ctx, xa_limit_32b, GFP_KERNEL))
                goto error;
 
        list_add_tail(&ctx->file_list, &file->ctxs);
@@ -514,9 +510,7 @@ static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
 err2:
        ib_destroy_cm_id(ctx->cm_id);
 err1:
-       mutex_lock(&ctx_id_mutex);
-       idr_remove(&ctx_id_table, ctx->id);
-       mutex_unlock(&ctx_id_mutex);
+       xa_erase(&ctx_id_table, ctx->id);
        kfree(ctx);
        return result;
 }
@@ -536,15 +530,15 @@ static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
        if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
                return -EFAULT;
 
-       mutex_lock(&ctx_id_mutex);
-       ctx = idr_find(&ctx_id_table, cmd.id);
+       xa_lock(&ctx_id_table);
+       ctx = xa_load(&ctx_id_table, cmd.id);
        if (!ctx)
                ctx = ERR_PTR(-ENOENT);
        else if (ctx->file != file)
                ctx = ERR_PTR(-EINVAL);
        else
-               idr_remove(&ctx_id_table, ctx->id);
-       mutex_unlock(&ctx_id_mutex);
+               __xa_erase(&ctx_id_table, ctx->id);
+       xa_unlock(&ctx_id_table);
 
        if (IS_ERR(ctx))
                return PTR_ERR(ctx);
@@ -1189,10 +1183,7 @@ static int ib_ucm_close(struct inode *inode, struct file *filp)
                                 struct ib_ucm_context, file_list);
                mutex_unlock(&file->file_mutex);
 
-               mutex_lock(&ctx_id_mutex);
-               idr_remove(&ctx_id_table, ctx->id);
-               mutex_unlock(&ctx_id_mutex);
-
+               xa_erase(&ctx_id_table, ctx->id);
                ib_destroy_cm_id(ctx->cm_id);
                ib_ucm_cleanup_events(ctx);
                kfree(ctx);
@@ -1352,7 +1343,7 @@ static void __exit ib_ucm_cleanup(void)
        class_remove_file(&cm_class, &class_attr_abi_version.attr);
        unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_NUM_FIXED_MINOR);
        unregister_chrdev_region(dynamic_ucm_dev, IB_UCM_NUM_DYNAMIC_MINOR);
-       idr_destroy(&ctx_id_table);
+       WARN_ON(!xa_empty(&ctx_id_table));
 }
 
 module_init(ib_ucm_init);
index fe55515..0a23048 100644 (file)
 #include <linux/sched/signal.h>
 #include <linux/sched/mm.h>
 #include <linux/export.h>
-#include <linux/hugetlb.h>
 #include <linux/slab.h>
+#include <linux/pagemap.h>
 #include <rdma/ib_umem_odp.h>
 
 #include "uverbs.h"
 
-
 static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
 {
-       struct scatterlist *sg;
+       struct sg_page_iter sg_iter;
        struct page *page;
-       int i;
 
        if (umem->nmap > 0)
-               ib_dma_unmap_sg(dev, umem->sg_head.sgl,
-                               umem->npages,
+               ib_dma_unmap_sg(dev, umem->sg_head.sgl, umem->sg_nents,
                                DMA_BIDIRECTIONAL);
 
-       for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
-
-               page = sg_page(sg);
+       for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->sg_nents, 0) {
+               page = sg_page_iter_page(&sg_iter);
                if (!PageDirty(page) && umem->writable && dirty)
                        set_page_dirty_lock(page);
                put_page(page);
@@ -66,6 +62,124 @@ static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int d
        sg_free_table(&umem->sg_head);
 }
 
+/* ib_umem_add_sg_table - Add N contiguous pages to scatter table
+ *
+ * sg: current scatterlist entry
+ * page_list: array of npages struct page pointers
+ * npages: number of pages in page_list
+ * max_seg_sz: maximum segment size in bytes
+ * nents: [in/out] running count of scatterlist entries; incremented once for
+ *        every entry this call newly fills (an entry that is only extended
+ *        in place is not counted again)
+ *
+ * Runs of pages with consecutive pfns are folded into a single entry, each
+ * run being limited to max_seg_sz >> PAGE_SHIFT pages. An sg whose page is
+ * not set yet is treated as the still-empty first entry of the table and is
+ * filled without advancing to the next entry.
+ *
+ * NOTE(review): the run-length loop makes no progress when
+ * max_seg_sz < PAGE_SIZE (the page limit becomes 0) - confirm that callers
+ * always pass at least PAGE_SIZE.
+ *
+ * Return new end of scatterlist
+ */
+static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
+                                               struct page **page_list,
+                                               unsigned long npages,
+                                               unsigned int max_seg_sz,
+                                               int *nents)
+{
+       unsigned long first_pfn;
+       unsigned long i = 0;
+       bool update_cur_sg = false;
+       bool first = !sg_page(sg);
+
+       /* Check if new page_list is contiguous with end of previous page_list.
+        * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
+        * Only the first run taken from page_list can be merged this way;
+        * the flag is cleared after the first pass of the loop below.
+        */
+       if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
+                      page_to_pfn(page_list[0])))
+               update_cur_sg = true;
+
+       while (i != npages) {
+               unsigned long len;
+               struct page *first_page = page_list[i];
+
+               first_pfn = page_to_pfn(first_page);
+
+               /* Compute the number of contiguous pages we have starting
+                * at i, stopping early once the run reaches
+                * max_seg_sz >> PAGE_SHIFT pages. i is advanced past the run.
+                */
+               for (len = 0; i != npages &&
+                             first_pfn + len == page_to_pfn(page_list[i]) &&
+                             len < (max_seg_sz >> PAGE_SHIFT);
+                    len++)
+                       i++;
+
+               /* Squash N contiguous pages from page_list into current sge,
+                * but only if the grown entry still fits in max_seg_sz;
+                * otherwise fall through and start a new entry for the run.
+                */
+               if (update_cur_sg) {
+                       if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
+                               sg_set_page(sg, sg_page(sg),
+                                           sg->length + (len << PAGE_SHIFT),
+                                           0);
+                               update_cur_sg = false;
+                               continue;
+                       }
+                       update_cur_sg = false;
+               }
+
+               /* Squash N contiguous pages into next sge or first sge.
+                * The still-empty first entry of the table is used as is;
+                * in every other case step to the next entry first.
+                */
+               if (!first)
+                       sg = sg_next(sg);
+
+               (*nents)++;
+               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
+               first = false;
+       }
+
+       return sg;
+}
+
+/**
+ * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
+ *
+ * @umem: umem struct
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ * @virt: IOVA
+ *
+ * This helper is intended for HW that support multiple page
+ * sizes but can do only a single page size in an MR.
+ *
+ * Returns 0 if the umem requires page sizes not supported by
+ * the driver to be mapped. Drivers always supporting PAGE_SIZE
+ * or smaller will never see a 0 result.
+ *
+ * NOTE(review): in the body below the only path that returns 0 is the
+ * WARN_ON() taken when @pgsz_bitmap has no bit set at or below PAGE_SHIFT;
+ * every other path returns BIT_ULL() of the bit chosen by
+ * rdma_find_pg_bit(). Whether that helper can report "unsupported" is not
+ * visible here - confirm against its definition.
+ */
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+                                    unsigned long pgsz_bitmap,
+                                    unsigned long virt)
+{
+       struct scatterlist *sg;
+       unsigned int best_pg_bit;
+       unsigned long va, pgoff;
+       dma_addr_t mask;
+       int i;
+
+       /* At minimum, drivers must support PAGE_SIZE or smaller, i.e. at
+        * least one of bits 0..PAGE_SHIFT has to be set in pgsz_bitmap.
+        */
+       if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
+               return 0;
+
+       va = virt;
+       /* max page size not to exceed MR length */
+       mask = roundup_pow_of_two(umem->length);
+       /* offset into first SGL */
+       pgoff = umem->address & ~PAGE_MASK;
+
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
+               /* Walk SGL and reduce max page size if VA/PA bits differ
+                * for any address.
+                * va holds the IOVA expected at the start of the current
+                * entry; pgoff is non-zero for the first entry only.
+                */
+               mask |= (sg_dma_address(sg) + pgoff) ^ va;
+               if (i && i != (umem->nmap - 1))
+                       /* restrict by length as well for interior SGEs */
+                       mask |= sg_dma_len(sg);
+               va += sg_dma_len(sg) - pgoff;
+               pgoff = 0;
+       }
+       best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);
+
+       return BIT_ULL(best_pg_bit);
+}
+EXPORT_SYMBOL(ib_umem_find_best_pgsz);
+
 /**
  * ib_umem_get - Pin and DMA map userspace memory.
  *
@@ -84,16 +198,14 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        struct ib_ucontext *context;
        struct ib_umem *umem;
        struct page **page_list;
-       struct vm_area_struct **vma_list;
        unsigned long lock_limit;
        unsigned long new_pinned;
        unsigned long cur_base;
        struct mm_struct *mm;
        unsigned long npages;
        int ret;
-       int i;
        unsigned long dma_attrs = 0;
-       struct scatterlist *sg, *sg_list_start;
+       struct scatterlist *sg;
        unsigned int gup_flags = FOLL_WRITE;
 
        if (!udata)
@@ -138,29 +250,23 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        mmgrab(mm);
 
        if (access & IB_ACCESS_ON_DEMAND) {
+               if (WARN_ON_ONCE(!context->invalidate_range)) {
+                       ret = -EINVAL;
+                       goto umem_kfree;
+               }
+
                ret = ib_umem_odp_get(to_ib_umem_odp(umem), access);
                if (ret)
                        goto umem_kfree;
                return umem;
        }
 
-       /* We assume the memory is from hugetlb until proved otherwise */
-       umem->hugetlb   = 1;
-
        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                ret = -ENOMEM;
                goto umem_kfree;
        }
 
-       /*
-        * if we can't alloc the vma_list, it's not so bad;
-        * just assume the memory is not hugetlb memory
-        */
-       vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
-       if (!vma_list)
-               umem->hugetlb = 0;
-
        npages = ib_umem_num_pages(umem);
        if (npages == 0 || npages > UINT_MAX) {
                ret = -EINVAL;
@@ -185,41 +291,34 @@ struct ib_umem *ib_umem_get(struct ib_udata *udata, unsigned long addr,
        if (!umem->writable)
                gup_flags |= FOLL_FORCE;
 
-       sg_list_start = umem->sg_head.sgl;
+       sg = umem->sg_head.sgl;
 
        while (npages) {
                down_read(&mm->mmap_sem);
                ret = get_user_pages_longterm(cur_base,
                                     min_t(unsigned long, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
-                                    gup_flags, page_list, vma_list);
+                                    gup_flags, page_list, NULL);
                if (ret < 0) {
                        up_read(&mm->mmap_sem);
                        goto umem_release;
                }
 
-               umem->npages += ret;
                cur_base += ret * PAGE_SIZE;
                npages   -= ret;
 
-               /* Continue to hold the mmap_sem as vma_list access
-                * needs to be protected.
-                */
-               for_each_sg(sg_list_start, sg, ret, i) {
-                       if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
-                               umem->hugetlb = 0;
+               sg = ib_umem_add_sg_table(sg, page_list, ret,
+                       dma_get_max_seg_size(context->device->dma_device),
+                       &umem->sg_nents);
 
-                       sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
-               }
                up_read(&mm->mmap_sem);
-
-               /* preparing for next loop */
-               sg_list_start = sg;
        }
 
+       sg_mark_end(sg);
+
        umem->nmap = ib_dma_map_sg_attrs(context->device,
                                  umem->sg_head.sgl,
-                                 umem->npages,
+                                 umem->sg_nents,
                                  DMA_BIDIRECTIONAL,
                                  dma_attrs);
 
@@ -236,8 +335,6 @@ umem_release:
 vma:
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
 out:
-       if (vma_list)
-               free_page((unsigned long) vma_list);
        free_page((unsigned long) page_list);
 umem_kfree:
        if (ret) {
@@ -315,7 +412,7 @@ int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                return -EINVAL;
        }
 
-       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->npages, dst, length,
+       ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->sg_nents, dst, length,
                                 offset + ib_umem_offset(umem));
 
        if (ret < 0)
index e6ec79a..c7226cf 100644 (file)
@@ -241,7 +241,7 @@ static struct ib_ucontext_per_mm *alloc_per_mm(struct ib_ucontext *ctx,
        per_mm->mm = mm;
        per_mm->umem_tree = RB_ROOT_CACHED;
        init_rwsem(&per_mm->umem_rwsem);
-       per_mm->active = ctx->invalidate_range;
+       per_mm->active = true;
 
        rcu_read_lock();
        per_mm->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
@@ -417,9 +417,6 @@ int ib_umem_odp_get(struct ib_umem_odp *umem_odp, int access)
                h = hstate_vma(vma);
                umem->page_shift = huge_page_shift(h);
                up_read(&mm->mmap_sem);
-               umem->hugetlb = 1;
-       } else {
-               umem->hugetlb = 0;
        }
 
        mutex_init(&umem_odp->umem_mutex);
@@ -503,7 +500,6 @@ static int ib_umem_odp_map_dma_single_page(
        struct ib_umem *umem = &umem_odp->umem;
        struct ib_device *dev = umem->context->device;
        dma_addr_t dma_addr;
-       int stored_page = 0;
        int remove_existing_mapping = 0;
        int ret = 0;
 
@@ -527,8 +523,7 @@ static int ib_umem_odp_map_dma_single_page(
                }
                umem_odp->dma_list[page_index] = dma_addr | access_mask;
                umem_odp->page_list[page_index] = page;
-               umem->npages++;
-               stored_page = 1;
+               umem_odp->npages++;
        } else if (umem_odp->page_list[page_index] == page) {
                umem_odp->dma_list[page_index] |= access_mask;
        } else {
@@ -540,11 +535,9 @@ static int ib_umem_odp_map_dma_single_page(
        }
 
 out:
-       /* On Demand Paging - avoid pinning the page */
-       if (umem->context->invalidate_range || !stored_page)
-               put_page(page);
+       put_page(page);
 
-       if (remove_existing_mapping && umem->context->invalidate_range) {
+       if (remove_existing_mapping) {
                ib_umem_notifier_start_account(umem_odp);
                umem->context->invalidate_range(
                        umem_odp,
@@ -754,12 +747,9 @@ void ib_umem_odp_unmap_dma_pages(struct ib_umem_odp *umem_odp, u64 virt,
                                 */
                                set_page_dirty(head_page);
                        }
-                       /* on demand pinning support */
-                       if (!umem->context->invalidate_range)
-                               put_page(page);
                        umem_odp->page_list[idx] = NULL;
                        umem_odp->dma_list[idx] = 0;
-                       umem->npages--;
+                       umem_odp->npages--;
                }
        }
        mutex_unlock(&umem_odp->umem_mutex);
index b58b07c..671f07b 100644 (file)
@@ -129,6 +129,9 @@ struct ib_umad_packet {
        struct ib_user_mad mad;
 };
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/ib_umad.h>
+
 static const dev_t base_umad_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
 static const dev_t base_issm_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE) +
                                   IB_UMAD_NUM_FIXED_MINOR;
@@ -334,6 +337,9 @@ static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
                                return -EFAULT;
                }
        }
+
+       trace_ib_umad_read_recv(file, &packet->mad.hdr, &recv_buf->mad->mad_hdr);
+
        return hdr_size(file) + packet->length;
 }
 
@@ -353,6 +359,9 @@ static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
        if (copy_to_user(buf, packet->mad.data, packet->length))
                return -EFAULT;
 
+       trace_ib_umad_read_send(file, &packet->mad.hdr,
+                               (struct ib_mad_hdr *)&packet->mad.data);
+
        return size;
 }
 
@@ -508,6 +517,9 @@ static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
 
        mutex_lock(&file->mutex);
 
+       trace_ib_umad_write(file, &packet->mad.hdr,
+                           (struct ib_mad_hdr *)&packet->mad.data);
+
        agent = __get_agent(file, packet->mad.hdr.id);
        if (!agent) {
                ret = -EINVAL;
@@ -968,6 +980,11 @@ static int ib_umad_open(struct inode *inode, struct file *filp)
                goto out;
        }
 
+       if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+               ret = -EPERM;
+               goto out;
+       }
+
        file = kzalloc(sizeof(*file), GFP_KERNEL);
        if (!file) {
                ret = -ENOMEM;
@@ -1061,6 +1078,11 @@ static int ib_umad_sm_open(struct inode *inode, struct file *filp)
                }
        }
 
+       if (!rdma_dev_access_netns(port->ib_dev, current->nsproxy->net_ns)) {
+               ret = -EPERM;
+               goto err_up_sem;
+       }
+
        ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
        if (ret)
                goto err_up_sem;
index 32cc8fe..1e5aeb3 100644 (file)
@@ -162,9 +162,7 @@ struct ib_uverbs_file {
        struct list_head umaps;
        struct page *disassociate_page;
 
-       struct idr              idr;
-       /* spinlock protects write access to idr */
-       spinlock_t              idr_lock;
+       struct xarray           idr;
 };
 
 struct ib_uverbs_event {
@@ -241,7 +239,8 @@ void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
 void ib_uverbs_event_handler(struct ib_event_handler *handler,
                             struct ib_event *event);
 int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
-                          enum rdma_remove_reason why);
+                          enum rdma_remove_reason why,
+                          struct uverbs_attr_bundle *attrs);
 
 int uverbs_dealloc_mw(struct ib_mw *mw);
 void ib_uverbs_detach_umcast(struct ib_qp *qp,
index 062a86c..5a3a178 100644 (file)
@@ -162,7 +162,7 @@ static const void __user *uverbs_request_next_ptr(struct uverbs_req_iter *iter,
        const void __user *res = iter->cur;
 
        if (iter->cur + len > iter->end)
-               return ERR_PTR(-ENOSPC);
+               return (void __force __user *)ERR_PTR(-ENOSPC);
        iter->cur += len;
        return res;
 }
@@ -175,7 +175,7 @@ static int uverbs_request_finish(struct uverbs_req_iter *iter)
 }
 
 static struct ib_uverbs_completion_event_file *
-_ib_uverbs_lookup_comp_file(s32 fd, const struct uverbs_attr_bundle *attrs)
+_ib_uverbs_lookup_comp_file(s32 fd, struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *uobj = ufd_get_read(UVERBS_OBJECT_COMP_CHANNEL,
                                               fd, attrs);
@@ -230,6 +230,8 @@ static int ib_uverbs_get_context(struct uverbs_attr_bundle *attrs)
                goto err_alloc;
        }
 
+       attrs->context = ucontext;
+
        ucontext->res.type = RDMA_RESTRACK_CTX;
        ucontext->device = ib_dev;
        ucontext->cg_obj = cg_obj;
@@ -423,7 +425,7 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
        atomic_set(&pd->usecnt, 0);
        pd->res.type = RDMA_RESTRACK_PD;
 
-       ret = ib_dev->ops.alloc_pd(pd, uobj->context, &attrs->driver_udata);
+       ret = ib_dev->ops.alloc_pd(pd, &attrs->driver_udata);
        if (ret)
                goto err_alloc;
 
@@ -436,15 +438,15 @@ static int ib_uverbs_alloc_pd(struct uverbs_attr_bundle *attrs)
        if (ret)
                goto err_copy;
 
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
-       ib_dealloc_pd(pd);
+       ib_dealloc_pd_user(pd, &attrs->driver_udata);
        pd = NULL;
 err_alloc:
        kfree(pd);
 err:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
        return ret;
 }
 
@@ -594,8 +596,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
        }
 
        if (!xrcd) {
-               xrcd = ib_dev->ops.alloc_xrcd(ib_dev, obj->uobject.context,
-                                             &attrs->driver_udata);
+               xrcd = ib_dev->ops.alloc_xrcd(ib_dev, &attrs->driver_udata);
                if (IS_ERR(xrcd)) {
                        ret = PTR_ERR(xrcd);
                        goto err;
@@ -633,7 +634,7 @@ static int ib_uverbs_open_xrcd(struct uverbs_attr_bundle *attrs)
 
        mutex_unlock(&ibudev->xrcd_tree_mutex);
 
-       return uobj_alloc_commit(&obj->uobject);
+       return uobj_alloc_commit(&obj->uobject, attrs);
 
 err_copy:
        if (inode) {
@@ -643,10 +644,10 @@ err_copy:
        }
 
 err_dealloc_xrcd:
-       ib_dealloc_xrcd(xrcd);
+       ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
 
 err:
-       uobj_alloc_abort(&obj->uobject);
+       uobj_alloc_abort(&obj->uobject, attrs);
 
 err_tree_mutex_unlock:
        if (f.file)
@@ -669,19 +670,19 @@ static int ib_uverbs_close_xrcd(struct uverbs_attr_bundle *attrs)
        return uobj_perform_destroy(UVERBS_OBJECT_XRCD, cmd.xrcd_handle, attrs);
 }
 
-int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject,
-                          struct ib_xrcd *xrcd,
-                          enum rdma_remove_reason why)
+int ib_uverbs_dealloc_xrcd(struct ib_uobject *uobject, struct ib_xrcd *xrcd,
+                          enum rdma_remove_reason why,
+                          struct uverbs_attr_bundle *attrs)
 {
        struct inode *inode;
        int ret;
-       struct ib_uverbs_device *dev = uobject->context->ufile->device;
+       struct ib_uverbs_device *dev = attrs->ufile->device;
 
        inode = xrcd->inode;
        if (inode && !atomic_dec_and_test(&xrcd->usecnt))
                return 0;
 
-       ret = ib_dealloc_xrcd(xrcd);
+       ret = ib_dealloc_xrcd(xrcd, &attrs->driver_udata);
 
        if (ib_is_destroy_retryable(ret, why, uobject)) {
                atomic_inc(&xrcd->usecnt);
@@ -763,16 +764,16 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs)
 
        uobj_put_obj_read(pd);
 
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
-       ib_dereg_mr(mr);
+       ib_dereg_mr_user(mr, &attrs->driver_udata);
 
 err_put:
        uobj_put_obj_read(pd);
 
 err_free:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
        return ret;
 }
 
@@ -917,14 +918,14 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs)
                goto err_copy;
 
        uobj_put_obj_read(pd);
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
        uverbs_dealloc_mw(mw);
 err_put:
        uobj_put_obj_read(pd);
 err_free:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
        return ret;
 }
 
@@ -965,11 +966,11 @@ static int ib_uverbs_create_comp_channel(struct uverbs_attr_bundle *attrs)
 
        ret = uverbs_response(attrs, &resp, sizeof(resp));
        if (ret) {
-               uobj_alloc_abort(uobj);
+               uobj_alloc_abort(uobj, attrs);
                return ret;
        }
 
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 }
 
 static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
@@ -1009,8 +1010,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        attr.comp_vector = cmd->comp_vector;
        attr.flags = cmd->flags;
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
-                                  &attrs->driver_udata);
+       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
        if (IS_ERR(cq)) {
                ret = PTR_ERR(cq);
                goto err_file;
@@ -1036,7 +1036,7 @@ static struct ib_ucq_object *create_cq(struct uverbs_attr_bundle *attrs,
        if (ret)
                goto err_cb;
 
-       ret = uobj_alloc_commit(&obj->uobject);
+       ret = uobj_alloc_commit(&obj->uobject, attrs);
        if (ret)
                return ERR_PTR(ret);
        return obj;
@@ -1049,7 +1049,7 @@ err_file:
                ib_uverbs_release_ucq(attrs->ufile, ev_file, obj);
 
 err:
-       uobj_alloc_abort(&obj->uobject);
+       uobj_alloc_abort(&obj->uobject, attrs);
 
        return ERR_PTR(ret);
 }
@@ -1418,7 +1418,6 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
                if (ret)
                        goto err_cb;
 
-               qp->real_qp       = qp;
                qp->pd            = pd;
                qp->send_cq       = attr.send_cq;
                qp->recv_cq       = attr.recv_cq;
@@ -1477,7 +1476,7 @@ static int create_qp(struct uverbs_attr_bundle *attrs,
        if (ind_tbl)
                uobj_put_obj_read(ind_tbl);
 
-       return uobj_alloc_commit(&obj->uevent.uobject);
+       return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 err_cb:
        ib_destroy_qp(qp);
 
@@ -1495,7 +1494,7 @@ err_put:
        if (ind_tbl)
                uobj_put_obj_read(ind_tbl);
 
-       uobj_alloc_abort(&obj->uevent.uobject);
+       uobj_alloc_abort(&obj->uevent.uobject, attrs);
        return ret;
 }
 
@@ -1609,14 +1608,14 @@ static int ib_uverbs_open_qp(struct uverbs_attr_bundle *attrs)
        qp->uobject = &obj->uevent.uobject;
        uobj_put_read(xrcd_uobj);
 
-       return uobj_alloc_commit(&obj->uevent.uobject);
+       return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_destroy:
        ib_destroy_qp(qp);
 err_xrcd:
        uobj_put_read(xrcd_uobj);
 err_put:
-       uobj_alloc_abort(&obj->uevent.uobject);
+       uobj_alloc_abort(&obj->uevent.uobject, attrs);
        return ret;
 }
 
@@ -2451,7 +2450,7 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs)
                goto err_copy;
 
        uobj_put_obj_read(pd);
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
        rdma_destroy_ah(ah, RDMA_DESTROY_AH_SLEEPABLE);
@@ -2460,7 +2459,7 @@ err_put:
        uobj_put_obj_read(pd);
 
 err:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
        return ret;
 }
 
@@ -2962,16 +2961,16 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs)
 
        uobj_put_obj_read(pd);
        uobj_put_obj_read(cq);
-       return uobj_alloc_commit(&obj->uevent.uobject);
+       return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_copy:
-       ib_destroy_wq(wq);
+       ib_destroy_wq(wq, &attrs->driver_udata);
 err_put_cq:
        uobj_put_obj_read(cq);
 err_put_pd:
        uobj_put_obj_read(pd);
 err_uobj:
-       uobj_alloc_abort(&obj->uevent.uobject);
+       uobj_alloc_abort(&obj->uevent.uobject, attrs);
 
        return err;
 }
@@ -3136,12 +3135,12 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs)
        for (j = 0; j < num_read_wqs; j++)
                uobj_put_obj_read(wqs[j]);
 
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 
 err_copy:
        ib_destroy_rwq_ind_table(rwq_ind_tbl);
 err_uobj:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
 put_wqs:
        for (j = 0; j < num_read_wqs; j++)
                uobj_put_obj_read(wqs[j]);
@@ -3314,7 +3313,7 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs)
        kfree(flow_attr);
        if (cmd.flow_attr.num_of_specs)
                kfree(kern_flow_attr);
-       return uobj_alloc_commit(uobj);
+       return uobj_alloc_commit(uobj, attrs);
 err_copy:
        if (!qp->device->ops.destroy_flow(flow_id))
                atomic_dec(&qp->usecnt);
@@ -3325,7 +3324,7 @@ err_free_flow_attr:
 err_put:
        uobj_put_obj_read(qp);
 err_uobj:
-       uobj_alloc_abort(uobj);
+       uobj_alloc_abort(uobj, attrs);
 err_free_attr:
        if (cmd.flow_attr.num_of_specs)
                kfree(kern_flow_attr);
@@ -3411,9 +3410,9 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
        obj->uevent.events_reported = 0;
        INIT_LIST_HEAD(&obj->uevent.event_list);
 
-       srq = pd->device->ops.create_srq(pd, &attr, udata);
-       if (IS_ERR(srq)) {
-               ret = PTR_ERR(srq);
+       srq = rdma_zalloc_drv_obj(ib_dev, ib_srq);
+       if (!srq) {
+               ret = -ENOMEM;
                goto err_put;
        }
 
@@ -3424,6 +3423,10 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
        srq->event_handler = attr.event_handler;
        srq->srq_context   = attr.srq_context;
 
+       ret = pd->device->ops.create_srq(srq, &attr, udata);
+       if (ret)
+               goto err_free;
+
        if (ib_srq_has_cq(cmd->srq_type)) {
                srq->ext.cq       = attr.ext.cq;
                atomic_inc(&attr.ext.cq->usecnt);
@@ -3458,11 +3461,13 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs,
                uobj_put_obj_read(attr.ext.cq);
 
        uobj_put_obj_read(pd);
-       return uobj_alloc_commit(&obj->uevent.uobject);
+       return uobj_alloc_commit(&obj->uevent.uobject, attrs);
 
 err_copy:
-       ib_destroy_srq(srq);
+       ib_destroy_srq_user(srq, &attrs->driver_udata);
 
+err_free:
+       kfree(srq);
 err_put:
        uobj_put_obj_read(pd);
 
@@ -3477,7 +3482,7 @@ err_put_xrcd:
        }
 
 err:
-       uobj_alloc_abort(&obj->uevent.uobject);
+       uobj_alloc_abort(&obj->uevent.uobject, attrs);
        return ret;
 }
 
index e137994..829b0c6 100644 (file)
@@ -207,13 +207,12 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
 
        for (i = 0; i != array_len; i++) {
                attr->uobjects[i] = uverbs_get_uobject_from_file(
-                       spec->u2.objs_arr.obj_type, pbundle->bundle.ufile,
-                       spec->u2.objs_arr.access, idr_vals[i]);
+                       spec->u2.objs_arr.obj_type, spec->u2.objs_arr.access,
+                       idr_vals[i], &pbundle->bundle);
                if (IS_ERR(attr->uobjects[i])) {
                        ret = PTR_ERR(attr->uobjects[i]);
                        break;
                }
-               pbundle->bundle.context = attr->uobjects[i]->context;
        }
 
        attr->len = i;
@@ -223,7 +222,7 @@ static int uverbs_process_idrs_array(struct bundle_priv *pbundle,
 
 static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
                                  struct uverbs_objs_arr_attr *attr,
-                                 bool commit)
+                                 bool commit, struct uverbs_attr_bundle *attrs)
 {
        const struct uverbs_attr_spec *spec = &attr_uapi->spec;
        int current_ret;
@@ -231,8 +230,9 @@ static int uverbs_free_idrs_array(const struct uverbs_api_attr *attr_uapi,
        size_t i;
 
        for (i = 0; i != attr->len; i++) {
-               current_ret = uverbs_finalize_object(
-                       attr->uobjects[i], spec->u2.objs_arr.access, commit);
+               current_ret = uverbs_finalize_object(attr->uobjects[i],
+                                                    spec->u2.objs_arr.access,
+                                                    commit, attrs);
                if (!ret)
                        ret = current_ret;
        }
@@ -325,13 +325,10 @@ static int uverbs_process_attr(struct bundle_priv *pbundle,
                 * IDR implementation today rejects negative IDs
                 */
                o_attr->uobject = uverbs_get_uobject_from_file(
-                                       spec->u.obj.obj_type,
-                                       pbundle->bundle.ufile,
-                                       spec->u.obj.access,
-                                       uattr->data_s64);
+                       spec->u.obj.obj_type, spec->u.obj.access,
+                       uattr->data_s64, &pbundle->bundle);
                if (IS_ERR(o_attr->uobject))
                        return PTR_ERR(o_attr->uobject);
-               pbundle->bundle.context = o_attr->uobject->context;
                __set_bit(attr_bkey, pbundle->uobj_finalize);
 
                if (spec->u.obj.access == UVERBS_ACCESS_NEW) {
@@ -456,12 +453,14 @@ static int ib_uverbs_run_method(struct bundle_priv *pbundle,
                uverbs_fill_udata(&pbundle->bundle,
                                  &pbundle->bundle.driver_udata,
                                  UVERBS_ATTR_UHW_IN, UVERBS_ATTR_UHW_OUT);
+       else
+               pbundle->bundle.driver_udata = (struct ib_udata){};
 
        if (destroy_bkey != UVERBS_API_ATTR_BKEY_LEN) {
                struct uverbs_obj_attr *destroy_attr =
                        &pbundle->bundle.attrs[destroy_bkey].obj_attr;
 
-               ret = uobj_destroy(destroy_attr->uobject);
+               ret = uobj_destroy(destroy_attr->uobject, &pbundle->bundle);
                if (ret)
                        return ret;
                __clear_bit(destroy_bkey, pbundle->uobj_finalize);
@@ -512,7 +511,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
 
                current_ret = uverbs_finalize_object(
                        attr->obj_attr.uobject,
-                       attr->obj_attr.attr_elm->spec.u.obj.access, commit);
+                       attr->obj_attr.attr_elm->spec.u.obj.access, commit,
+                       &pbundle->bundle);
                if (!ret)
                        ret = current_ret;
        }
@@ -535,7 +535,8 @@ static int bundle_destroy(struct bundle_priv *pbundle, bool commit)
 
                if (attr_uapi->spec.type == UVERBS_ATTR_TYPE_IDRS_ARRAY) {
                        current_ret = uverbs_free_idrs_array(
-                               attr_uapi, &attr->objs_arr_attr, commit);
+                               attr_uapi, &attr->objs_arr_attr, commit,
+                               &pbundle->bundle);
                        if (!ret)
                                ret = current_ret;
                }
index 8b43dd9..84a5e9a 100644 (file)
@@ -723,7 +723,7 @@ static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
                         * then the command request structure starts
                         * with a '__aligned u64 response' member.
                         */
-                       ret = get_user(response, (const u64 *)buf);
+                       ret = get_user(response, (const u64 __user *)buf);
                        if (ret)
                                goto out_unlock;
 
@@ -926,43 +926,32 @@ static const struct vm_operations_struct rdma_umap_ops = {
        .fault = rdma_umap_fault,
 };
 
-static struct rdma_umap_priv *rdma_user_mmap_pre(struct ib_ucontext *ucontext,
-                                                struct vm_area_struct *vma,
-                                                unsigned long size)
+/*
+ * Map IO memory into a process. This is to be called by drivers as part of
+ * their mmap() functions if they wish to send something like PCI-E BAR memory
+ * to userspace.
+ */
+int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
+                     unsigned long pfn, unsigned long size, pgprot_t prot)
 {
        struct ib_uverbs_file *ufile = ucontext->ufile;
        struct rdma_umap_priv *priv;
 
        if (!(vma->vm_flags & VM_SHARED))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (vma->vm_end - vma->vm_start != size)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        /* Driver is using this wrong, must be called by ib_uverbs_mmap */
        if (WARN_ON(!vma->vm_file ||
                    vma->vm_file->private_data != ufile))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        lockdep_assert_held(&ufile->device->disassociate_srcu);
 
        priv = kzalloc(sizeof(*priv), GFP_KERNEL);
        if (!priv)
-               return ERR_PTR(-ENOMEM);
-       return priv;
-}
-
-/*
- * Map IO memory into a process. This is to be called by drivers as part of
- * their mmap() functions if they wish to send something like PCI-E BAR memory
- * to userspace.
- */
-int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
-                     unsigned long pfn, unsigned long size, pgprot_t prot)
-{
-       struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
-
-       if (IS_ERR(priv))
-               return PTR_ERR(priv);
+               return -ENOMEM;
 
        vma->vm_page_prot = prot;
        if (io_remap_pfn_range(vma, vma->vm_start, pfn, size, prot)) {
@@ -975,35 +964,6 @@ int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
 }
 EXPORT_SYMBOL(rdma_user_mmap_io);
 
-/*
- * The page case is here for a slightly different reason, the driver expects
- * to be able to free the page it is sharing to user space when it destroys
- * its ucontext, which means we need to zap the user space references.
- *
- * We could handle this differently by providing an API to allocate a shared
- * page and then only freeing the shared page when the last ufile is
- * destroyed.
- */
-int rdma_user_mmap_page(struct ib_ucontext *ucontext,
-                       struct vm_area_struct *vma, struct page *page,
-                       unsigned long size)
-{
-       struct rdma_umap_priv *priv = rdma_user_mmap_pre(ucontext, vma, size);
-
-       if (IS_ERR(priv))
-               return PTR_ERR(priv);
-
-       if (remap_pfn_range(vma, vma->vm_start, page_to_pfn(page), size,
-                           vma->vm_page_prot)) {
-               kfree(priv);
-               return -EAGAIN;
-       }
-
-       rdma_umap_priv_init(priv, vma);
-       return 0;
-}
-EXPORT_SYMBOL(rdma_user_mmap_page);
-
 void uverbs_user_mmap_disassociate(struct ib_uverbs_file *ufile)
 {
        struct rdma_umap_priv *priv, *next_priv;
@@ -1094,6 +1054,11 @@ static int ib_uverbs_open(struct inode *inode, struct file *filp)
                goto err;
        }
 
+       if (!rdma_dev_access_netns(ib_dev, current->nsproxy->net_ns)) {
+               ret = -EPERM;
+               goto err;
+       }
+
        /* In case IB device supports disassociate ucontext, there is no hard
         * dependency between uverbs device and its low level device.
         */
index f224cb7..35b2e2c 100644 (file)
 #include "uverbs.h"
 
 static int uverbs_free_ah(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
-       return rdma_destroy_ah((struct ib_ah *)uobject->object,
-                              RDMA_DESTROY_AH_SLEEPABLE);
+       return rdma_destroy_ah_user((struct ib_ah *)uobject->object,
+                                   RDMA_DESTROY_AH_SLEEPABLE,
+                                   &attrs->driver_udata);
 }
 
 static int uverbs_free_flow(struct ib_uobject *uobject,
-                           enum rdma_remove_reason why)
+                           enum rdma_remove_reason why,
+                           struct uverbs_attr_bundle *attrs)
 {
        struct ib_flow *flow = (struct ib_flow *)uobject->object;
        struct ib_uflow_object *uflow =
@@ -66,13 +69,15 @@ static int uverbs_free_flow(struct ib_uobject *uobject,
 }
 
 static int uverbs_free_mw(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        return uverbs_dealloc_mw((struct ib_mw *)uobject->object);
 }
 
 static int uverbs_free_qp(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        struct ib_qp *qp = uobject->object;
        struct ib_uqp_object *uqp =
@@ -93,19 +98,20 @@ static int uverbs_free_qp(struct ib_uobject *uobject,
                ib_uverbs_detach_umcast(qp, uqp);
        }
 
-       ret = ib_destroy_qp(qp);
+       ret = ib_destroy_qp_user(qp, &attrs->driver_udata);
        if (ib_is_destroy_retryable(ret, why, uobject))
                return ret;
 
        if (uqp->uxrcd)
                atomic_dec(&uqp->uxrcd->refcnt);
 
-       ib_uverbs_release_uevent(uobject->context->ufile, &uqp->uevent);
+       ib_uverbs_release_uevent(attrs->ufile, &uqp->uevent);
        return ret;
 }
 
 static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
-                                  enum rdma_remove_reason why)
+                                  enum rdma_remove_reason why,
+                                  struct uverbs_attr_bundle *attrs)
 {
        struct ib_rwq_ind_table *rwq_ind_tbl = uobject->object;
        struct ib_wq **ind_tbl = rwq_ind_tbl->ind_tbl;
@@ -120,23 +126,25 @@ static int uverbs_free_rwq_ind_tbl(struct ib_uobject *uobject,
 }
 
 static int uverbs_free_wq(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        struct ib_wq *wq = uobject->object;
        struct ib_uwq_object *uwq =
                container_of(uobject, struct ib_uwq_object, uevent.uobject);
        int ret;
 
-       ret = ib_destroy_wq(wq);
+       ret = ib_destroy_wq(wq, &attrs->driver_udata);
        if (ib_is_destroy_retryable(ret, why, uobject))
                return ret;
 
-       ib_uverbs_release_uevent(uobject->context->ufile, &uwq->uevent);
+       ib_uverbs_release_uevent(attrs->ufile, &uwq->uevent);
        return ret;
 }
 
 static int uverbs_free_srq(struct ib_uobject *uobject,
-                          enum rdma_remove_reason why)
+                          enum rdma_remove_reason why,
+                          struct uverbs_attr_bundle *attrs)
 {
        struct ib_srq *srq = uobject->object;
        struct ib_uevent_object *uevent =
@@ -144,7 +152,7 @@ static int uverbs_free_srq(struct ib_uobject *uobject,
        enum ib_srq_type  srq_type = srq->srq_type;
        int ret;
 
-       ret = ib_destroy_srq(srq);
+       ret = ib_destroy_srq_user(srq, &attrs->driver_udata);
        if (ib_is_destroy_retryable(ret, why, uobject))
                return ret;
 
@@ -155,12 +163,13 @@ static int uverbs_free_srq(struct ib_uobject *uobject,
                atomic_dec(&us->uxrcd->refcnt);
        }
 
-       ib_uverbs_release_uevent(uobject->context->ufile, uevent);
+       ib_uverbs_release_uevent(attrs->ufile, uevent);
        return ret;
 }
 
 static int uverbs_free_xrcd(struct ib_uobject *uobject,
-                           enum rdma_remove_reason why)
+                           enum rdma_remove_reason why,
+                           struct uverbs_attr_bundle *attrs)
 {
        struct ib_xrcd *xrcd = uobject->object;
        struct ib_uxrcd_object *uxrcd =
@@ -171,15 +180,16 @@ static int uverbs_free_xrcd(struct ib_uobject *uobject,
        if (ret)
                return ret;
 
-       mutex_lock(&uobject->context->ufile->device->xrcd_tree_mutex);
-       ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why);
-       mutex_unlock(&uobject->context->ufile->device->xrcd_tree_mutex);
+       mutex_lock(&attrs->ufile->device->xrcd_tree_mutex);
+       ret = ib_uverbs_dealloc_xrcd(uobject, xrcd, why, attrs);
+       mutex_unlock(&attrs->ufile->device->xrcd_tree_mutex);
 
        return ret;
 }
 
 static int uverbs_free_pd(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        struct ib_pd *pd = uobject->object;
        int ret;
@@ -188,7 +198,7 @@ static int uverbs_free_pd(struct ib_uobject *uobject,
        if (ret)
                return ret;
 
-       ib_dealloc_pd(pd);
+       ib_dealloc_pd_user(pd, &attrs->driver_udata);
        return 0;
 }
 
index 309c5e8..9f01330 100644 (file)
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_counters(struct ib_uobject *uobject,
-                               enum rdma_remove_reason why)
+                               enum rdma_remove_reason why,
+                               struct uverbs_attr_bundle *attrs)
 {
        struct ib_counters *counters = uobject->object;
        int ret;
@@ -52,7 +54,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_COUNTERS_CREATE)(
 {
        struct ib_uobject *uobj = uverbs_attr_get_uobject(
                attrs, UVERBS_ATTR_CREATE_COUNTERS_HANDLE);
-       struct ib_device *ib_dev = uobj->context->device;
+       struct ib_device *ib_dev = attrs->context->device;
        struct ib_counters *counters;
        int ret;
 
index a59ea89..db5c46a 100644 (file)
@@ -35,7 +35,8 @@
 #include "uverbs.h"
 
 static int uverbs_free_cq(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        struct ib_cq *cq = uobject->object;
        struct ib_uverbs_event_queue *ev_queue = cq->cq_context;
@@ -43,12 +44,12 @@ static int uverbs_free_cq(struct ib_uobject *uobject,
                container_of(uobject, struct ib_ucq_object, uobject);
        int ret;
 
-       ret = ib_destroy_cq(cq);
+       ret = ib_destroy_cq_user(cq, &attrs->driver_udata);
        if (ib_is_destroy_retryable(ret, why, uobject))
                return ret;
 
        ib_uverbs_release_ucq(
-               uobject->context->ufile,
+               attrs->ufile,
                ev_queue ? container_of(ev_queue,
                                        struct ib_uverbs_completion_event_file,
                                        ev_queue) :
@@ -63,7 +64,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        struct ib_ucq_object *obj = container_of(
                uverbs_attr_get_uobject(attrs, UVERBS_ATTR_CREATE_CQ_HANDLE),
                typeof(*obj), uobject);
-       struct ib_device *ib_dev = obj->uobject.context->device;
+       struct ib_device *ib_dev = attrs->context->device;
        int ret;
        u64 user_handle;
        struct ib_cq_init_attr attr = {};
@@ -110,8 +111,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_CQ_CREATE)(
        INIT_LIST_HEAD(&obj->comp_list);
        INIT_LIST_HEAD(&obj->async_list);
 
-       cq = ib_dev->ops.create_cq(ib_dev, &attr, obj->uobject.context,
-                                  &attrs->driver_udata);
+       cq = ib_dev->ops.create_cq(ib_dev, &attr, &attrs->driver_udata);
        if (IS_ERR(cq)) {
                ret = PTR_ERR(cq);
                goto err_event_file;
index 2ef7063..d5a1de3 100644 (file)
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_dm(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
        struct ib_dm *dm = uobject->object;
        int ret;
@@ -43,7 +45,7 @@ static int uverbs_free_dm(struct ib_uobject *uobject,
        if (ret)
                return ret;
 
-       return dm->device->ops.dealloc_dm(dm);
+       return dm->device->ops.dealloc_dm(dm, attrs);
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
@@ -53,7 +55,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
        struct ib_uobject *uobj =
                uverbs_attr_get(attrs, UVERBS_ATTR_ALLOC_DM_HANDLE)
                        ->obj_attr.uobject;
-       struct ib_device *ib_dev = uobj->context->device;
+       struct ib_device *ib_dev = attrs->context->device;
        struct ib_dm *dm;
        int ret;
 
@@ -70,7 +72,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_ALLOC)(
        if (ret)
                return ret;
 
-       dm = ib_dev->ops.alloc_dm(ib_dev, uobj->context, &attr, attrs);
+       dm = ib_dev->ops.alloc_dm(ib_dev, attrs->context, &attr, attrs);
        if (IS_ERR(dm))
                return PTR_ERR(dm);
 
index 4962b87..459cf16 100644 (file)
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_flow_action(struct ib_uobject *uobject,
-                                  enum rdma_remove_reason why)
+                                  enum rdma_remove_reason why,
+                                  struct uverbs_attr_bundle *attrs)
 {
        struct ib_flow_action *action = uobject->object;
        int ret;
@@ -308,7 +310,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_FLOW_ACTION_ESP_CREATE)(
 {
        struct ib_uobject *uobj = uverbs_attr_get_uobject(
                attrs, UVERBS_ATTR_CREATE_FLOW_ACTION_ESP_HANDLE);
-       struct ib_device *ib_dev = uobj->context->device;
+       struct ib_device *ib_dev = attrs->context->device;
        int                               ret;
        struct ib_flow_action             *action;
        struct ib_flow_action_esp_attr    esp_attr = {};
index 4d4be0c..610d3b9 100644 (file)
  * SOFTWARE.
  */
 
+#include "rdma_core.h"
 #include "uverbs.h"
 #include <rdma/uverbs_std_types.h>
 
 static int uverbs_free_mr(struct ib_uobject *uobject,
-                         enum rdma_remove_reason why)
+                         enum rdma_remove_reason why,
+                         struct uverbs_attr_bundle *attrs)
 {
-       return ib_dereg_mr((struct ib_mr *)uobject->object);
+       return ib_dereg_mr_user((struct ib_mr *)uobject->object,
+                               &attrs->driver_udata);
 }
 
 static int UVERBS_HANDLER(UVERBS_METHOD_ADVISE_MR)(
@@ -145,7 +148,7 @@ static int UVERBS_HANDLER(UVERBS_METHOD_DM_MR_REG)(
        return 0;
 
 err_dereg:
-       ib_dereg_mr(mr);
+       ib_dereg_mr_user(mr, &attrs->driver_udata);
 
        return ret;
 }
index 5a5e83f..e666a1f 100644 (file)
@@ -218,6 +218,8 @@ rdma_node_get_transport(enum rdma_node_type node_type)
                return RDMA_TRANSPORT_USNIC_UDP;
        if (node_type == RDMA_NODE_RNIC)
                return RDMA_TRANSPORT_IWARP;
+       if (node_type == RDMA_NODE_UNSPECIFIED)
+               return RDMA_TRANSPORT_UNSPECIFIED;
 
        return RDMA_TRANSPORT_IB;
 }
@@ -269,7 +271,7 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
        pd->res.type = RDMA_RESTRACK_PD;
        rdma_restrack_set_task(&pd->res, caller);
 
-       ret = device->ops.alloc_pd(pd, NULL, NULL);
+       ret = device->ops.alloc_pd(pd, NULL);
        if (ret) {
                kfree(pd);
                return ERR_PTR(ret);
@@ -316,17 +318,18 @@ EXPORT_SYMBOL(__ib_alloc_pd);
 /**
  * ib_dealloc_pd - Deallocates a protection domain.
  * @pd: The protection domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
  *
  * It is an error to call this function while any resources in the pd still
  * exist.  The caller is responsible to synchronously destroy them and
  * guarantee no new allocations will happen.
  */
-void ib_dealloc_pd(struct ib_pd *pd)
+void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata)
 {
        int ret;
 
        if (pd->__internal_mr) {
-               ret = pd->device->ops.dereg_mr(pd->__internal_mr);
+               ret = pd->device->ops.dereg_mr(pd->__internal_mr, NULL);
                WARN_ON(ret);
                pd->__internal_mr = NULL;
        }
@@ -336,10 +339,10 @@ void ib_dealloc_pd(struct ib_pd *pd)
        WARN_ON(atomic_read(&pd->usecnt));
 
        rdma_restrack_del(&pd->res);
-       pd->device->ops.dealloc_pd(pd);
+       pd->device->ops.dealloc_pd(pd, udata);
        kfree(pd);
 }
-EXPORT_SYMBOL(ib_dealloc_pd);
+EXPORT_SYMBOL(ib_dealloc_pd_user);
 
 /* Address handles */
 
@@ -495,25 +498,33 @@ static struct ib_ah *_rdma_create_ah(struct ib_pd *pd,
                                     u32 flags,
                                     struct ib_udata *udata)
 {
+       struct ib_device *device = pd->device;
        struct ib_ah *ah;
+       int ret;
 
        might_sleep_if(flags & RDMA_CREATE_AH_SLEEPABLE);
 
-       if (!pd->device->ops.create_ah)
+       if (!device->ops.create_ah)
                return ERR_PTR(-EOPNOTSUPP);
 
-       ah = pd->device->ops.create_ah(pd, ah_attr, flags, udata);
+       ah = rdma_zalloc_drv_obj_gfp(
+               device, ib_ah,
+               (flags & RDMA_CREATE_AH_SLEEPABLE) ? GFP_KERNEL : GFP_ATOMIC);
+       if (!ah)
+               return ERR_PTR(-ENOMEM);
 
-       if (!IS_ERR(ah)) {
-               ah->device  = pd->device;
-               ah->pd      = pd;
-               ah->uobject = NULL;
-               ah->type    = ah_attr->type;
-               ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
+       ah->device = device;
+       ah->pd = pd;
+       ah->type = ah_attr->type;
+       ah->sgid_attr = rdma_update_sgid_attr(ah_attr, NULL);
 
-               atomic_inc(&pd->usecnt);
+       ret = device->ops.create_ah(ah, ah_attr, flags, udata);
+       if (ret) {
+               kfree(ah);
+               return ERR_PTR(ret);
        }
 
+       atomic_inc(&pd->usecnt);
        return ah;
 }
 
@@ -930,25 +941,24 @@ int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr)
 }
 EXPORT_SYMBOL(rdma_query_ah);
 
-int rdma_destroy_ah(struct ib_ah *ah, u32 flags)
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata)
 {
        const struct ib_gid_attr *sgid_attr = ah->sgid_attr;
        struct ib_pd *pd;
-       int ret;
 
        might_sleep_if(flags & RDMA_DESTROY_AH_SLEEPABLE);
 
        pd = ah->pd;
-       ret = ah->device->ops.destroy_ah(ah, flags);
-       if (!ret) {
-               atomic_dec(&pd->usecnt);
-               if (sgid_attr)
-                       rdma_put_gid_attr(sgid_attr);
-       }
 
-       return ret;
+       ah->device->ops.destroy_ah(ah, flags);
+       atomic_dec(&pd->usecnt);
+       if (sgid_attr)
+               rdma_put_gid_attr(sgid_attr);
+
+       kfree(ah);
+       return 0;
 }
-EXPORT_SYMBOL(rdma_destroy_ah);
+EXPORT_SYMBOL(rdma_destroy_ah_user);
 
 /* Shared receive queues */
 
@@ -956,29 +966,40 @@ struct ib_srq *ib_create_srq(struct ib_pd *pd,
                             struct ib_srq_init_attr *srq_init_attr)
 {
        struct ib_srq *srq;
+       int ret;
 
        if (!pd->device->ops.create_srq)
                return ERR_PTR(-EOPNOTSUPP);
 
-       srq = pd->device->ops.create_srq(pd, srq_init_attr, NULL);
-
-       if (!IS_ERR(srq)) {
-               srq->device        = pd->device;
-               srq->pd            = pd;
-               srq->uobject       = NULL;
-               srq->event_handler = srq_init_attr->event_handler;
-               srq->srq_context   = srq_init_attr->srq_context;
-               srq->srq_type      = srq_init_attr->srq_type;
-               if (ib_srq_has_cq(srq->srq_type)) {
-                       srq->ext.cq   = srq_init_attr->ext.cq;
-                       atomic_inc(&srq->ext.cq->usecnt);
-               }
-               if (srq->srq_type == IB_SRQT_XRC) {
-                       srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
-                       atomic_inc(&srq->ext.xrc.xrcd->usecnt);
-               }
-               atomic_inc(&pd->usecnt);
-               atomic_set(&srq->usecnt, 0);
+       srq = rdma_zalloc_drv_obj(pd->device, ib_srq);
+       if (!srq)
+               return ERR_PTR(-ENOMEM);
+
+       srq->device = pd->device;
+       srq->pd = pd;
+       srq->event_handler = srq_init_attr->event_handler;
+       srq->srq_context = srq_init_attr->srq_context;
+       srq->srq_type = srq_init_attr->srq_type;
+
+       if (ib_srq_has_cq(srq->srq_type)) {
+               srq->ext.cq = srq_init_attr->ext.cq;
+               atomic_inc(&srq->ext.cq->usecnt);
+       }
+       if (srq->srq_type == IB_SRQT_XRC) {
+               srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+               atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+       }
+       atomic_inc(&pd->usecnt);
+
+       ret = pd->device->ops.create_srq(srq, srq_init_attr, NULL);
+       if (ret) {
+               atomic_dec(&srq->pd->usecnt);
+               if (srq->srq_type == IB_SRQT_XRC)
+                       atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+               if (ib_srq_has_cq(srq->srq_type))
+                       atomic_dec(&srq->ext.cq->usecnt);
+               kfree(srq);
+               return ERR_PTR(ret);
        }
 
        return srq;
@@ -1003,36 +1024,23 @@ int ib_query_srq(struct ib_srq *srq,
 }
 EXPORT_SYMBOL(ib_query_srq);
 
-int ib_destroy_srq(struct ib_srq *srq)
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata)
 {
-       struct ib_pd *pd;
-       enum ib_srq_type srq_type;
-       struct ib_xrcd *uninitialized_var(xrcd);
-       struct ib_cq *uninitialized_var(cq);
-       int ret;
-
        if (atomic_read(&srq->usecnt))
                return -EBUSY;
 
-       pd = srq->pd;
-       srq_type = srq->srq_type;
-       if (ib_srq_has_cq(srq_type))
-               cq = srq->ext.cq;
-       if (srq_type == IB_SRQT_XRC)
-               xrcd = srq->ext.xrc.xrcd;
+       srq->device->ops.destroy_srq(srq, udata);
 
-       ret = srq->device->ops.destroy_srq(srq);
-       if (!ret) {
-               atomic_dec(&pd->usecnt);
-               if (srq_type == IB_SRQT_XRC)
-                       atomic_dec(&xrcd->usecnt);
-               if (ib_srq_has_cq(srq_type))
-                       atomic_dec(&cq->usecnt);
-       }
+       atomic_dec(&srq->pd->usecnt);
+       if (srq->srq_type == IB_SRQT_XRC)
+               atomic_dec(&srq->ext.xrc.xrcd->usecnt);
+       if (ib_srq_has_cq(srq->srq_type))
+               atomic_dec(&srq->ext.cq->usecnt);
+       kfree(srq);
 
-       return ret;
+       return 0;
 }
-EXPORT_SYMBOL(ib_destroy_srq);
+EXPORT_SYMBOL(ib_destroy_srq_user);
 
 /* Queue pairs */
 
@@ -1111,8 +1119,9 @@ struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
 }
 EXPORT_SYMBOL(ib_open_qp);
 
-static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
-                                  struct ib_qp_init_attr *qp_init_attr)
+static struct ib_qp *create_xrc_qp_user(struct ib_qp *qp,
+                                       struct ib_qp_init_attr *qp_init_attr,
+                                       struct ib_udata *udata)
 {
        struct ib_qp *real_qp = qp;
 
@@ -1134,8 +1143,9 @@ static struct ib_qp *create_xrc_qp(struct ib_qp *qp,
        return qp;
 }
 
-struct ib_qp *ib_create_qp(struct ib_pd *pd,
-                          struct ib_qp_init_attr *qp_init_attr)
+struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
+                               struct ib_qp_init_attr *qp_init_attr,
+                               struct ib_udata *udata)
 {
        struct ib_device *device = pd ? pd->device : qp_init_attr->xrcd->device;
        struct ib_qp *qp;
@@ -1164,7 +1174,6 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
        if (ret)
                goto err;
 
-       qp->real_qp    = qp;
        qp->qp_type    = qp_init_attr->qp_type;
        qp->rwq_ind_tbl = qp_init_attr->rwq_ind_tbl;
 
@@ -1176,7 +1185,8 @@ struct ib_qp *ib_create_qp(struct ib_pd *pd,
        qp->port = 0;
 
        if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
-               struct ib_qp *xrc_qp = create_xrc_qp(qp, qp_init_attr);
+               struct ib_qp *xrc_qp =
+                       create_xrc_qp_user(qp, qp_init_attr, udata);
 
                if (IS_ERR(xrc_qp)) {
                        ret = PTR_ERR(xrc_qp);
@@ -1230,7 +1240,7 @@ err:
        return ERR_PTR(ret);
 
 }
-EXPORT_SYMBOL(ib_create_qp);
+EXPORT_SYMBOL(ib_create_qp_user);
 
 static const struct {
        int                     valid;
@@ -1837,7 +1847,7 @@ static int __ib_destroy_shared_qp(struct ib_qp *qp)
        return 0;
 }
 
-int ib_destroy_qp(struct ib_qp *qp)
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata)
 {
        const struct ib_gid_attr *alt_path_sgid_attr = qp->alt_path_sgid_attr;
        const struct ib_gid_attr *av_sgid_attr = qp->av_sgid_attr;
@@ -1869,7 +1879,7 @@ int ib_destroy_qp(struct ib_qp *qp)
                rdma_rw_cleanup_mrs(qp);
 
        rdma_restrack_del(&qp->res);
-       ret = qp->device->ops.destroy_qp(qp);
+       ret = qp->device->ops.destroy_qp(qp, udata);
        if (!ret) {
                if (alt_path_sgid_attr)
                        rdma_put_gid_attr(alt_path_sgid_attr);
@@ -1894,7 +1904,7 @@ int ib_destroy_qp(struct ib_qp *qp)
 
        return ret;
 }
-EXPORT_SYMBOL(ib_destroy_qp);
+EXPORT_SYMBOL(ib_destroy_qp_user);
 
 /* Completion queues */
 
@@ -1907,7 +1917,7 @@ struct ib_cq *__ib_create_cq(struct ib_device *device,
 {
        struct ib_cq *cq;
 
-       cq = device->ops.create_cq(device, cq_attr, NULL, NULL);
+       cq = device->ops.create_cq(device, cq_attr, NULL);
 
        if (!IS_ERR(cq)) {
                cq->device        = device;
@@ -1933,15 +1943,15 @@ int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period)
 }
 EXPORT_SYMBOL(rdma_set_cq_moderation);
 
-int ib_destroy_cq(struct ib_cq *cq)
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata)
 {
        if (atomic_read(&cq->usecnt))
                return -EBUSY;
 
        rdma_restrack_del(&cq->res);
-       return cq->device->ops.destroy_cq(cq);
+       return cq->device->ops.destroy_cq(cq, udata);
 }
-EXPORT_SYMBOL(ib_destroy_cq);
+EXPORT_SYMBOL(ib_destroy_cq_user);
 
 int ib_resize_cq(struct ib_cq *cq, int cqe)
 {
@@ -1952,14 +1962,14 @@ EXPORT_SYMBOL(ib_resize_cq);
 
 /* Memory regions */
 
-int ib_dereg_mr(struct ib_mr *mr)
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata)
 {
        struct ib_pd *pd = mr->pd;
        struct ib_dm *dm = mr->dm;
        int ret;
 
        rdma_restrack_del(&mr->res);
-       ret = mr->device->ops.dereg_mr(mr);
+       ret = mr->device->ops.dereg_mr(mr, udata);
        if (!ret) {
                atomic_dec(&pd->usecnt);
                if (dm)
@@ -1968,13 +1978,14 @@ int ib_dereg_mr(struct ib_mr *mr)
 
        return ret;
 }
-EXPORT_SYMBOL(ib_dereg_mr);
+EXPORT_SYMBOL(ib_dereg_mr_user);
 
 /**
  * ib_alloc_mr() - Allocates a memory region
  * @pd:            protection domain associated with the region
  * @mr_type:       memory region type
  * @max_num_sg:    maximum sg entries available for registration.
+ * @udata:        user data or null for kernel objects
  *
  * Notes:
  * Memory registeration page/sg lists must not exceed max_num_sg.
@@ -1982,16 +1993,15 @@ EXPORT_SYMBOL(ib_dereg_mr);
  * max_num_sg * used_page_size.
  *
  */
-struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
-                         enum ib_mr_type mr_type,
-                         u32 max_num_sg)
+struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata)
 {
        struct ib_mr *mr;
 
        if (!pd->device->ops.alloc_mr)
                return ERR_PTR(-EOPNOTSUPP);
 
-       mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg);
+       mr = pd->device->ops.alloc_mr(pd, mr_type, max_num_sg, udata);
        if (!IS_ERR(mr)) {
                mr->device  = pd->device;
                mr->pd      = pd;
@@ -2005,7 +2015,7 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
 
        return mr;
 }
-EXPORT_SYMBOL(ib_alloc_mr);
+EXPORT_SYMBOL(ib_alloc_mr_user);
 
 /* "Fast" memory regions */
 
@@ -2138,7 +2148,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
        if (!device->ops.alloc_xrcd)
                return ERR_PTR(-EOPNOTSUPP);
 
-       xrcd = device->ops.alloc_xrcd(device, NULL, NULL);
+       xrcd = device->ops.alloc_xrcd(device, NULL);
        if (!IS_ERR(xrcd)) {
                xrcd->device = device;
                xrcd->inode = NULL;
@@ -2151,7 +2161,7 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller)
 }
 EXPORT_SYMBOL(__ib_alloc_xrcd);
 
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
        struct ib_qp *qp;
        int ret;
@@ -2166,7 +2176,7 @@ int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
                        return ret;
        }
 
-       return xrcd->device->ops.dealloc_xrcd(xrcd);
+       return xrcd->device->ops.dealloc_xrcd(xrcd, udata);
 }
 EXPORT_SYMBOL(ib_dealloc_xrcd);
 
@@ -2210,10 +2220,11 @@ struct ib_wq *ib_create_wq(struct ib_pd *pd,
 EXPORT_SYMBOL(ib_create_wq);
 
 /**
- * ib_destroy_wq - Destroys the specified WQ.
+ * ib_destroy_wq - Destroys the specified user WQ.
  * @wq: The WQ to destroy.
+ * @udata: Valid user data
  */
-int ib_destroy_wq(struct ib_wq *wq)
+int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
        int err;
        struct ib_cq *cq = wq->cq;
@@ -2222,7 +2233,7 @@ int ib_destroy_wq(struct ib_wq *wq)
        if (atomic_read(&wq->usecnt))
                return -EBUSY;
 
-       err = wq->device->ops.destroy_wq(wq);
+       err = wq->device->ops.destroy_wq(wq, udata);
        if (!err) {
                atomic_dec(&pd->usecnt);
                atomic_dec(&cq->usecnt);
@@ -2701,3 +2712,37 @@ int rdma_init_netdev(struct ib_device *device, u8 port_num,
                                             netdev, params.param);
 }
 EXPORT_SYMBOL(rdma_init_netdev);
+
+/**
+ * __rdma_block_iter_start - Set up a block iterator over a scatterlist
+ * @biter: iterator state; zeroed here before being filled in
+ * @sglist: first scatterlist entry to walk
+ * @nents: number of scatterlist entries to walk
+ * @pgsz: block size selected by the driver
+ *
+ * Only the index of the highest set bit of @pgsz is stored.
+ * NOTE(review): presumably callers always pass a non-zero power of two;
+ * confirm, since __fls() is handed @pgsz unchecked.
+ */
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+                            struct scatterlist *sglist, unsigned int nents,
+                            unsigned long pgsz)
+{
+       memset(biter, 0, sizeof(struct ib_block_iter));
+       biter->__sg = sglist;
+       biter->__sg_nents = nents;
+
+       /* Driver provides best block size to use */
+       biter->__pg_bit = __fls(pgsz);
+}
+EXPORT_SYMBOL(__rdma_block_iter_start);
+
+/**
+ * __rdma_block_iter_next - Advance a block iterator by one block
+ * @biter: iterator state set up by __rdma_block_iter_start()
+ *
+ * Records in @biter the DMA address of the current position (start of the
+ * current scatterlist entry plus the bytes already consumed from it), then
+ * steps to the next block boundary.  When that step reaches or passes the
+ * end of the current entry, the iterator moves on to the next entry.
+ *
+ * Return: false when there are no entries left to walk, true otherwise.
+ */
+bool __rdma_block_iter_next(struct ib_block_iter *biter)
+{
+       u64 block_offset;
+       u64 sg_delta;
+
+       if (!biter->__sg_nents || !biter->__sg)
+               return false;
+
+       biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance;
+       block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1);
+       /* Bytes from the current position up to the next block boundary */
+       sg_delta = BIT_ULL(biter->__pg_bit) - block_offset;
+
+       /*
+        * Compare the step against what is left of this entry *before*
+        * adding it.  Adding first can push __sg_advance past the entry
+        * length and, if the counter is narrower than the step (its
+        * declaration is not visible here), wrap it so the entry never
+        * looks exhausted and the walk never terminates.
+        */
+       if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) {
+               biter->__sg_advance += sg_delta;
+       } else {
+               biter->__sg_advance = 0;
+               biter->__sg = sg_next(biter->__sg);
+               biter->__sg_nents--;
+       }
+
+       return true;
+}
+EXPORT_SYMBOL(__rdma_block_iter_next);
index e4f31c1..77094be 100644 (file)
@@ -3,6 +3,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA)          += mthca/
 obj-$(CONFIG_INFINIBAND_QIB)           += qib/
 obj-$(CONFIG_INFINIBAND_CXGB3)         += cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4)         += cxgb4/
+obj-$(CONFIG_INFINIBAND_EFA)           += efa/
 obj-$(CONFIG_INFINIBAND_I40IW)         += i40iw/
 obj-$(CONFIG_MLX4_INFINIBAND)          += mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)          += mlx5/
index d25439c..51e8234 100644 (file)
@@ -1,10 +1,10 @@
 config INFINIBAND_BNXT_RE
-    tristate "Broadcom Netxtreme HCA support"
-    depends on 64BIT
-    depends on ETHERNET && NETDEVICES && PCI && INET && DCB
-    select NET_VENDOR_BROADCOM
-    select BNXT
-    ---help---
+        tristate "Broadcom Netxtreme HCA support"
+        depends on 64BIT
+        depends on ETHERNET && NETDEVICES && PCI && INET && DCB
+        select NET_VENDOR_BROADCOM
+        select BNXT
+        ---help---
          This driver supports Broadcom NetXtreme-E 10/25/40/50 gigabit
          RoCE HCAs.  To compile this driver as a module, choose M here:
          the module will be called bnxt_re.
index 071b2fc..2c3685f 100644 (file)
@@ -119,21 +119,6 @@ static int bnxt_re_build_sgl(struct ib_sge *ib_sg_list,
 }
 
 /* Device */
-struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num)
-{
-       struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
-       struct net_device *netdev = NULL;
-
-       rcu_read_lock();
-       if (rdev)
-               netdev = rdev->netdev;
-       if (netdev)
-               dev_hold(netdev);
-
-       rcu_read_unlock();
-       return netdev;
-}
-
 int bnxt_re_query_device(struct ib_device *ibdev,
                         struct ib_device_attr *ib_attr,
                         struct ib_udata *udata)
@@ -375,8 +360,9 @@ int bnxt_re_add_gid(const struct ib_gid_attr *attr, void **context)
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(attr->device, ibdev);
        struct bnxt_qplib_sgid_tbl *sgid_tbl = &rdev->qplib_res.sgid_tbl;
 
-       if ((attr->ndev) && is_vlan_dev(attr->ndev))
-               vlan_id = vlan_dev_vlan_id(attr->ndev);
+       rc = rdma_read_gid_l2_fields(attr, &vlan_id, NULL);
+       if (rc)
+               return rc;
 
        rc = bnxt_qplib_add_sgid(sgid_tbl, (struct bnxt_qplib_gid *)&attr->gid,
                                 rdev->qplib_res.netdev->dev_addr,
@@ -564,7 +550,7 @@ fail:
 }
 
 /* Protection Domains */
-void bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
+void bnxt_re_dealloc_pd(struct ib_pd *ib_pd, struct ib_udata *udata)
 {
        struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
        struct bnxt_re_dev *rdev = pd->rdev;
@@ -576,14 +562,12 @@ void bnxt_re_dealloc_pd(struct ib_pd *ib_pd)
                                      &pd->qplib_pd);
 }
 
-int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *ucontext,
-                    struct ib_udata *udata)
+int bnxt_re_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
-       struct bnxt_re_ucontext *ucntx = container_of(ucontext,
-                                                     struct bnxt_re_ucontext,
-                                                     ib_uctx);
+       struct bnxt_re_ucontext *ucntx = rdma_udata_to_drv_context(
+               udata, struct bnxt_re_ucontext, ib_uctx);
        struct bnxt_re_pd *pd = container_of(ibpd, struct bnxt_re_pd, ib_pd);
        int rc;
 
@@ -635,20 +619,13 @@ fail:
 }
 
 /* Address Handles */
-int bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
+void bnxt_re_destroy_ah(struct ib_ah *ib_ah, u32 flags)
 {
        struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
        struct bnxt_re_dev *rdev = ah->rdev;
-       int rc;
 
-       rc = bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah,
-                                  !(flags & RDMA_DESTROY_AH_SLEEPABLE));
-       if (rc) {
-               dev_err(rdev_to_dev(rdev), "Failed to destroy HW AH");
-               return rc;
-       }
-       kfree(ah);
-       return 0;
+       bnxt_qplib_destroy_ah(&rdev->qplib_res, &ah->qplib_ah,
+                             !(flags & RDMA_DESTROY_AH_SLEEPABLE));
 }
 
 static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
@@ -669,26 +646,22 @@ static u8 bnxt_re_stack_to_dev_nw_type(enum rdma_network_type ntype)
        return nw_type;
 }
 
-struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
-                               struct rdma_ah_attr *ah_attr,
-                               u32 flags,
-                               struct ib_udata *udata)
+int bnxt_re_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
+                     u32 flags, struct ib_udata *udata)
 {
+       struct ib_pd *ib_pd = ib_ah->pd;
        struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
        const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
        struct bnxt_re_dev *rdev = pd->rdev;
        const struct ib_gid_attr *sgid_attr;
-       struct bnxt_re_ah *ah;
+       struct bnxt_re_ah *ah = container_of(ib_ah, struct bnxt_re_ah, ib_ah);
        u8 nw_type;
        int rc;
 
        if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
                dev_err(rdev_to_dev(rdev), "Failed to alloc AH: GRH not set");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
 
        ah->rdev = rdev;
        ah->qplib_ah.pd = &pd->qplib_pd;
@@ -718,7 +691,7 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
                                  !(flags & RDMA_CREATE_AH_SLEEPABLE));
        if (rc) {
                dev_err(rdev_to_dev(rdev), "Failed to allocate HW AH");
-               goto fail;
+               return rc;
        }
 
        /* Write AVID to shared page. */
@@ -735,11 +708,7 @@ struct ib_ah *bnxt_re_create_ah(struct ib_pd *ib_pd,
                spin_unlock_irqrestore(&uctx->sh_lock, flag);
        }
 
-       return &ah->ib_ah;
-
-fail:
-       kfree(ah);
-       return ERR_PTR(rc);
+       return 0;
 }
 
 int bnxt_re_modify_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
@@ -789,7 +758,7 @@ void bnxt_re_unlock_cqs(struct bnxt_re_qp *qp,
 }
 
 /* Queue Pairs */
-int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
+int bnxt_re_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
        struct bnxt_re_qp *qp = container_of(ib_qp, struct bnxt_re_qp, ib_qp);
        struct bnxt_re_dev *rdev = qp->rdev;
@@ -812,13 +781,8 @@ int bnxt_re_destroy_qp(struct ib_qp *ib_qp)
        bnxt_qplib_free_qp_res(&rdev->qplib_res, &qp->qplib_qp);
 
        if (ib_qp->qp_type == IB_QPT_GSI && rdev->qp1_sqp) {
-               rc = bnxt_qplib_destroy_ah(&rdev->qplib_res,
-                                          &rdev->sqp_ah->qplib_ah, false);
-               if (rc) {
-                       dev_err(rdev_to_dev(rdev),
-                               "Failed to destroy HW AH for shadow QP");
-                       return rc;
-               }
+               bnxt_qplib_destroy_ah(&rdev->qplib_res, &rdev->sqp_ah->qplib_ah,
+                                     false);
 
                bnxt_qplib_clean_qp(&qp->qplib_qp);
                rc = bnxt_qplib_destroy_qp(&rdev->qplib_res,
@@ -895,8 +859,9 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
                return PTR_ERR(umem);
 
        qp->sumem = umem;
-       qplib_qp->sq.sglist = umem->sg_head.sgl;
-       qplib_qp->sq.nmap = umem->nmap;
+       qplib_qp->sq.sg_info.sglist = umem->sg_head.sgl;
+       qplib_qp->sq.sg_info.npages = ib_umem_num_pages(umem);
+       qplib_qp->sq.sg_info.nmap = umem->nmap;
        qplib_qp->qp_handle = ureq.qp_handle;
 
        if (!qp->qplib_qp.srq) {
@@ -907,8 +872,9 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
                if (IS_ERR(umem))
                        goto rqfail;
                qp->rumem = umem;
-               qplib_qp->rq.sglist = umem->sg_head.sgl;
-               qplib_qp->rq.nmap = umem->nmap;
+               qplib_qp->rq.sg_info.sglist = umem->sg_head.sgl;
+               qplib_qp->rq.sg_info.npages = ib_umem_num_pages(umem);
+               qplib_qp->rq.sg_info.nmap = umem->nmap;
        }
 
        qplib_qp->dpi = &cntx->dpi;
@@ -916,8 +882,7 @@ static int bnxt_re_init_user_qp(struct bnxt_re_dev *rdev, struct bnxt_re_pd *pd,
 rqfail:
        ib_umem_release(qp->sumem);
        qp->sumem = NULL;
-       qplib_qp->sq.sglist = NULL;
-       qplib_qp->sq.nmap = 0;
+       memset(&qplib_qp->sq.sg_info, 0, sizeof(qplib_qp->sq.sg_info));
 
        return PTR_ERR(umem);
 }
@@ -1326,30 +1291,22 @@ static enum ib_mtu __to_ib_mtu(u32 mtu)
 }
 
 /* Shared Receive Queues */
-int bnxt_re_destroy_srq(struct ib_srq *ib_srq)
+void bnxt_re_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata)
 {
        struct bnxt_re_srq *srq = container_of(ib_srq, struct bnxt_re_srq,
                                               ib_srq);
        struct bnxt_re_dev *rdev = srq->rdev;
        struct bnxt_qplib_srq *qplib_srq = &srq->qplib_srq;
        struct bnxt_qplib_nq *nq = NULL;
-       int rc;
 
        if (qplib_srq->cq)
                nq = qplib_srq->cq->nq;
-       rc = bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
-       if (rc) {
-               dev_err(rdev_to_dev(rdev), "Destroy HW SRQ failed!");
-               return rc;
-       }
-
+       bnxt_qplib_destroy_srq(&rdev->qplib_res, qplib_srq);
        if (srq->umem)
                ib_umem_release(srq->umem);
-       kfree(srq);
        atomic_dec(&rdev->srq_count);
        if (nq)
                nq->budget--;
-       return 0;
 }
 
 static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
@@ -1374,22 +1331,25 @@ static int bnxt_re_init_user_srq(struct bnxt_re_dev *rdev,
                return PTR_ERR(umem);
 
        srq->umem = umem;
-       qplib_srq->nmap = umem->nmap;
-       qplib_srq->sglist = umem->sg_head.sgl;
+       qplib_srq->sg_info.sglist = umem->sg_head.sgl;
+       qplib_srq->sg_info.npages = ib_umem_num_pages(umem);
+       qplib_srq->sg_info.nmap = umem->nmap;
        qplib_srq->srq_handle = ureq.srq_handle;
        qplib_srq->dpi = &cntx->dpi;
 
        return 0;
 }
 
-struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd,
-                                 struct ib_srq_init_attr *srq_init_attr,
-                                 struct ib_udata *udata)
+int bnxt_re_create_srq(struct ib_srq *ib_srq,
+                      struct ib_srq_init_attr *srq_init_attr,
+                      struct ib_udata *udata)
 {
+       struct ib_pd *ib_pd = ib_srq->pd;
        struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
        struct bnxt_re_dev *rdev = pd->rdev;
        struct bnxt_qplib_dev_attr *dev_attr = &rdev->dev_attr;
-       struct bnxt_re_srq *srq;
+       struct bnxt_re_srq *srq =
+               container_of(ib_srq, struct bnxt_re_srq, ib_srq);
        struct bnxt_qplib_nq *nq = NULL;
        int rc, entries;
 
@@ -1404,11 +1364,6 @@ struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd,
                goto exit;
        }
 
-       srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq) {
-               rc = -ENOMEM;
-               goto exit;
-       }
        srq->rdev = rdev;
        srq->qplib_srq.pd = &pd->qplib_pd;
        srq->qplib_srq.dpi = &rdev->dpi_privileged;
@@ -1454,14 +1409,13 @@ struct ib_srq *bnxt_re_create_srq(struct ib_pd *ib_pd,
                nq->budget++;
        atomic_inc(&rdev->srq_count);
 
-       return &srq->ib_srq;
+       return 0;
 
 fail:
        if (srq->umem)
                ib_umem_release(srq->umem);
-       kfree(srq);
 exit:
-       return ERR_PTR(rc);
+       return rc;
 }
 
 int bnxt_re_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *srq_attr,
@@ -1684,8 +1638,11 @@ int bnxt_re_modify_qp(struct ib_qp *ib_qp, struct ib_qp_attr *qp_attr,
                                qp_attr->ah_attr.roce.dmac);
 
                sgid_attr = qp_attr->ah_attr.grh.sgid_attr;
-               memcpy(qp->qplib_qp.smac, sgid_attr->ndev->dev_addr,
-                      ETH_ALEN);
+               rc = rdma_read_gid_l2_fields(sgid_attr, NULL,
+                                            &qp->qplib_qp.smac[0]);
+               if (rc)
+                       return rc;
+
                nw_type = rdma_gid_attr_network_type(sgid_attr);
                switch (nw_type) {
                case RDMA_NETWORK_IPV4:
@@ -1904,8 +1861,10 @@ static int bnxt_re_build_qp1_send_v2(struct bnxt_re_qp *qp,
 
        memset(&qp->qp1_hdr, 0, sizeof(qp->qp1_hdr));
 
-       if (is_vlan_dev(sgid_attr->ndev))
-               vlan_id = vlan_dev_vlan_id(sgid_attr->ndev);
+       rc = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, NULL);
+       if (rc)
+               return rc;
+
        /* Get network header type for this GID */
        nw_type = rdma_gid_attr_network_type(sgid_attr);
        switch (nw_type) {
@@ -2558,7 +2517,7 @@ int bnxt_re_post_recv(struct ib_qp *ib_qp, const struct ib_recv_wr *wr,
 }
 
 /* Completion Queues */
-int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
+int bnxt_re_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        int rc;
        struct bnxt_re_cq *cq;
@@ -2587,7 +2546,6 @@ int bnxt_re_destroy_cq(struct ib_cq *ib_cq)
 
 struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata)
 {
        struct bnxt_re_dev *rdev = to_bnxt_re_dev(ibdev, ibdev);
@@ -2614,12 +2572,10 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
        if (entries > dev_attr->max_cq_wqes + 1)
                entries = dev_attr->max_cq_wqes + 1;
 
-       if (context) {
+       if (udata) {
                struct bnxt_re_cq_req req;
-               struct bnxt_re_ucontext *uctx = container_of
-                                               (context,
-                                                struct bnxt_re_ucontext,
-                                                ib_uctx);
+               struct bnxt_re_ucontext *uctx = rdma_udata_to_drv_context(
+                       udata, struct bnxt_re_ucontext, ib_uctx);
                if (ib_copy_from_udata(&req, udata, sizeof(req))) {
                        rc = -EFAULT;
                        goto fail;
@@ -2632,8 +2588,9 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                        rc = PTR_ERR(cq->umem);
                        goto fail;
                }
-               cq->qplib_cq.sghead = cq->umem->sg_head.sgl;
-               cq->qplib_cq.nmap = cq->umem->nmap;
+               cq->qplib_cq.sg_info.sglist = cq->umem->sg_head.sgl;
+               cq->qplib_cq.sg_info.npages = ib_umem_num_pages(cq->umem);
+               cq->qplib_cq.sg_info.nmap = cq->umem->nmap;
                cq->qplib_cq.dpi = &uctx->dpi;
        } else {
                cq->max_cql = min_t(u32, entries, MAX_CQL_PER_POLL);
@@ -2645,8 +2602,6 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                }
 
                cq->qplib_cq.dpi = &rdev->dpi_privileged;
-               cq->qplib_cq.sghead = NULL;
-               cq->qplib_cq.nmap = 0;
        }
        /*
         * Allocating the NQ in a round robin fashion. nq_alloc_cnt is a
@@ -2671,7 +2626,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
        atomic_inc(&rdev->cq_count);
        spin_lock_init(&cq->cq_lock);
 
-       if (context) {
+       if (udata) {
                struct bnxt_re_cq_resp resp;
 
                resp.cqid = cq->qplib_cq.id;
@@ -2689,7 +2644,7 @@ struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
        return &cq->ib_cq;
 
 c2fail:
-       if (context)
+       if (udata)
                ib_umem_release(cq->umem);
 fail:
        kfree(cq->cql);
@@ -3381,7 +3336,7 @@ fail:
        return ERR_PTR(rc);
 }
 
-int bnxt_re_dereg_mr(struct ib_mr *ib_mr)
+int bnxt_re_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct bnxt_re_mr *mr = container_of(ib_mr, struct bnxt_re_mr, ib_mr);
        struct bnxt_re_dev *rdev = mr->rdev;
@@ -3427,7 +3382,7 @@ int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
 }
 
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type type,
-                              u32 max_num_sg)
+                              u32 max_num_sg, struct ib_udata *udata)
 {
        struct bnxt_re_pd *pd = container_of(ib_pd, struct bnxt_re_pd, ib_pd);
        struct bnxt_re_dev *rdev = pd->rdev;
@@ -3552,17 +3507,12 @@ static int fill_umem_pbl_tbl(struct ib_umem *umem, u64 *pbl_tbl_orig,
                             int page_shift)
 {
        u64 *pbl_tbl = pbl_tbl_orig;
-       u64 paddr;
-       u64 page_mask = (1ULL << page_shift) - 1;
-       struct sg_dma_page_iter sg_iter;
+       u64 page_size =  BIT_ULL(page_shift);
+       struct ib_block_iter biter;
+
+       rdma_for_each_block(umem->sg_head.sgl, &biter, umem->nmap, page_size)
+               *pbl_tbl++ = rdma_block_iter_dma_address(&biter);
 
-       for_each_sg_dma_page (umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
-               paddr = sg_page_iter_dma_address(&sg_iter);
-               if (pbl_tbl == pbl_tbl_orig)
-                       *pbl_tbl++ = paddr & ~page_mask;
-               else if ((paddr & page_mask) == 0)
-                       *pbl_tbl++ = paddr;
-       }
        return pbl_tbl - pbl_tbl_orig;
 }
 
@@ -3624,7 +3574,9 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
                goto free_umem;
        }
 
-       page_shift = PAGE_SHIFT;
+       page_shift = __ffs(ib_umem_find_best_pgsz(umem,
+                               BNXT_RE_PAGE_SIZE_4K | BNXT_RE_PAGE_SIZE_2M,
+                               virt_addr));
 
        if (!bnxt_re_page_size_ok(page_shift)) {
                dev_err(rdev_to_dev(rdev), "umem page size unsupported!");
@@ -3632,17 +3584,13 @@ struct ib_mr *bnxt_re_reg_user_mr(struct ib_pd *ib_pd, u64 start, u64 length,
                goto fail;
        }
 
-       if (!umem->hugetlb && length > BNXT_RE_MAX_MR_SIZE_LOW) {
+       if (page_shift == BNXT_RE_PAGE_SHIFT_4K &&
+           length > BNXT_RE_MAX_MR_SIZE_LOW) {
                dev_err(rdev_to_dev(rdev), "Requested MR Sz:%llu Max sup:%llu",
                        length, (u64)BNXT_RE_MAX_MR_SIZE_LOW);
                rc = -EINVAL;
                goto fail;
        }
-       if (umem->hugetlb && length > BNXT_RE_PAGE_SIZE_2M) {
-               page_shift = BNXT_RE_PAGE_SHIFT_2M;
-               dev_warn(rdev_to_dev(rdev), "umem hugetlb set page_size %x",
-                        1 << page_shift);
-       }
 
        /* Map umem buf ptrs to the PBL */
        umem_pgs = fill_umem_pbl_tbl(umem, pbl_tbl, page_shift);
@@ -3709,7 +3657,7 @@ int bnxt_re_alloc_ucontext(struct ib_ucontext *ctx, struct ib_udata *udata)
        resp.chip_id0 = chip_met_rev_num;
        /* Future extension of chip info */
        resp.chip_id1 = 0;
-       /*Temp, Use idr_alloc instead */
+       /*Temp, Use xa_alloc instead */
        resp.dev_id = rdev->en_dev->pdev->devfn;
        resp.max_qp = rdev->qplib_ctx.qpc_count;
        resp.pg_size = PAGE_SIZE;
index e45465e..09a3304 100644 (file)
@@ -63,15 +63,15 @@ struct bnxt_re_pd {
 };
 
 struct bnxt_re_ah {
-       struct bnxt_re_dev      *rdev;
        struct ib_ah            ib_ah;
+       struct bnxt_re_dev      *rdev;
        struct bnxt_qplib_ah    qplib_ah;
 };
 
 struct bnxt_re_srq {
+       struct ib_srq           ib_srq;
        struct bnxt_re_dev      *rdev;
        u32                     srq_limit;
-       struct ib_srq           ib_srq;
        struct bnxt_qplib_srq   qplib_srq;
        struct ib_umem          *umem;
        spinlock_t              lock;           /* protect srq */
@@ -142,8 +142,6 @@ struct bnxt_re_ucontext {
        spinlock_t              sh_lock;        /* protect shpg */
 };
 
-struct net_device *bnxt_re_get_netdev(struct ib_device *ibdev, u8 port_num);
-
 int bnxt_re_query_device(struct ib_device *ibdev,
                         struct ib_device_attr *ib_attr,
                         struct ib_udata *udata);
@@ -163,24 +161,21 @@ int bnxt_re_query_gid(struct ib_device *ibdev, u8 port_num,
                      int index, union ib_gid *gid);
 enum rdma_link_layer bnxt_re_get_link_layer(struct ib_device *ibdev,
                                            u8 port_num);
-int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                    struct ib_udata *udata);
-void bnxt_re_dealloc_pd(struct ib_pd *pd);
-struct ib_ah *bnxt_re_create_ah(struct ib_pd *pd,
-                               struct rdma_ah_attr *ah_attr,
-                               u32 flags,
-                               struct ib_udata *udata);
+int bnxt_re_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void bnxt_re_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+int bnxt_re_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+                     struct ib_udata *udata);
 int bnxt_re_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 int bnxt_re_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
-int bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
-struct ib_srq *bnxt_re_create_srq(struct ib_pd *pd,
-                                 struct ib_srq_init_attr *srq_init_attr,
-                                 struct ib_udata *udata);
+void bnxt_re_destroy_ah(struct ib_ah *ah, u32 flags);
+int bnxt_re_create_srq(struct ib_srq *srq,
+                      struct ib_srq_init_attr *srq_init_attr,
+                      struct ib_udata *udata);
 int bnxt_re_modify_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
                       enum ib_srq_attr_mask srq_attr_mask,
                       struct ib_udata *udata);
 int bnxt_re_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int bnxt_re_destroy_srq(struct ib_srq *srq);
+void bnxt_re_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int bnxt_re_post_srq_recv(struct ib_srq *srq, const struct ib_recv_wr *recv_wr,
                          const struct ib_recv_wr **bad_recv_wr);
 struct ib_qp *bnxt_re_create_qp(struct ib_pd *pd,
@@ -190,16 +185,15 @@ int bnxt_re_modify_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                      int qp_attr_mask, struct ib_udata *udata);
 int bnxt_re_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                     int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-int bnxt_re_destroy_qp(struct ib_qp *qp);
+int bnxt_re_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int bnxt_re_post_send(struct ib_qp *qp, const struct ib_send_wr *send_wr,
                      const struct ib_send_wr **bad_send_wr);
 int bnxt_re_post_recv(struct ib_qp *qp, const struct ib_recv_wr *recv_wr,
                      const struct ib_recv_wr **bad_recv_wr);
 struct ib_cq *bnxt_re_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata);
-int bnxt_re_destroy_cq(struct ib_cq *cq);
+int bnxt_re_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int bnxt_re_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
 int bnxt_re_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
@@ -207,8 +201,8 @@ struct ib_mr *bnxt_re_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
 int bnxt_re_map_mr_sg(struct ib_mr *ib_mr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset);
 struct ib_mr *bnxt_re_alloc_mr(struct ib_pd *ib_pd, enum ib_mr_type mr_type,
-                              u32 max_num_sg);
-int bnxt_re_dereg_mr(struct ib_mr *mr);
+                              u32 max_num_sg, struct ib_udata *udata);
+int bnxt_re_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *bnxt_re_alloc_mw(struct ib_pd *ib_pd, enum ib_mw_type type,
                               struct ib_udata *udata);
 int bnxt_re_dealloc_mw(struct ib_mw *mw);
index 2bd24ac..814f959 100644 (file)
@@ -617,7 +617,6 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
        .get_dma_mr = bnxt_re_get_dma_mr,
        .get_hw_stats = bnxt_re_ib_get_hw_stats,
        .get_link_layer = bnxt_re_get_link_layer,
-       .get_netdev = bnxt_re_get_netdev,
        .get_port_immutable = bnxt_re_get_port_immutable,
        .map_mr_sg = bnxt_re_map_mr_sg,
        .mmap = bnxt_re_mmap,
@@ -637,13 +636,16 @@ static const struct ib_device_ops bnxt_re_dev_ops = {
        .query_srq = bnxt_re_query_srq,
        .reg_user_mr = bnxt_re_reg_user_mr,
        .req_notify_cq = bnxt_re_req_notify_cq,
+       INIT_RDMA_OBJ_SIZE(ib_ah, bnxt_re_ah, ib_ah),
        INIT_RDMA_OBJ_SIZE(ib_pd, bnxt_re_pd, ib_pd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, bnxt_re_srq, ib_srq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, bnxt_re_ucontext, ib_uctx),
 };
 
 static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
 {
        struct ib_device *ibdev = &rdev->ibdev;
+       int ret;
 
        /* ib device init */
        ibdev->owner = THIS_MODULE;
@@ -691,6 +693,10 @@ static int bnxt_re_register_ib(struct bnxt_re_dev *rdev)
        rdma_set_device_sysfs_group(ibdev, &bnxt_re_dev_attr_group);
        ibdev->driver_id = RDMA_DRIVER_BNXT_RE;
        ib_set_device_ops(ibdev, &bnxt_re_dev_ops);
+       ret = ib_device_set_netdev(&rdev->ibdev, rdev->netdev, 1);
+       if (ret)
+               return ret;
+
        return ib_register_device(ibdev, "bnxt_re%d");
 }
 
index 71c34d5..958c1ff 100644 (file)
@@ -478,7 +478,7 @@ int bnxt_qplib_alloc_nq(struct pci_dev *pdev, struct bnxt_qplib_nq *nq)
            nq->hwq.max_elements > BNXT_QPLIB_NQE_MAX_CNT)
                nq->hwq.max_elements = BNXT_QPLIB_NQE_MAX_CNT;
        hwq_type = bnxt_qplib_get_hwq_type(nq->res);
-       if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL, 0,
+       if (bnxt_qplib_alloc_init_hwq(nq->pdev, &nq->hwq, NULL,
                                      &nq->hwq.max_elements,
                                      BNXT_QPLIB_MAX_NQE_ENTRY_SIZE, 0,
                                      PAGE_SIZE, hwq_type))
@@ -507,7 +507,7 @@ static void bnxt_qplib_arm_srq(struct bnxt_qplib_srq *srq, u32 arm_type)
        writeq(val, db);
 }
 
-int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
                           struct bnxt_qplib_srq *srq)
 {
        struct bnxt_qplib_rcfw *rcfw = res->rcfw;
@@ -521,14 +521,12 @@ int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
        /* Configure the request */
        req.srq_cid = cpu_to_le32(srq->id);
 
-       rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req,
-                                         (void *)&resp, NULL, 0);
+       rc = bnxt_qplib_rcfw_send_message(rcfw, (struct cmdq_base *)&req,
+                                         (struct creq_base *)&resp, NULL, 0);
+       kfree(srq->swq);
        if (rc)
-               return rc;
-
+               return;
        bnxt_qplib_free_hwq(res->pdev, &srq->hwq);
-       kfree(srq->swq);
-       return 0;
 }
 
 int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
@@ -542,8 +540,8 @@ int bnxt_qplib_create_srq(struct bnxt_qplib_res *res,
        int rc, idx;
 
        srq->hwq.max_elements = srq->max_wqe;
-       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, srq->sglist,
-                                      srq->nmap, &srq->hwq.max_elements,
+       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &srq->hwq, &srq->sg_info,
+                                      &srq->hwq.max_elements,
                                       BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_QUEUE);
        if (rc)
@@ -742,7 +740,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
 
        /* SQ */
        sq->hwq.max_elements = sq->max_wqe;
-       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, NULL,
                                       &sq->hwq.max_elements,
                                       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -781,7 +779,7 @@ int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
        /* RQ */
        if (rq->max_wqe) {
                rq->hwq.max_elements = qp->rq.max_wqe;
-               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL, 0,
+               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, NULL,
                                               &rq->hwq.max_elements,
                                               BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
                                               PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -890,8 +888,8 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
                         sizeof(struct sq_psn_search);
        }
        sq->hwq.max_elements = sq->max_wqe;
-       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, sq->sglist,
-                                      sq->nmap, &sq->hwq.max_elements,
+       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &sq->hwq, &sq->sg_info,
+                                      &sq->hwq.max_elements,
                                       BNXT_QPLIB_MAX_SQE_ENTRY_SIZE,
                                       psn_sz,
                                       PAGE_SIZE, HWQ_TYPE_QUEUE);
@@ -959,8 +957,9 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
        /* RQ */
        if (rq->max_wqe) {
                rq->hwq.max_elements = rq->max_wqe;
-               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq, rq->sglist,
-                                              rq->nmap, &rq->hwq.max_elements,
+               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &rq->hwq,
+                                              &rq->sg_info,
+                                              &rq->hwq.max_elements,
                                               BNXT_QPLIB_MAX_RQE_ENTRY_SIZE, 0,
                                               PAGE_SIZE, HWQ_TYPE_QUEUE);
                if (rc)
@@ -1030,7 +1029,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
                req_size = xrrq->max_elements *
                           BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE + PAGE_SIZE - 1;
                req_size &= ~(PAGE_SIZE - 1);
-               rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0,
+               rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
                                               &xrrq->max_elements,
                                               BNXT_QPLIB_MAX_ORRQE_ENTRY_SIZE,
                                               0, req_size, HWQ_TYPE_CTX);
@@ -1046,7 +1045,7 @@ int bnxt_qplib_create_qp(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp)
                           BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE + PAGE_SIZE - 1;
                req_size &= ~(PAGE_SIZE - 1);
 
-               rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL, 0,
+               rc = bnxt_qplib_alloc_init_hwq(res->pdev, xrrq, NULL,
                                               &xrrq->max_elements,
                                               BNXT_QPLIB_MAX_IRRQE_ENTRY_SIZE,
                                               0, req_size, HWQ_TYPE_CTX);
@@ -1935,8 +1934,8 @@ int bnxt_qplib_create_cq(struct bnxt_qplib_res *res, struct bnxt_qplib_cq *cq)
        int rc;
 
        cq->hwq.max_elements = cq->max_wqe;
-       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, cq->sghead,
-                                      cq->nmap, &cq->hwq.max_elements,
+       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &cq->hwq, &cq->sg_info,
+                                      &cq->hwq.max_elements,
                                       BNXT_QPLIB_MAX_CQE_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_QUEUE);
        if (rc)
index 3f618b5..99e0a13 100644 (file)
@@ -52,10 +52,9 @@ struct bnxt_qplib_srq {
        struct bnxt_qplib_cq            *cq;
        struct bnxt_qplib_hwq           hwq;
        struct bnxt_qplib_swq           *swq;
-       struct scatterlist              *sglist;
        int                             start_idx;
        int                             last_idx;
-       u32                             nmap;
+       struct bnxt_qplib_sg_info       sg_info;
        u16                             eventq_hw_ring_id;
        spinlock_t                      lock; /* protect SRQE link list */
 };
@@ -237,8 +236,7 @@ struct bnxt_qplib_swqe {
 struct bnxt_qplib_q {
        struct bnxt_qplib_hwq           hwq;
        struct bnxt_qplib_swq           *swq;
-       struct scatterlist              *sglist;
-       u32                             nmap;
+       struct bnxt_qplib_sg_info       sg_info;
        u32                             max_wqe;
        u16                             q_full_delta;
        u16                             max_sge;
@@ -381,8 +379,7 @@ struct bnxt_qplib_cq {
        u32                             cnq_hw_ring_id;
        struct bnxt_qplib_nq            *nq;
        bool                            resize_in_progress;
-       struct scatterlist              *sghead;
-       u32                             nmap;
+       struct bnxt_qplib_sg_info       sg_info;
        u64                             cq_handle;
 
 #define CQ_RESIZE_WAIT_TIME_MS         500
@@ -521,8 +518,8 @@ int bnxt_qplib_modify_srq(struct bnxt_qplib_res *res,
                          struct bnxt_qplib_srq *srq);
 int bnxt_qplib_query_srq(struct bnxt_qplib_res *res,
                         struct bnxt_qplib_srq *srq);
-int bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
-                          struct bnxt_qplib_srq *srq);
+void bnxt_qplib_destroy_srq(struct bnxt_qplib_res *res,
+                           struct bnxt_qplib_srq *srq);
 int bnxt_qplib_post_srq_recv(struct bnxt_qplib_srq *srq,
                             struct bnxt_qplib_swqe *wqe);
 int bnxt_qplib_create_qp1(struct bnxt_qplib_res *res, struct bnxt_qplib_qp *qp);
index c6461e9..48b04d2 100644 (file)
@@ -569,7 +569,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
        rcfw->pdev = pdev;
        rcfw->creq.max_elements = BNXT_QPLIB_CREQE_MAX_CNT;
        hwq_type = bnxt_qplib_get_hwq_type(rcfw->res);
-       if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL, 0,
+       if (bnxt_qplib_alloc_init_hwq(rcfw->pdev, &rcfw->creq, NULL,
                                      &rcfw->creq.max_elements,
                                      BNXT_QPLIB_CREQE_UNITS,
                                      0, PAGE_SIZE, hwq_type)) {
@@ -584,7 +584,7 @@ int bnxt_qplib_alloc_rcfw_channel(struct pci_dev *pdev,
 
        rcfw->cmdq.max_elements = rcfw->cmdq_depth;
        if (bnxt_qplib_alloc_init_hwq
-                       (rcfw->pdev, &rcfw->cmdq, NULL, 0,
+                       (rcfw->pdev, &rcfw->cmdq, NULL,
                         &rcfw->cmdq.max_elements,
                         BNXT_QPLIB_CMDQE_UNITS, 0,
                         bnxt_qplib_cmdqe_page_size(rcfw->cmdq_depth),
index 0bc24f9..37928b1 100644 (file)
@@ -83,7 +83,8 @@ static void __free_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
 }
 
 static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
-                      struct scatterlist *sghead, u32 pages, u32 pg_size)
+                      struct scatterlist *sghead, u32 pages,
+                      u32 nmaps, u32 pg_size)
 {
        struct sg_dma_page_iter sg_iter;
        bool is_umem = false;
@@ -116,7 +117,7 @@ static int __alloc_pbl(struct pci_dev *pdev, struct bnxt_qplib_pbl *pbl,
        } else {
                i = 0;
                is_umem = true;
-               for_each_sg_dma_page (sghead, &sg_iter, pages, 0) {
+               for_each_sg_dma_page(sghead, &sg_iter, nmaps, 0) {
                        pbl->pg_map_arr[i] = sg_page_iter_dma_address(&sg_iter);
                        pbl->pg_arr[i] = NULL;
                        pbl->pg_count++;
@@ -158,12 +159,13 @@ void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq)
 
 /* All HWQs are power of 2 in size */
 int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-                             struct scatterlist *sghead, int nmap,
+                             struct bnxt_qplib_sg_info *sg_info,
                              u32 *elements, u32 element_size, u32 aux,
                              u32 pg_size, enum bnxt_qplib_hwq_type hwq_type)
 {
-       u32 pages, slots, size, aux_pages = 0, aux_size = 0;
+       u32 pages, maps, slots, size, aux_pages = 0, aux_size = 0;
        dma_addr_t *src_phys_ptr, **dst_virt_ptr;
+       struct scatterlist *sghead = NULL;
        int i, rc;
 
        hwq->level = PBL_LVL_MAX;
@@ -177,6 +179,9 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
        }
        size = roundup_pow_of_two(element_size);
 
+       if (sg_info)
+               sghead = sg_info->sglist;
+
        if (!sghead) {
                hwq->is_user = false;
                pages = (slots * size) / pg_size + aux_pages;
@@ -184,17 +189,20 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
                        pages++;
                if (!pages)
                        return -EINVAL;
+               maps = 0;
        } else {
                hwq->is_user = true;
-               pages = nmap;
+               pages = sg_info->npages;
+               maps = sg_info->nmap;
        }
 
        /* Alloc the 1st memory block; can be a PDL/PTL/PBL */
        if (sghead && (pages == MAX_PBL_LVL_0_PGS))
                rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], sghead,
-                                pages, pg_size);
+                                pages, maps, pg_size);
        else
-               rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL, 1, pg_size);
+               rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_0], NULL,
+                                1, 0, pg_size);
        if (rc)
                goto fail;
 
@@ -204,7 +212,8 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
                if (pages > MAX_PBL_LVL_1_PGS) {
                        /* 2 levels of indirection */
                        rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], NULL,
-                                        MAX_PBL_LVL_1_PGS_FOR_LVL_2, pg_size);
+                                        MAX_PBL_LVL_1_PGS_FOR_LVL_2,
+                                        0, pg_size);
                        if (rc)
                                goto fail;
                        /* Fill in lvl0 PBL */
@@ -217,7 +226,7 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
                        hwq->level = PBL_LVL_1;
 
                        rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_2], sghead,
-                                        pages, pg_size);
+                                        pages, maps, pg_size);
                        if (rc)
                                goto fail;
 
@@ -246,7 +255,7 @@ int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
 
                        /* 1 level of indirection */
                        rc = __alloc_pbl(pdev, &hwq->pbl[PBL_LVL_1], sghead,
-                                        pages, pg_size);
+                                        pages, maps, pg_size);
                        if (rc)
                                goto fail;
                        /* Fill in lvl0 PBL */
@@ -339,7 +348,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* QPC Tables */
        ctx->qpc_tbl.max_elements = ctx->qpc_count;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->qpc_tbl, NULL,
                                       &ctx->qpc_tbl.max_elements,
                                       BNXT_QPLIB_MAX_QP_CTX_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -348,7 +357,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* MRW Tables */
        ctx->mrw_tbl.max_elements = ctx->mrw_count;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->mrw_tbl, NULL,
                                       &ctx->mrw_tbl.max_elements,
                                       BNXT_QPLIB_MAX_MRW_CTX_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -357,7 +366,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* SRQ Tables */
        ctx->srqc_tbl.max_elements = ctx->srqc_count;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->srqc_tbl, NULL,
                                       &ctx->srqc_tbl.max_elements,
                                       BNXT_QPLIB_MAX_SRQ_CTX_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -366,7 +375,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* CQ Tables */
        ctx->cq_tbl.max_elements = ctx->cq_count;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->cq_tbl, NULL,
                                       &ctx->cq_tbl.max_elements,
                                       BNXT_QPLIB_MAX_CQ_CTX_ENTRY_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_CTX);
@@ -375,7 +384,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* TQM Buffer */
        ctx->tqm_pde.max_elements = 512;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_pde, NULL,
                                       &ctx->tqm_pde.max_elements, sizeof(u64),
                                       0, PAGE_SIZE, HWQ_TYPE_CTX);
        if (rc)
@@ -386,7 +395,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
                        continue;
                ctx->tqm_tbl[i].max_elements = ctx->qpc_count *
                                               ctx->tqm_count[i];
-               rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL, 0,
+               rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tqm_tbl[i], NULL,
                                               &ctx->tqm_tbl[i].max_elements, 1,
                                               0, PAGE_SIZE, HWQ_TYPE_CTX);
                if (rc)
@@ -424,7 +433,7 @@ int bnxt_qplib_alloc_ctx(struct pci_dev *pdev,
 
        /* TIM Buffer */
        ctx->tim_tbl.max_elements = ctx->qpc_count * 16;
-       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(pdev, &ctx->tim_tbl, NULL,
                                       &ctx->tim_tbl.max_elements, 1,
                                       0, PAGE_SIZE, HWQ_TYPE_CTX);
        if (rc)
index 32cebd0..30c42c9 100644 (file)
@@ -219,6 +219,12 @@ static inline u8 bnxt_qplib_get_ring_type(struct bnxt_qplib_chip_ctx *cctx)
               RING_ALLOC_REQ_RING_TYPE_ROCE_CMPL;
 }
 
+struct bnxt_qplib_sg_info {
+       struct scatterlist              *sglist;
+       u32                             nmap;
+       u32                             npages;
+};
+
 #define to_bnxt_qplib(ptr, type, member)       \
        container_of(ptr, type, member)
 
@@ -227,7 +233,7 @@ struct bnxt_qplib_dev_attr;
 
 void bnxt_qplib_free_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq);
 int bnxt_qplib_alloc_init_hwq(struct pci_dev *pdev, struct bnxt_qplib_hwq *hwq,
-                             struct scatterlist *sl, int nmap, u32 *elements,
+                             struct bnxt_qplib_sg_info *sg_info, u32 *elements,
                              u32 elements_per_page, u32 aux, u32 pg_size,
                              enum bnxt_qplib_hwq_type hwq_type);
 void bnxt_qplib_get_guid(u8 *dev_addr, u8 *guid);
index e9c53e4..48793d3 100644 (file)
@@ -532,25 +532,21 @@ int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
        return 0;
 }
 
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
-                         bool block)
+void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+                          bool block)
 {
        struct bnxt_qplib_rcfw *rcfw = res->rcfw;
        struct cmdq_destroy_ah req;
        struct creq_destroy_ah_resp resp;
        u16 cmd_flags = 0;
-       int rc;
 
        /* Clean up the AH table in the device */
        RCFW_CMD_PREP(req, DESTROY_AH, cmd_flags);
 
        req.ah_cid = cpu_to_le32(ah->id);
 
-       rc = bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp,
-                                         NULL, block);
-       if (rc)
-               return rc;
-       return 0;
+       bnxt_qplib_rcfw_send_message(rcfw, (void *)&req, (void *)&resp, NULL,
+                                    block);
 }
 
 /* MRW */
@@ -684,7 +680,7 @@ int bnxt_qplib_reg_mr(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mr,
 
                mr->hwq.max_elements = pages;
                /* Use system PAGE_SIZE */
-               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL, 0,
+               rc = bnxt_qplib_alloc_init_hwq(res->pdev, &mr->hwq, NULL,
                                               &mr->hwq.max_elements,
                                               PAGE_SIZE, 0, PAGE_SIZE,
                                               HWQ_TYPE_CTX);
@@ -754,7 +750,7 @@ int bnxt_qplib_alloc_fast_reg_page_list(struct bnxt_qplib_res *res,
                return -ENOMEM;
 
        frpl->hwq.max_elements = pages;
-       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL, 0,
+       rc = bnxt_qplib_alloc_init_hwq(res->pdev, &frpl->hwq, NULL,
                                       &frpl->hwq.max_elements, PAGE_SIZE, 0,
                                       PAGE_SIZE, HWQ_TYPE_CTX);
        if (!rc)
index 39454b3..0ec3b12 100644 (file)
@@ -243,8 +243,8 @@ int bnxt_qplib_set_func_resources(struct bnxt_qplib_res *res,
                                  struct bnxt_qplib_ctx *ctx);
 int bnxt_qplib_create_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
                         bool block);
-int bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
-                         bool block);
+void bnxt_qplib_destroy_ah(struct bnxt_qplib_res *res, struct bnxt_qplib_ah *ah,
+                          bool block);
 int bnxt_qplib_alloc_mrw(struct bnxt_qplib_res *res,
                         struct bnxt_qplib_mrw *mrw);
 int bnxt_qplib_dereg_mrw(struct bnxt_qplib_res *res, struct bnxt_qplib_mrw *mrw,
index 83d2e19..53aa5c3 100644 (file)
@@ -64,7 +64,7 @@ enum t3_wr_flags {
        T3_SOLICITED_EVENT_FLAG = 0x04,
        T3_READ_FENCE_FLAG = 0x08,
        T3_LOCAL_FENCE_FLAG = 0x10
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_wr_opcode {
        T3_WR_BP = FW_WROPCODE_RI_BYPASS,
@@ -77,7 +77,7 @@ enum t3_wr_opcode {
        T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
        T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
        T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_rdma_opcode {
        T3_RDMA_WRITE,          /* IETF RDMAP v1.0 ... */
@@ -95,7 +95,7 @@ enum t3_rdma_opcode {
        T3_QP_MOD,
        T3_BYPASS,
        T3_RDMA_READ_REQ_WITH_INV,
-} __attribute__ ((packed));
+} __packed;
 
 static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
 {
@@ -306,7 +306,7 @@ enum t3_mpa_attrs {
        uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
        uP_RI_MPA_CRC_ENABLE = 0x4,
        uP_RI_MPA_IETF_ENABLE = 0x8
-} __attribute__ ((packed));
+} __packed;
 
 enum t3_qp_caps {
        uP_RI_QP_RDMA_READ_ENABLE = 0x01,
@@ -314,7 +314,7 @@ enum t3_qp_caps {
        uP_RI_QP_BIND_ENABLE = 0x04,
        uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
        uP_RI_QP_STAG0_ENABLE = 0x10
-} __attribute__ ((packed));
+} __packed;
 
 enum rdma_init_rtr_types {
        RTR_READ = 1,
index fb03bc4..56a8ab6 100644 (file)
@@ -62,37 +62,30 @@ struct cxgb3_client t3c_client = {
 static LIST_HEAD(dev_list);
 static DEFINE_MUTEX(dev_mutex);
 
-static int disable_qp_db(int id, void *p, void *data)
-{
-       struct iwch_qp *qhp = p;
-
-       cxio_disable_wq_db(&qhp->wq);
-       return 0;
-}
-
-static int enable_qp_db(int id, void *p, void *data)
-{
-       struct iwch_qp *qhp = p;
-
-       if (data)
-               ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
-       cxio_enable_wq_db(&qhp->wq);
-       return 0;
-}
-
 static void disable_dbs(struct iwch_dev *rnicp)
 {
-       spin_lock_irq(&rnicp->lock);
-       idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
-       spin_unlock_irq(&rnicp->lock);
+       unsigned long index;
+       struct iwch_qp *qhp;
+
+       xa_lock_irq(&rnicp->qps);
+       xa_for_each(&rnicp->qps, index, qhp)
+               cxio_disable_wq_db(&qhp->wq);
+       xa_unlock_irq(&rnicp->qps);
 }
 
 static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
 {
-       spin_lock_irq(&rnicp->lock);
-       idr_for_each(&rnicp->qpidr, enable_qp_db,
-                    (void *)(unsigned long)ring_db);
-       spin_unlock_irq(&rnicp->lock);
+       unsigned long index;
+       struct iwch_qp *qhp;
+
+       xa_lock_irq(&rnicp->qps);
+       xa_for_each(&rnicp->qps, index, qhp) {
+               if (ring_db)
+                       ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell,
+                                       qhp->wq.qpid);
+               cxio_enable_wq_db(&qhp->wq);
+       }
+       xa_unlock_irq(&rnicp->qps);
 }
 
 static void iwch_db_drop_task(struct work_struct *work)
@@ -105,10 +98,9 @@ static void iwch_db_drop_task(struct work_struct *work)
 static void rnic_init(struct iwch_dev *rnicp)
 {
        pr_debug("%s iwch_dev %p\n", __func__,  rnicp);
-       idr_init(&rnicp->cqidr);
-       idr_init(&rnicp->qpidr);
-       idr_init(&rnicp->mmidr);
-       spin_lock_init(&rnicp->lock);
+       xa_init_flags(&rnicp->cqs, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&rnicp->qps, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&rnicp->mrs, XA_FLAGS_LOCK_IRQ);
        INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
 
        rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
@@ -190,9 +182,9 @@ static void close_rnic_dev(struct t3cdev *tdev)
                        list_del(&dev->entry);
                        iwch_unregister_device(dev);
                        cxio_rdev_close(&dev->rdev);
-                       idr_destroy(&dev->cqidr);
-                       idr_destroy(&dev->qpidr);
-                       idr_destroy(&dev->mmidr);
+                       WARN_ON(!xa_empty(&dev->cqs));
+                       WARN_ON(!xa_empty(&dev->qps));
+                       WARN_ON(!xa_empty(&dev->mrs));
                        ib_dealloc_device(&dev->ibdev);
                        break;
                }
index c69bc4f..310a937 100644 (file)
@@ -35,7 +35,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/workqueue.h>
 
 #include <rdma/ib_verbs.h>
@@ -106,10 +106,9 @@ struct iwch_dev {
        struct cxio_rdev rdev;
        u32 device_cap_flags;
        struct iwch_rnic_attributes attr;
-       struct idr cqidr;
-       struct idr qpidr;
-       struct idr mmidr;
-       spinlock_t lock;
+       struct xarray cqs;
+       struct xarray qps;
+       struct xarray mrs;
        struct list_head entry;
        struct delayed_work db_drop_task;
 };
@@ -136,40 +135,17 @@ static inline int t3a_device(const struct iwch_dev *rhp)
 
 static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
 {
-       return idr_find(&rhp->cqidr, cqid);
+       return xa_load(&rhp->cqs, cqid);
 }
 
 static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
 {
-       return idr_find(&rhp->qpidr, qpid);
+       return xa_load(&rhp->qps, qpid);
 }
 
 static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
 {
-       return idr_find(&rhp->mmidr, mmid);
-}
-
-static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
-                               void *handle, u32 id)
-{
-       int ret;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irq(&rhp->lock);
-
-       ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT);
-
-       spin_unlock_irq(&rhp->lock);
-       idr_preload_end();
-
-       return ret < 0 ? ret : 0;
-}
-
-static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
-{
-       spin_lock_irq(&rhp->lock);
-       idr_remove(idr, id);
-       spin_unlock_irq(&rhp->lock);
+       return xa_load(&rhp->mrs, mmid);
 }
 
 extern struct cxgb3_client t3c_client;
index 4a0c82a..9d356c1 100644 (file)
@@ -48,14 +48,14 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
        struct iwch_qp *qhp;
        unsigned long flag;
 
-       spin_lock(&rnicp->lock);
-       qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+       xa_lock(&rnicp->qps);
+       qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
 
        if (!qhp) {
                pr_err("%s unaffiliated error 0x%x qpid 0x%x\n",
                       __func__, CQE_STATUS(rsp_msg->cqe),
                       CQE_QPID(rsp_msg->cqe));
-               spin_unlock(&rnicp->lock);
+               xa_unlock(&rnicp->qps);
                return;
        }
 
@@ -65,7 +65,7 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
                         __func__,
                         qhp->attr.state, qhp->wq.qpid,
                         CQE_STATUS(rsp_msg->cqe));
-               spin_unlock(&rnicp->lock);
+               xa_unlock(&rnicp->qps);
                return;
        }
 
@@ -76,7 +76,7 @@ static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
               CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
 
        atomic_inc(&qhp->refcnt);
-       spin_unlock(&rnicp->lock);
+       xa_unlock(&rnicp->qps);
 
        if (qhp->attr.state == IWCH_QP_STATE_RTS) {
                attrs.next_state = IWCH_QP_STATE_TERMINATE;
@@ -114,21 +114,21 @@ void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
        unsigned long flag;
 
        rnicp = (struct iwch_dev *) rdev_p->ulp;
-       spin_lock(&rnicp->lock);
+       xa_lock(&rnicp->qps);
        chp = get_chp(rnicp, cqid);
-       qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+       qhp = xa_load(&rnicp->qps, CQE_QPID(rsp_msg->cqe));
        if (!chp || !qhp) {
                pr_err("BAD AE cqid 0x%x qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
                       cqid, CQE_QPID(rsp_msg->cqe),
                       CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
                       CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
                       CQE_WRID_LOW(rsp_msg->cqe));
-               spin_unlock(&rnicp->lock);
+               xa_unlock(&rnicp->qps);
                goto out;
        }
        iwch_qp_add_ref(&qhp->ibqp);
        atomic_inc(&chp->refcnt);
-       spin_unlock(&rnicp->lock);
+       xa_unlock(&rnicp->qps);
 
        /*
         * 1) completion of our sending a TERMINATE.
index 12886b1..ce0f274 100644 (file)
@@ -49,7 +49,7 @@ static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
        mmid = stag >> 8;
        mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
        pr_debug("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
-       return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+       return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
 }
 
 int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
index 4accf7b..3a481df 100644 (file)
@@ -88,14 +88,14 @@ static int iwch_alloc_ucontext(struct ib_ucontext *ucontext,
        return 0;
 }
 
-static int iwch_destroy_cq(struct ib_cq *ib_cq)
+static int iwch_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct iwch_cq *chp;
 
        pr_debug("%s ib_cq %p\n", __func__, ib_cq);
        chp = to_iwch_cq(ib_cq);
 
-       remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+       xa_erase_irq(&chp->rhp->cqs, chp->cq.cqid);
        atomic_dec(&chp->refcnt);
        wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
@@ -106,7 +106,6 @@ static int iwch_destroy_cq(struct ib_cq *ib_cq)
 
 static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
                                    const struct ib_cq_init_attr *attr,
-                                   struct ib_ucontext *ib_context,
                                    struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -114,7 +113,6 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        struct iwch_cq *chp;
        struct iwch_create_cq_resp uresp;
        struct iwch_create_cq_req ureq;
-       struct iwch_ucontext *ucontext = NULL;
        static int warned;
        size_t resplen;
 
@@ -127,8 +125,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        if (!chp)
                return ERR_PTR(-ENOMEM);
 
-       if (ib_context) {
-               ucontext = to_iwch_ucontext(ib_context);
+       if (udata) {
                if (!t3a_device(rhp)) {
                        if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
                                kfree(chp);
@@ -154,7 +151,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        entries = roundup_pow_of_two(entries);
        chp->cq.size_log2 = ilog2(entries);
 
-       if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
+       if (cxio_create_cq(&rhp->rdev, &chp->cq, !udata)) {
                kfree(chp);
                return ERR_PTR(-ENOMEM);
        }
@@ -164,18 +161,20 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
        spin_lock_init(&chp->comp_handler_lock);
        atomic_set(&chp->refcnt, 1);
        init_waitqueue_head(&chp->wait);
-       if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) {
+       if (xa_store_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL)) {
                cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
                kfree(chp);
                return ERR_PTR(-ENOMEM);
        }
 
-       if (ucontext) {
+       if (udata) {
                struct iwch_mm_entry *mm;
+               struct iwch_ucontext *ucontext = rdma_udata_to_drv_context(
+                       udata, struct iwch_ucontext, ibucontext);
 
                mm = kmalloc(sizeof *mm, GFP_KERNEL);
                if (!mm) {
-                       iwch_destroy_cq(&chp->ibcq);
+                       iwch_destroy_cq(&chp->ibcq, udata);
                        return ERR_PTR(-ENOMEM);
                }
                uresp.cqid = chp->cq.cqid;
@@ -201,7 +200,7 @@ static struct ib_cq *iwch_create_cq(struct ib_device *ibdev,
                }
                if (ib_copy_to_udata(udata, &uresp, resplen)) {
                        kfree(mm);
-                       iwch_destroy_cq(&chp->ibcq);
+                       iwch_destroy_cq(&chp->ibcq, udata);
                        return ERR_PTR(-EFAULT);
                }
                insert_mmap(ucontext, mm);
@@ -367,7 +366,7 @@ static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
        return ret;
 }
 
-static void iwch_deallocate_pd(struct ib_pd *pd)
+static void iwch_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct iwch_dev *rhp;
        struct iwch_pd *php;
@@ -378,8 +377,7 @@ static void iwch_deallocate_pd(struct ib_pd *pd)
        cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
 }
 
-static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                           struct ib_udata *udata)
+static int iwch_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct iwch_pd *php = to_iwch_pd(pd);
        struct ib_device *ibdev = pd->device;
@@ -394,11 +392,11 @@ static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
 
        php->pdid = pdid;
        php->rhp = rhp;
-       if (context) {
+       if (udata) {
                struct iwch_alloc_pd_resp resp = {.pdid = php->pdid};
 
                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
-                       iwch_deallocate_pd(&php->ibpd);
+                       iwch_deallocate_pd(&php->ibpd, udata);
                        return -EFAULT;
                }
        }
@@ -406,7 +404,7 @@ static int iwch_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
        return 0;
 }
 
-static int iwch_dereg_mr(struct ib_mr *ib_mr)
+static int iwch_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct iwch_dev *rhp;
        struct iwch_mr *mhp;
@@ -421,7 +419,7 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr)
        cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
                       mhp->attr.pbl_addr);
        iwch_free_pbl(mhp);
-       remove_handle(rhp, &rhp->mmidr, mmid);
+       xa_erase_irq(&rhp->mrs, mmid);
        if (mhp->kva)
                kfree((void *) (unsigned long) mhp->kva);
        if (mhp->umem)
@@ -539,7 +537,7 @@ static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        shift = PAGE_SHIFT;
 
-       n = mhp->umem->nmap;
+       n = ib_umem_num_pages(mhp->umem);
 
        err = iwch_alloc_pbl(mhp, n);
        if (err)
@@ -590,7 +588,7 @@ pbl_done:
                         uresp.pbl_addr);
 
                if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
-                       iwch_dereg_mr(&mhp->ibmr);
+                       iwch_dereg_mr(&mhp->ibmr, udata);
                        err = -EFAULT;
                        goto err;
                }
@@ -636,7 +634,7 @@ static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
        mhp->attr.stag = stag;
        mmid = (stag) >> 8;
        mhp->ibmw.rkey = stag;
-       if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+       if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
                cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
                kfree(mhp);
                return ERR_PTR(-ENOMEM);
@@ -655,15 +653,14 @@ static int iwch_dealloc_mw(struct ib_mw *mw)
        rhp = mhp->rhp;
        mmid = (mw->rkey) >> 8;
        cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
-       remove_handle(rhp, &rhp->mmidr, mmid);
+       xa_erase_irq(&rhp->mrs, mmid);
        pr_debug("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
        kfree(mhp);
        return 0;
 }
 
-static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
-                                  enum ib_mr_type mr_type,
-                                  u32 max_num_sg)
+static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                                  u32 max_num_sg, struct ib_udata *udata)
 {
        struct iwch_dev *rhp;
        struct iwch_pd *php;
@@ -701,7 +698,7 @@ static struct ib_mr *iwch_alloc_mr(struct ib_pd *pd,
        mhp->attr.state = 1;
        mmid = (stag) >> 8;
        mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-       ret = insert_handle(rhp, &rhp->mmidr, mhp, mmid);
+       ret = xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL);
        if (ret)
                goto err3;
 
@@ -742,7 +739,7 @@ static int iwch_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
        return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, iwch_set_page);
 }
 
-static int iwch_destroy_qp(struct ib_qp *ib_qp)
+static int iwch_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
        struct iwch_dev *rhp;
        struct iwch_qp *qhp;
@@ -756,13 +753,13 @@ static int iwch_destroy_qp(struct ib_qp *ib_qp)
        iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
        wait_event(qhp->wait, !qhp->ep);
 
-       remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+       xa_erase_irq(&rhp->qps, qhp->wq.qpid);
 
        atomic_dec(&qhp->refcnt);
        wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
 
-       ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context)
-                                 : NULL;
+       ucontext = rdma_udata_to_drv_context(udata, struct iwch_ucontext,
+                                            ibucontext);
        cxio_destroy_qp(&rhp->rdev, &qhp->wq,
                        ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
 
@@ -872,7 +869,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
        init_waitqueue_head(&qhp->wait);
        atomic_set(&qhp->refcnt, 1);
 
-       if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) {
+       if (xa_store_irq(&rhp->qps, qhp->wq.qpid, qhp, GFP_KERNEL)) {
                cxio_destroy_qp(&rhp->rdev, &qhp->wq,
                        ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
                kfree(qhp);
@@ -885,14 +882,14 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
 
                mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
                if (!mm1) {
-                       iwch_destroy_qp(&qhp->ibqp);
+                       iwch_destroy_qp(&qhp->ibqp, udata);
                        return ERR_PTR(-ENOMEM);
                }
 
                mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
                if (!mm2) {
                        kfree(mm1);
-                       iwch_destroy_qp(&qhp->ibqp);
+                       iwch_destroy_qp(&qhp->ibqp, udata);
                        return ERR_PTR(-ENOMEM);
                }
 
@@ -909,7 +906,7 @@ static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
                if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
                        kfree(mm1);
                        kfree(mm2);
-                       iwch_destroy_qp(&qhp->ibqp);
+                       iwch_destroy_qp(&qhp->ibqp, udata);
                        return ERR_PTR(-EFAULT);
                }
                mm1->key = uresp.key;
@@ -1324,6 +1321,14 @@ static const struct ib_device_ops iwch_dev_ops = {
        .get_dma_mr = iwch_get_dma_mr,
        .get_hw_stats = iwch_get_mib,
        .get_port_immutable = iwch_port_immutable,
+       .iw_accept = iwch_accept_cr,
+       .iw_add_ref = iwch_qp_add_ref,
+       .iw_connect = iwch_connect,
+       .iw_create_listen = iwch_create_listen,
+       .iw_destroy_listen = iwch_destroy_listen,
+       .iw_get_qp = iwch_get_qp,
+       .iw_reject = iwch_reject_cr,
+       .iw_rem_ref = iwch_qp_rem_ref,
        .map_mr_sg = iwch_map_mr_sg,
        .mmap = iwch_mmap,
        .modify_qp = iwch_ib_modify_qp,
@@ -1343,8 +1348,6 @@ static const struct ib_device_ops iwch_dev_ops = {
 
 int iwch_register_device(struct iwch_dev *dev)
 {
-       int ret;
-
        pr_debug("%s iwch_dev %p\n", __func__, dev);
        memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
        memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
@@ -1382,34 +1385,18 @@ int iwch_register_device(struct iwch_dev *dev)
        dev->ibdev.dev.parent = &dev->rdev.rnic_info.pdev->dev;
        dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
 
-       dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
-       if (!dev->ibdev.iwcm)
-               return -ENOMEM;
-
-       dev->ibdev.iwcm->connect = iwch_connect;
-       dev->ibdev.iwcm->accept = iwch_accept_cr;
-       dev->ibdev.iwcm->reject = iwch_reject_cr;
-       dev->ibdev.iwcm->create_listen = iwch_create_listen;
-       dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen;
-       dev->ibdev.iwcm->add_ref = iwch_qp_add_ref;
-       dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref;
-       dev->ibdev.iwcm->get_qp = iwch_get_qp;
-       memcpy(dev->ibdev.iwcm->ifname, dev->rdev.t3cdev_p->lldev->name,
-              sizeof(dev->ibdev.iwcm->ifname));
+       memcpy(dev->ibdev.iw_ifname, dev->rdev.t3cdev_p->lldev->name,
+              sizeof(dev->ibdev.iw_ifname));
 
        dev->ibdev.driver_id = RDMA_DRIVER_CXGB3;
        rdma_set_device_sysfs_group(&dev->ibdev, &iwch_attr_group);
        ib_set_device_ops(&dev->ibdev, &iwch_dev_ops);
-       ret = ib_register_device(&dev->ibdev, "cxgb3_%d");
-       if (ret)
-               kfree(dev->ibdev.iwcm);
-       return ret;
+       return ib_register_device(&dev->ibdev, "cxgb3_%d");
 }
 
 void iwch_unregister_device(struct iwch_dev *dev)
 {
        pr_debug("%s iwch_dev %p\n", __func__, dev);
        ib_unregister_device(&dev->ibdev);
-       kfree(dev->ibdev.iwcm);
        return;
 }
index 4d232bd..0f3b119 100644 (file)
@@ -331,20 +331,23 @@ static void remove_ep_tid(struct c4iw_ep *ep)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&ep->com.dev->lock, flags);
-       _remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid, 0);
-       if (idr_is_empty(&ep->com.dev->hwtid_idr))
+       xa_lock_irqsave(&ep->com.dev->hwtids, flags);
+       __xa_erase(&ep->com.dev->hwtids, ep->hwtid);
+       if (xa_empty(&ep->com.dev->hwtids))
                wake_up(&ep->com.dev->wait);
-       spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+       xa_unlock_irqrestore(&ep->com.dev->hwtids, flags);
 }
 
-static void insert_ep_tid(struct c4iw_ep *ep)
+static int insert_ep_tid(struct c4iw_ep *ep)
 {
        unsigned long flags;
+       int err;
+
+       xa_lock_irqsave(&ep->com.dev->hwtids, flags);
+       err = __xa_insert(&ep->com.dev->hwtids, ep->hwtid, ep, GFP_KERNEL);
+       xa_unlock_irqrestore(&ep->com.dev->hwtids, flags);
 
-       spin_lock_irqsave(&ep->com.dev->lock, flags);
-       _insert_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep, ep->hwtid, 0);
-       spin_unlock_irqrestore(&ep->com.dev->lock, flags);
+       return err;
 }
 
 /*
@@ -355,11 +358,11 @@ static struct c4iw_ep *get_ep_from_tid(struct c4iw_dev *dev, unsigned int tid)
        struct c4iw_ep *ep;
        unsigned long flags;
 
-       spin_lock_irqsave(&dev->lock, flags);
-       ep = idr_find(&dev->hwtid_idr, tid);
+       xa_lock_irqsave(&dev->hwtids, flags);
+       ep = xa_load(&dev->hwtids, tid);
        if (ep)
                c4iw_get_ep(&ep->com);
-       spin_unlock_irqrestore(&dev->lock, flags);
+       xa_unlock_irqrestore(&dev->hwtids, flags);
        return ep;
 }
 
@@ -372,11 +375,11 @@ static struct c4iw_listen_ep *get_ep_from_stid(struct c4iw_dev *dev,
        struct c4iw_listen_ep *ep;
        unsigned long flags;
 
-       spin_lock_irqsave(&dev->lock, flags);
-       ep = idr_find(&dev->stid_idr, stid);
+       xa_lock_irqsave(&dev->stids, flags);
+       ep = xa_load(&dev->stids, stid);
        if (ep)
                c4iw_get_ep(&ep->com);
-       spin_unlock_irqrestore(&dev->lock, flags);
+       xa_unlock_irqrestore(&dev->stids, flags);
        return ep;
 }
 
@@ -457,6 +460,8 @@ static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
                skb_reset_transport_header(skb);
        } else {
                skb = alloc_skb(len, gfp);
+               if (!skb)
+                       return NULL;
        }
        t4_set_arp_err_handler(skb, NULL, NULL);
        return skb;
@@ -555,7 +560,7 @@ static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
                cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
                                   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        }
-       remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+       xa_erase_irq(&ep->com.dev->atids, ep->atid);
        cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
        queue_arp_failure_cpl(ep, skb, FAKE_CPL_PUT_EP_SAFE);
 }
@@ -1235,7 +1240,7 @@ static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
        set_emss(ep, tcp_opt);
 
        /* dealloc the atid */
-       remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+       xa_erase_irq(&ep->com.dev->atids, atid);
        cxgb4_free_atid(t, atid);
        set_bit(ACT_ESTAB, &ep->com.history);
 
@@ -2184,7 +2189,9 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
                err = -ENOMEM;
                goto fail2;
        }
-       insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid);
+       err = xa_insert_irq(&ep->com.dev->atids, ep->atid, ep, GFP_KERNEL);
+       if (err)
+               goto fail2a;
 
        /* find a route */
        if (ep->com.cm_id->m_local_addr.ss_family == AF_INET) {
@@ -2236,7 +2243,8 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
 fail4:
        dst_release(ep->dst);
 fail3:
-       remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+       xa_erase_irq(&ep->com.dev->atids, ep->atid);
+fail2a:
        cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 fail2:
        /*
@@ -2319,8 +2327,7 @@ static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
                                                (const u32 *)
                                                &sin6->sin6_addr.s6_addr, 1);
                        }
-                       remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
-                                       atid);
+                       xa_erase_irq(&ep->com.dev->atids, atid);
                        cxgb4_free_atid(t, atid);
                        dst_release(ep->dst);
                        cxgb4_l2t_release(ep->l2t);
@@ -2357,7 +2364,7 @@ fail:
                cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl),
                                 ep->com.local_addr.ss_family);
 
-       remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+       xa_erase_irq(&ep->com.dev->atids, atid);
        cxgb4_free_atid(t, atid);
        dst_release(ep->dst);
        cxgb4_l2t_release(ep->l2t);
@@ -2947,7 +2954,7 @@ out:
                                        (const u32 *)&sin6->sin6_addr.s6_addr,
                                        1);
                }
-               remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+               xa_erase_irq(&ep->com.dev->hwtids, ep->hwtid);
                cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid,
                                 ep->com.local_addr.ss_family);
                dst_release(ep->dst);
@@ -3342,7 +3349,9 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
                err = -ENOMEM;
                goto fail2;
        }
-       insert_handle(dev, &dev->atid_idr, ep, ep->atid);
+       err = xa_insert_irq(&dev->atids, ep->atid, ep, GFP_KERNEL);
+       if (err)
+               goto fail5;
 
        memcpy(&ep->com.local_addr, &cm_id->m_local_addr,
               sizeof(ep->com.local_addr));
@@ -3430,7 +3439,8 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 fail4:
        dst_release(ep->dst);
 fail3:
-       remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+       xa_erase_irq(&ep->com.dev->atids, ep->atid);
+fail5:
        cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
 fail2:
        skb_queue_purge(&ep->com.ep_skb_list);
@@ -3553,7 +3563,9 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
                err = -ENOMEM;
                goto fail2;
        }
-       insert_handle(dev, &dev->stid_idr, ep, ep->stid);
+       err = xa_insert_irq(&dev->stids, ep->stid, ep, GFP_KERNEL);
+       if (err)
+               goto fail3;
 
        state_set(&ep->com, LISTEN);
        if (ep->com.local_addr.ss_family == AF_INET)
@@ -3564,7 +3576,8 @@ int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
                cm_id->provider_data = ep;
                goto out;
        }
-       remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+       xa_erase_irq(&ep->com.dev->stids, ep->stid);
+fail3:
        cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
                        ep->com.local_addr.ss_family);
 fail2:
@@ -3603,7 +3616,7 @@ int c4iw_destroy_listen(struct iw_cm_id *cm_id)
                cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
                                   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        }
-       remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+       xa_erase_irq(&ep->com.dev->stids, ep->stid);
        cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
                        ep->com.local_addr.ss_family);
 done:
@@ -3763,7 +3776,7 @@ static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
                cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0],
                                   (const u32 *)&sin6->sin6_addr.s6_addr, 1);
        }
-       remove_handle(dev, &dev->atid_idr, atid);
+       xa_erase_irq(&dev->atids, atid);
        cxgb4_free_atid(dev->rdev.lldi.tids, atid);
        dst_release(ep->dst);
        cxgb4_l2t_release(ep->l2t);
index 1fd8798..52ce586 100644 (file)
@@ -30,6 +30,8 @@
  * SOFTWARE.
  */
 
+#include <rdma/uverbs_ioctl.h>
+
 #include "iw_cxgb4.h"
 
 static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
@@ -968,7 +970,7 @@ int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
        return !err || err == -ENODATA ? npolled : err;
 }
 
-int c4iw_destroy_cq(struct ib_cq *ib_cq)
+int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct c4iw_cq *chp;
        struct c4iw_ucontext *ucontext;
@@ -976,12 +978,12 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq)
        pr_debug("ib_cq %p\n", ib_cq);
        chp = to_c4iw_cq(ib_cq);
 
-       remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+       xa_erase_irq(&chp->rhp->cqs, chp->cq.cqid);
        atomic_dec(&chp->refcnt);
        wait_event(chp->wait, !atomic_read(&chp->refcnt));
 
-       ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
-                                 : NULL;
+       ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+                                            ibucontext);
        destroy_cq(&chp->rhp->rdev, &chp->cq,
                   ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx,
                   chp->destroy_skb, chp->wr_waitp);
@@ -992,7 +994,6 @@ int c4iw_destroy_cq(struct ib_cq *ib_cq)
 
 struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
                             const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *ib_context,
                             struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -1001,10 +1002,11 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        struct c4iw_cq *chp;
        struct c4iw_create_cq ucmd;
        struct c4iw_create_cq_resp uresp;
-       struct c4iw_ucontext *ucontext = NULL;
        int ret, wr_len;
        size_t memsize, hwentries;
        struct c4iw_mm_entry *mm, *mm2;
+       struct c4iw_ucontext *ucontext = rdma_udata_to_drv_context(
+               udata, struct c4iw_ucontext, ibucontext);
 
        pr_debug("ib_dev %p entries %d\n", ibdev, entries);
        if (attr->flags)
@@ -1015,8 +1017,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        if (vector >= rhp->rdev.lldi.nciq)
                return ERR_PTR(-EINVAL);
 
-       if (ib_context) {
-               ucontext = to_c4iw_ucontext(ib_context);
+       if (udata) {
                if (udata->inlen < sizeof(ucmd))
                        ucontext->is_32b_cqe = 1;
        }
@@ -1068,7 +1069,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        /*
         * memsize must be a multiple of the page size if its a user cq.
         */
-       if (ucontext)
+       if (udata)
                memsize = roundup(memsize, PAGE_SIZE);
 
        chp->cq.size = hwentries;
@@ -1088,7 +1089,7 @@ struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
        spin_lock_init(&chp->comp_handler_lock);
        atomic_set(&chp->refcnt, 1);
        init_waitqueue_head(&chp->wait);
-       ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+       ret = xa_insert_irq(&rhp->cqs, chp->cq.cqid, chp, GFP_KERNEL);
        if (ret)
                goto err_destroy_cq;
 
@@ -1143,7 +1144,7 @@ err_free_mm2:
 err_free_mm:
        kfree(mm);
 err_remove_handle:
-       remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
+       xa_erase_irq(&rhp->cqs, chp->cq.cqid);
 err_destroy_cq:
        destroy_cq(&chp->rhp->rdev, &chp->cq,
                   ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
index c79cf63..4c0d925 100644 (file)
@@ -81,14 +81,6 @@ struct c4iw_debugfs_data {
        int pos;
 };
 
-static int count_idrs(int id, void *p, void *data)
-{
-       int *countp = data;
-
-       *countp = *countp + 1;
-       return 0;
-}
-
 static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count,
                            loff_t *ppos)
 {
@@ -250,16 +242,11 @@ static void set_ep_sin6_addrs(struct c4iw_ep *ep,
        }
 }
 
-static int dump_qp(int id, void *p, void *data)
+static int dump_qp(struct c4iw_qp *qp, struct c4iw_debugfs_data *qpd)
 {
-       struct c4iw_qp *qp = p;
-       struct c4iw_debugfs_data *qpd = data;
        int space;
        int cc;
 
-       if (id != qp->wq.sq.qid)
-               return 0;
-
        space = qpd->bufsize - qpd->pos - 1;
        if (space == 0)
                return 1;
@@ -335,7 +322,9 @@ static int qp_release(struct inode *inode, struct file *file)
 
 static int qp_open(struct inode *inode, struct file *file)
 {
+       struct c4iw_qp *qp;
        struct c4iw_debugfs_data *qpd;
+       unsigned long index;
        int count = 1;
 
        qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
@@ -345,9 +334,12 @@ static int qp_open(struct inode *inode, struct file *file)
        qpd->devp = inode->i_private;
        qpd->pos = 0;
 
-       spin_lock_irq(&qpd->devp->lock);
-       idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
-       spin_unlock_irq(&qpd->devp->lock);
+       /*
+        * No need to lock; we drop the lock to call vmalloc so it's racy
+        * anyway.  Someone who cares should switch this over to seq_file
+        */
+       xa_for_each(&qpd->devp->qps, index, qp)
+               count++;
 
        qpd->bufsize = count * 180;
        qpd->buf = vmalloc(qpd->bufsize);
@@ -356,9 +348,10 @@ static int qp_open(struct inode *inode, struct file *file)
                return -ENOMEM;
        }
 
-       spin_lock_irq(&qpd->devp->lock);
-       idr_for_each(&qpd->devp->qpidr, dump_qp, qpd);
-       spin_unlock_irq(&qpd->devp->lock);
+       xa_lock_irq(&qpd->devp->qps);
+       xa_for_each(&qpd->devp->qps, index, qp)
+               dump_qp(qp, qpd);
+       xa_unlock_irq(&qpd->devp->qps);
 
        qpd->buf[qpd->pos++] = 0;
        file->private_data = qpd;
@@ -373,9 +366,8 @@ static const struct file_operations qp_debugfs_fops = {
        .llseek  = default_llseek,
 };
 
-static int dump_stag(int id, void *p, void *data)
+static int dump_stag(unsigned long id, struct c4iw_debugfs_data *stagd)
 {
-       struct c4iw_debugfs_data *stagd = data;
        int space;
        int cc;
        struct fw_ri_tpte tpte;
@@ -424,6 +416,8 @@ static int stag_release(struct inode *inode, struct file *file)
 static int stag_open(struct inode *inode, struct file *file)
 {
        struct c4iw_debugfs_data *stagd;
+       void *p;
+       unsigned long index;
        int ret = 0;
        int count = 1;
 
@@ -435,9 +429,8 @@ static int stag_open(struct inode *inode, struct file *file)
        stagd->devp = inode->i_private;
        stagd->pos = 0;
 
-       spin_lock_irq(&stagd->devp->lock);
-       idr_for_each(&stagd->devp->mmidr, count_idrs, &count);
-       spin_unlock_irq(&stagd->devp->lock);
+       xa_for_each(&stagd->devp->mrs, index, p)
+               count++;
 
        stagd->bufsize = count * 256;
        stagd->buf = vmalloc(stagd->bufsize);
@@ -446,9 +439,10 @@ static int stag_open(struct inode *inode, struct file *file)
                goto err1;
        }
 
-       spin_lock_irq(&stagd->devp->lock);
-       idr_for_each(&stagd->devp->mmidr, dump_stag, stagd);
-       spin_unlock_irq(&stagd->devp->lock);
+       xa_lock_irq(&stagd->devp->mrs);
+       xa_for_each(&stagd->devp->mrs, index, p)
+               dump_stag(index, stagd);
+       xa_unlock_irq(&stagd->devp->mrs);
 
        stagd->buf[stagd->pos++] = 0;
        file->private_data = stagd;
@@ -558,10 +552,8 @@ static const struct file_operations stats_debugfs_fops = {
        .write   = stats_clear,
 };
 
-static int dump_ep(int id, void *p, void *data)
+static int dump_ep(struct c4iw_ep *ep, struct c4iw_debugfs_data *epd)
 {
-       struct c4iw_ep *ep = p;
-       struct c4iw_debugfs_data *epd = data;
        int space;
        int cc;
 
@@ -617,10 +609,9 @@ static int dump_ep(int id, void *p, void *data)
        return 0;
 }
 
-static int dump_listen_ep(int id, void *p, void *data)
+static
+int dump_listen_ep(struct c4iw_listen_ep *ep, struct c4iw_debugfs_data *epd)
 {
-       struct c4iw_listen_ep *ep = p;
-       struct c4iw_debugfs_data *epd = data;
        int space;
        int cc;
 
@@ -674,6 +665,9 @@ static int ep_release(struct inode *inode, struct file *file)
 
 static int ep_open(struct inode *inode, struct file *file)
 {
+       struct c4iw_ep *ep;
+       struct c4iw_listen_ep *lep;
+       unsigned long index;
        struct c4iw_debugfs_data *epd;
        int ret = 0;
        int count = 1;
@@ -686,11 +680,12 @@ static int ep_open(struct inode *inode, struct file *file)
        epd->devp = inode->i_private;
        epd->pos = 0;
 
-       spin_lock_irq(&epd->devp->lock);
-       idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
-       idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
-       idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
-       spin_unlock_irq(&epd->devp->lock);
+       xa_for_each(&epd->devp->hwtids, index, ep)
+               count++;
+       xa_for_each(&epd->devp->atids, index, ep)
+               count++;
+       xa_for_each(&epd->devp->stids, index, lep)
+               count++;
 
        epd->bufsize = count * 240;
        epd->buf = vmalloc(epd->bufsize);
@@ -699,11 +694,18 @@ static int ep_open(struct inode *inode, struct file *file)
                goto err1;
        }
 
-       spin_lock_irq(&epd->devp->lock);
-       idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
-       idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
-       idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
-       spin_unlock_irq(&epd->devp->lock);
+       xa_lock_irq(&epd->devp->hwtids);
+       xa_for_each(&epd->devp->hwtids, index, ep)
+               dump_ep(ep, epd);
+       xa_unlock_irq(&epd->devp->hwtids);
+       xa_lock_irq(&epd->devp->atids);
+       xa_for_each(&epd->devp->atids, index, ep)
+               dump_ep(ep, epd);
+       xa_unlock_irq(&epd->devp->atids);
+       xa_lock_irq(&epd->devp->stids);
+       xa_for_each(&epd->devp->stids, index, lep)
+               dump_listen_ep(lep, epd);
+       xa_unlock_irq(&epd->devp->stids);
 
        file->private_data = epd;
        goto out;
@@ -931,16 +933,12 @@ static void c4iw_rdev_close(struct c4iw_rdev *rdev)
 void c4iw_dealloc(struct uld_ctx *ctx)
 {
        c4iw_rdev_close(&ctx->dev->rdev);
-       WARN_ON_ONCE(!idr_is_empty(&ctx->dev->cqidr));
-       idr_destroy(&ctx->dev->cqidr);
-       WARN_ON_ONCE(!idr_is_empty(&ctx->dev->qpidr));
-       idr_destroy(&ctx->dev->qpidr);
-       WARN_ON_ONCE(!idr_is_empty(&ctx->dev->mmidr));
-       idr_destroy(&ctx->dev->mmidr);
-       wait_event(ctx->dev->wait, idr_is_empty(&ctx->dev->hwtid_idr));
-       idr_destroy(&ctx->dev->hwtid_idr);
-       idr_destroy(&ctx->dev->stid_idr);
-       idr_destroy(&ctx->dev->atid_idr);
+       WARN_ON(!xa_empty(&ctx->dev->cqs));
+       WARN_ON(!xa_empty(&ctx->dev->qps));
+       WARN_ON(!xa_empty(&ctx->dev->mrs));
+       wait_event(ctx->dev->wait, xa_empty(&ctx->dev->hwtids));
+       WARN_ON(!xa_empty(&ctx->dev->stids));
+       WARN_ON(!xa_empty(&ctx->dev->atids));
        if (ctx->dev->rdev.bar2_kva)
                iounmap(ctx->dev->rdev.bar2_kva);
        if (ctx->dev->rdev.oc_mw_kva)
@@ -1044,13 +1042,12 @@ static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
                return ERR_PTR(ret);
        }
 
-       idr_init(&devp->cqidr);
-       idr_init(&devp->qpidr);
-       idr_init(&devp->mmidr);
-       idr_init(&devp->hwtid_idr);
-       idr_init(&devp->stid_idr);
-       idr_init(&devp->atid_idr);
-       spin_lock_init(&devp->lock);
+       xa_init_flags(&devp->cqs, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&devp->qps, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&devp->mrs, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&devp->hwtids, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&devp->atids, XA_FLAGS_LOCK_IRQ);
+       xa_init_flags(&devp->stids, XA_FLAGS_LOCK_IRQ);
        mutex_init(&devp->rdev.stats.lock);
        mutex_init(&devp->db_mutex);
        INIT_LIST_HEAD(&devp->db_fc_list);
@@ -1265,34 +1262,21 @@ static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
        return 0;
 }
 
-static int disable_qp_db(int id, void *p, void *data)
-{
-       struct c4iw_qp *qp = p;
-
-       t4_disable_wq_db(&qp->wq);
-       return 0;
-}
-
 static void stop_queues(struct uld_ctx *ctx)
 {
-       unsigned long flags;
+       struct c4iw_qp *qp;
+       unsigned long index, flags;
 
-       spin_lock_irqsave(&ctx->dev->lock, flags);
+       xa_lock_irqsave(&ctx->dev->qps, flags);
        ctx->dev->rdev.stats.db_state_transitions++;
        ctx->dev->db_state = STOPPED;
-       if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)
-               idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
-       else
+       if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
+               xa_for_each(&ctx->dev->qps, index, qp)
+                       t4_disable_wq_db(&qp->wq);
+       } else {
                ctx->dev->rdev.status_page->db_off = 1;
-       spin_unlock_irqrestore(&ctx->dev->lock, flags);
-}
-
-static int enable_qp_db(int id, void *p, void *data)
-{
-       struct c4iw_qp *qp = p;
-
-       t4_enable_wq_db(&qp->wq);
-       return 0;
+       }
+       xa_unlock_irqrestore(&ctx->dev->qps, flags);
 }
 
 static void resume_rc_qp(struct c4iw_qp *qp)
@@ -1322,18 +1306,21 @@ static void resume_a_chunk(struct uld_ctx *ctx)
 
 static void resume_queues(struct uld_ctx *ctx)
 {
-       spin_lock_irq(&ctx->dev->lock);
+       xa_lock_irq(&ctx->dev->qps);
        if (ctx->dev->db_state != STOPPED)
                goto out;
        ctx->dev->db_state = FLOW_CONTROL;
        while (1) {
                if (list_empty(&ctx->dev->db_fc_list)) {
+                       struct c4iw_qp *qp;
+                       unsigned long index;
+
                        WARN_ON(ctx->dev->db_state != FLOW_CONTROL);
                        ctx->dev->db_state = NORMAL;
                        ctx->dev->rdev.stats.db_state_transitions++;
                        if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
-                               idr_for_each(&ctx->dev->qpidr, enable_qp_db,
-                                            NULL);
+                               xa_for_each(&ctx->dev->qps, index, qp)
+                                       t4_enable_wq_db(&qp->wq);
                        } else {
                                ctx->dev->rdev.status_page->db_off = 0;
                        }
@@ -1345,12 +1332,12 @@ static void resume_queues(struct uld_ctx *ctx)
                                resume_a_chunk(ctx);
                        }
                        if (!list_empty(&ctx->dev->db_fc_list)) {
-                               spin_unlock_irq(&ctx->dev->lock);
+                               xa_unlock_irq(&ctx->dev->qps);
                                if (DB_FC_RESUME_DELAY) {
                                        set_current_state(TASK_UNINTERRUPTIBLE);
                                        schedule_timeout(DB_FC_RESUME_DELAY);
                                }
-                               spin_lock_irq(&ctx->dev->lock);
+                               xa_lock_irq(&ctx->dev->qps);
                                if (ctx->dev->db_state != FLOW_CONTROL)
                                        break;
                        }
@@ -1359,7 +1346,7 @@ static void resume_queues(struct uld_ctx *ctx)
 out:
        if (ctx->dev->db_state != NORMAL)
                ctx->dev->rdev.stats.db_fc_interruptions++;
-       spin_unlock_irq(&ctx->dev->lock);
+       xa_unlock_irq(&ctx->dev->qps);
 }
 
 struct qp_list {
@@ -1367,23 +1354,6 @@ struct qp_list {
        struct c4iw_qp **qps;
 };
 
-static int add_and_ref_qp(int id, void *p, void *data)
-{
-       struct qp_list *qp_listp = data;
-       struct c4iw_qp *qp = p;
-
-       c4iw_qp_add_ref(&qp->ibqp);
-       qp_listp->qps[qp_listp->idx++] = qp;
-       return 0;
-}
-
-static int count_qps(int id, void *p, void *data)
-{
-       unsigned *countp = data;
-       (*countp)++;
-       return 0;
-}
-
 static void deref_qps(struct qp_list *qp_list)
 {
        int idx;
@@ -1400,7 +1370,7 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
        for (idx = 0; idx < qp_list->idx; idx++) {
                struct c4iw_qp *qp = qp_list->qps[idx];
 
-               spin_lock_irq(&qp->rhp->lock);
+               xa_lock_irq(&qp->rhp->qps);
                spin_lock(&qp->lock);
                ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
                                          qp->wq.sq.qid,
@@ -1410,7 +1380,7 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
                        pr_err("%s: Fatal error - DB overflow recovery failed - error syncing SQ qid %u\n",
                               pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
                        spin_unlock(&qp->lock);
-                       spin_unlock_irq(&qp->rhp->lock);
+                       xa_unlock_irq(&qp->rhp->qps);
                        return;
                }
                qp->wq.sq.wq_pidx_inc = 0;
@@ -1424,12 +1394,12 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
                        pr_err("%s: Fatal error - DB overflow recovery failed - error syncing RQ qid %u\n",
                               pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
                        spin_unlock(&qp->lock);
-                       spin_unlock_irq(&qp->rhp->lock);
+                       xa_unlock_irq(&qp->rhp->qps);
                        return;
                }
                qp->wq.rq.wq_pidx_inc = 0;
                spin_unlock(&qp->lock);
-               spin_unlock_irq(&qp->rhp->lock);
+               xa_unlock_irq(&qp->rhp->qps);
 
                /* Wait for the dbfifo to drain */
                while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
@@ -1441,6 +1411,8 @@ static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
 
 static void recover_queues(struct uld_ctx *ctx)
 {
+       struct c4iw_qp *qp;
+       unsigned long index;
        int count = 0;
        struct qp_list qp_list;
        int ret;
@@ -1458,22 +1430,26 @@ static void recover_queues(struct uld_ctx *ctx)
        }
 
        /* Count active queues so we can build a list of queues to recover */
-       spin_lock_irq(&ctx->dev->lock);
+       xa_lock_irq(&ctx->dev->qps);
        WARN_ON(ctx->dev->db_state != STOPPED);
        ctx->dev->db_state = RECOVERY;
-       idr_for_each(&ctx->dev->qpidr, count_qps, &count);
+       xa_for_each(&ctx->dev->qps, index, qp)
+               count++;
 
        qp_list.qps = kcalloc(count, sizeof(*qp_list.qps), GFP_ATOMIC);
        if (!qp_list.qps) {
-               spin_unlock_irq(&ctx->dev->lock);
+               xa_unlock_irq(&ctx->dev->qps);
                return;
        }
        qp_list.idx = 0;
 
        /* add and ref each qp so it doesn't get freed */
-       idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list);
+       xa_for_each(&ctx->dev->qps, index, qp) {
+               c4iw_qp_add_ref(&qp->ibqp);
+               qp_list.qps[qp_list.idx++] = qp;
+       }
 
-       spin_unlock_irq(&ctx->dev->lock);
+       xa_unlock_irq(&ctx->dev->qps);
 
        /* now traverse the list in a safe context to recover the db state*/
        recover_lost_dbs(ctx, &qp_list);
@@ -1482,10 +1458,10 @@ static void recover_queues(struct uld_ctx *ctx)
        deref_qps(&qp_list);
        kfree(qp_list.qps);
 
-       spin_lock_irq(&ctx->dev->lock);
+       xa_lock_irq(&ctx->dev->qps);
        WARN_ON(ctx->dev->db_state != RECOVERY);
        ctx->dev->db_state = STOPPED;
-       spin_unlock_irq(&ctx->dev->lock);
+       xa_unlock_irq(&ctx->dev->qps);
 }
 
 static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
index 8741d23..4cd877b 100644 (file)
@@ -123,15 +123,15 @@ void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
        struct c4iw_qp *qhp;
        u32 cqid;
 
-       spin_lock_irq(&dev->lock);
-       qhp = get_qhp(dev, CQE_QPID(err_cqe));
+       xa_lock_irq(&dev->qps);
+       qhp = xa_load(&dev->qps, CQE_QPID(err_cqe));
        if (!qhp) {
                pr_err("BAD AE qpid 0x%x opcode %d status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
                       CQE_QPID(err_cqe),
                       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
                       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
                       CQE_WRID_LOW(err_cqe));
-               spin_unlock_irq(&dev->lock);
+               xa_unlock_irq(&dev->qps);
                goto out;
        }
 
@@ -146,13 +146,13 @@ void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
                       CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
                       CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
                       CQE_WRID_LOW(err_cqe));
-               spin_unlock_irq(&dev->lock);
+               xa_unlock_irq(&dev->qps);
                goto out;
        }
 
        c4iw_qp_add_ref(&qhp->ibqp);
        atomic_inc(&chp->refcnt);
-       spin_unlock_irq(&dev->lock);
+       xa_unlock_irq(&dev->qps);
 
        /* Bad incoming write */
        if (RQ_TYPE(err_cqe) &&
@@ -225,11 +225,11 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
        struct c4iw_cq *chp;
        unsigned long flag;
 
-       spin_lock_irqsave(&dev->lock, flag);
-       chp = get_chp(dev, qid);
+       xa_lock_irqsave(&dev->cqs, flag);
+       chp = xa_load(&dev->cqs, qid);
        if (chp) {
                atomic_inc(&chp->refcnt);
-               spin_unlock_irqrestore(&dev->lock, flag);
+               xa_unlock_irqrestore(&dev->cqs, flag);
                t4_clear_cq_armed(&chp->cq);
                spin_lock_irqsave(&chp->comp_handler_lock, flag);
                (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
@@ -238,7 +238,7 @@ int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
                        wake_up(&chp->wait);
        } else {
                pr_debug("unknown cqid 0x%x\n", qid);
-               spin_unlock_irqrestore(&dev->lock, flag);
+               xa_unlock_irqrestore(&dev->cqs, flag);
        }
        return 0;
 }
index 5a5da41..916ef98 100644 (file)
@@ -34,7 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/list.h>
 #include <linux/spinlock.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/completion.h>
 #include <linux/netdevice.h>
 #include <linux/sched/mm.h>
@@ -315,16 +315,15 @@ struct c4iw_dev {
        struct ib_device ibdev;
        struct c4iw_rdev rdev;
        u32 device_cap_flags;
-       struct idr cqidr;
-       struct idr qpidr;
-       struct idr mmidr;
-       spinlock_t lock;
+       struct xarray cqs;
+       struct xarray qps;
+       struct xarray mrs;
        struct mutex db_mutex;
        struct dentry *debugfs_root;
        enum db_state db_state;
-       struct idr hwtid_idr;
-       struct idr atid_idr;
-       struct idr stid_idr;
+       struct xarray hwtids;
+       struct xarray atids;
+       struct xarray stids;
        struct list_head db_fc_list;
        u32 avail_ird;
        wait_queue_head_t wait;
@@ -349,70 +348,12 @@ static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev)
 
 static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
 {
-       return idr_find(&rhp->cqidr, cqid);
+       return xa_load(&rhp->cqs, cqid);
 }
 
 static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid)
 {
-       return idr_find(&rhp->qpidr, qpid);
-}
-
-static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
-{
-       return idr_find(&rhp->mmidr, mmid);
-}
-
-static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
-                                void *handle, u32 id, int lock)
-{
-       int ret;
-
-       if (lock) {
-               idr_preload(GFP_KERNEL);
-               spin_lock_irq(&rhp->lock);
-       }
-
-       ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC);
-
-       if (lock) {
-               spin_unlock_irq(&rhp->lock);
-               idr_preload_end();
-       }
-
-       return ret < 0 ? ret : 0;
-}
-
-static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
-                               void *handle, u32 id)
-{
-       return _insert_handle(rhp, idr, handle, id, 1);
-}
-
-static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
-                                      void *handle, u32 id)
-{
-       return _insert_handle(rhp, idr, handle, id, 0);
-}
-
-static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr,
-                                  u32 id, int lock)
-{
-       if (lock)
-               spin_lock_irq(&rhp->lock);
-       idr_remove(idr, id);
-       if (lock)
-               spin_unlock_irq(&rhp->lock);
-}
-
-static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
-{
-       _remove_handle(rhp, idr, id, 1);
-}
-
-static inline void remove_handle_nolock(struct c4iw_dev *rhp,
-                                        struct idr *idr, u32 id)
-{
-       _remove_handle(rhp, idr, id, 0);
+       return xa_load(&rhp->qps, qpid);
 }
 
 extern uint c4iw_max_read_depth;
@@ -1038,9 +979,8 @@ int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
 int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
 void c4iw_qp_add_ref(struct ib_qp *qp);
 void c4iw_qp_rem_ref(struct ib_qp *qp);
-struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
-                           enum ib_mr_type mr_type,
-                           u32 max_num_sg);
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                           u32 max_num_sg, struct ib_udata *udata);
 int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                   unsigned int *sg_offset);
 int c4iw_dealloc_mw(struct ib_mw *mw);
@@ -1051,21 +991,19 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
                                           u64 length, u64 virt, int acc,
                                           struct ib_udata *udata);
 struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
-int c4iw_dereg_mr(struct ib_mr *ib_mr);
-int c4iw_destroy_cq(struct ib_cq *ib_cq);
+int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
+int c4iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 struct ib_cq *c4iw_create_cq(struct ib_device *ibdev,
                             const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *ib_context,
                             struct ib_udata *udata);
 int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int c4iw_modify_srq(struct ib_srq *ib_srq, struct ib_srq_attr *attr,
                    enum ib_srq_attr_mask srq_attr_mask,
                    struct ib_udata *udata);
-int c4iw_destroy_srq(struct ib_srq *ib_srq);
-struct ib_srq *c4iw_create_srq(struct ib_pd *pd,
-                              struct ib_srq_init_attr *attrs,
-                              struct ib_udata *udata);
-int c4iw_destroy_qp(struct ib_qp *ib_qp);
+void c4iw_destroy_srq(struct ib_srq *ib_srq, struct ib_udata *udata);
+int c4iw_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *attrs,
+                   struct ib_udata *udata);
+int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata);
 struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
                             struct ib_qp_init_attr *attrs,
                             struct ib_udata *udata);
index 5baa31a..811c0c8 100644 (file)
@@ -395,7 +395,7 @@ static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
        mhp->ibmr.iova = mhp->attr.va_fbo;
        mhp->ibmr.page_size = 1U << (mhp->attr.page_size + 12);
        pr_debug("mmid 0x%x mhp %p\n", mmid, mhp);
-       return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+       return xa_insert_irq(&mhp->rhp->mrs, mmid, mhp, GFP_KERNEL);
 }
 
 static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
@@ -542,7 +542,7 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 
        shift = PAGE_SHIFT;
 
-       n = mhp->umem->nmap;
+       n = ib_umem_num_pages(mhp->umem);
        err = alloc_pbl(mhp, n);
        if (err)
                goto err_umem_release;
@@ -645,7 +645,7 @@ struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
        mhp->attr.stag = stag;
        mmid = (stag) >> 8;
        mhp->ibmw.rkey = stag;
-       if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+       if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
                ret = -ENOMEM;
                goto dealloc_win;
        }
@@ -673,7 +673,7 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
        mhp = to_c4iw_mw(mw);
        rhp = mhp->rhp;
        mmid = (mw->rkey) >> 8;
-       remove_handle(rhp, &rhp->mmidr, mmid);
+       xa_erase_irq(&rhp->mrs, mmid);
        deallocate_window(&rhp->rdev, mhp->attr.stag, mhp->dereg_skb,
                          mhp->wr_waitp);
        kfree_skb(mhp->dereg_skb);
@@ -683,9 +683,8 @@ int c4iw_dealloc_mw(struct ib_mw *mw)
        return 0;
 }
 
-struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
-                           enum ib_mr_type mr_type,
-                           u32 max_num_sg)
+struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                           u32 max_num_sg, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_pd *php;
@@ -740,7 +739,7 @@ struct ib_mr *c4iw_alloc_mr(struct ib_pd *pd,
        mhp->attr.state = 0;
        mmid = (stag) >> 8;
        mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
-       if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+       if (xa_insert_irq(&rhp->mrs, mmid, mhp, GFP_KERNEL)) {
                ret = -ENOMEM;
                goto err_dereg;
        }
@@ -786,7 +785,7 @@ int c4iw_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
        return ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset, c4iw_set_page);
 }
 
-int c4iw_dereg_mr(struct ib_mr *ib_mr)
+int c4iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_mr *mhp;
@@ -797,7 +796,7 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr)
        mhp = to_c4iw_mr(ib_mr);
        rhp = mhp->rhp;
        mmid = mhp->attr.stag >> 8;
-       remove_handle(rhp, &rhp->mmidr, mmid);
+       xa_erase_irq(&rhp->mrs, mmid);
        if (mhp->mpl)
                dma_free_coherent(&mhp->rhp->rdev.lldi.pdev->dev,
                                  mhp->max_mpl_len, mhp->mpl, mhp->mpl_addr);
@@ -821,9 +820,9 @@ void c4iw_invalidate_mr(struct c4iw_dev *rhp, u32 rkey)
        struct c4iw_mr *mhp;
        unsigned long flags;
 
-       spin_lock_irqsave(&rhp->lock, flags);
-       mhp = get_mhp(rhp, rkey >> 8);
+       xa_lock_irqsave(&rhp->mrs, flags);
+       mhp = xa_load(&rhp->mrs, rkey >> 8);
        if (mhp)
                mhp->attr.state = 0;
-       spin_unlock_irqrestore(&rhp->lock, flags);
+       xa_unlock_irqrestore(&rhp->mrs, flags);
 }
index 507c545..74b7956 100644 (file)
@@ -190,7 +190,7 @@ static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
        return ret;
 }
 
-static void c4iw_deallocate_pd(struct ib_pd *pd)
+static void c4iw_deallocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_pd *php;
@@ -204,8 +204,7 @@ static void c4iw_deallocate_pd(struct ib_pd *pd)
        mutex_unlock(&rhp->rdev.stats.lock);
 }
 
-static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                           struct ib_udata *udata)
+static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct c4iw_pd *php = to_c4iw_pd(pd);
        struct ib_device *ibdev = pd->device;
@@ -220,11 +219,11 @@ static int c4iw_allocate_pd(struct ib_pd *pd, struct ib_ucontext *context,
 
        php->pdid = pdid;
        php->rhp = rhp;
-       if (context) {
+       if (udata) {
                struct c4iw_alloc_pd_resp uresp = {.pdid = php->pdid};
 
                if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
-                       c4iw_deallocate_pd(&php->ibpd);
+                       c4iw_deallocate_pd(&php->ibpd, udata);
                        return -EFAULT;
                }
        }
@@ -483,24 +482,6 @@ static void get_dev_fw_str(struct ib_device *dev, char *str)
                 FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
 }
 
-static struct net_device *get_netdev(struct ib_device *dev, u8 port)
-{
-       struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev, ibdev);
-       struct c4iw_rdev *rdev = &c4iw_dev->rdev;
-       struct net_device *ndev;
-
-       if (!port || port > rdev->lldi.nports)
-               return NULL;
-
-       rcu_read_lock();
-       ndev = rdev->lldi.ports[port - 1];
-       if (ndev)
-               dev_hold(ndev);
-       rcu_read_unlock();
-
-       return ndev;
-}
-
 static int fill_res_entry(struct sk_buff *msg, struct rdma_restrack_entry *res)
 {
        return (res->type < ARRAY_SIZE(c4iw_restrack_funcs) &&
@@ -528,8 +509,15 @@ static const struct ib_device_ops c4iw_dev_ops = {
        .get_dev_fw_str = get_dev_fw_str,
        .get_dma_mr = c4iw_get_dma_mr,
        .get_hw_stats = c4iw_get_mib,
-       .get_netdev = get_netdev,
        .get_port_immutable = c4iw_port_immutable,
+       .iw_accept = c4iw_accept_cr,
+       .iw_add_ref = c4iw_qp_add_ref,
+       .iw_connect = c4iw_connect,
+       .iw_create_listen = c4iw_create_listen,
+       .iw_destroy_listen = c4iw_destroy_listen,
+       .iw_get_qp = c4iw_get_qp,
+       .iw_reject = c4iw_reject_cr,
+       .iw_rem_ref = c4iw_qp_rem_ref,
        .map_mr_sg = c4iw_map_mr_sg,
        .mmap = c4iw_mmap,
        .modify_qp = c4iw_ib_modify_qp,
@@ -546,9 +534,24 @@ static const struct ib_device_ops c4iw_dev_ops = {
        .reg_user_mr = c4iw_reg_user_mr,
        .req_notify_cq = c4iw_arm_cq,
        INIT_RDMA_OBJ_SIZE(ib_pd, c4iw_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, c4iw_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, c4iw_ucontext, ibucontext),
 };
 
+static int set_netdevs(struct ib_device *ib_dev, struct c4iw_rdev *rdev)
+{
+       int ret;
+       int i;
+
+       for (i = 0; i < rdev->lldi.nports; i++) {
+               ret = ib_device_set_netdev(ib_dev, rdev->lldi.ports[i],
+                                          i + 1);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
 void c4iw_register_device(struct work_struct *work)
 {
        int ret;
@@ -593,33 +596,20 @@ void c4iw_register_device(struct work_struct *work)
        dev->ibdev.dev.parent = &dev->rdev.lldi.pdev->dev;
        dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
 
-       dev->ibdev.iwcm = kzalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
-       if (!dev->ibdev.iwcm) {
-               ret = -ENOMEM;
-               goto err_dealloc_ctx;
-       }
-
-       dev->ibdev.iwcm->connect = c4iw_connect;
-       dev->ibdev.iwcm->accept = c4iw_accept_cr;
-       dev->ibdev.iwcm->reject = c4iw_reject_cr;
-       dev->ibdev.iwcm->create_listen = c4iw_create_listen;
-       dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen;
-       dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
-       dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
-       dev->ibdev.iwcm->get_qp = c4iw_get_qp;
-       memcpy(dev->ibdev.iwcm->ifname, dev->rdev.lldi.ports[0]->name,
-              sizeof(dev->ibdev.iwcm->ifname));
+       memcpy(dev->ibdev.iw_ifname, dev->rdev.lldi.ports[0]->name,
+              sizeof(dev->ibdev.iw_ifname));
 
        rdma_set_device_sysfs_group(&dev->ibdev, &c4iw_attr_group);
        dev->ibdev.driver_id = RDMA_DRIVER_CXGB4;
        ib_set_device_ops(&dev->ibdev, &c4iw_dev_ops);
+       ret = set_netdevs(&dev->ibdev, &dev->rdev);
+       if (ret)
+               goto err_dealloc_ctx;
        ret = ib_register_device(&dev->ibdev, "cxgb4_%d");
        if (ret)
-               goto err_kfree_iwcm;
+               goto err_dealloc_ctx;
        return;
 
-err_kfree_iwcm:
-       kfree(dev->ibdev.iwcm);
 err_dealloc_ctx:
        pr_err("%s - Failed registering iwarp device: %d\n",
               pci_name(ctx->lldi.pdev), ret);
@@ -631,6 +621,5 @@ void c4iw_unregister_device(struct c4iw_dev *dev)
 {
        pr_debug("c4iw_dev %p\n", dev);
        ib_unregister_device(&dev->ibdev);
-       kfree(dev->ibdev.iwcm);
        return;
 }
index d3a8283..e92b954 100644 (file)
@@ -57,18 +57,18 @@ MODULE_PARM_DESC(db_coalescing_threshold,
 
 static int max_fr_immd = T4_MAX_FR_IMMD;
 module_param(max_fr_immd, int, 0644);
-MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immedate");
+MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immediate");
 
 static int alloc_ird(struct c4iw_dev *dev, u32 ird)
 {
        int ret = 0;
 
-       spin_lock_irq(&dev->lock);
+       xa_lock_irq(&dev->qps);
        if (ird <= dev->avail_ird)
                dev->avail_ird -= ird;
        else
                ret = -ENOMEM;
-       spin_unlock_irq(&dev->lock);
+       xa_unlock_irq(&dev->qps);
 
        if (ret)
                dev_warn(&dev->rdev.lldi.pdev->dev,
@@ -79,9 +79,9 @@ static int alloc_ird(struct c4iw_dev *dev, u32 ird)
 
 static void free_ird(struct c4iw_dev *dev, int ird)
 {
-       spin_lock_irq(&dev->lock);
+       xa_lock_irq(&dev->qps);
        dev->avail_ird += ird;
-       spin_unlock_irq(&dev->lock);
+       xa_unlock_irq(&dev->qps);
 }
 
 static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
@@ -939,7 +939,7 @@ static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&qhp->rhp->lock, flags);
+       xa_lock_irqsave(&qhp->rhp->qps, flags);
        spin_lock(&qhp->lock);
        if (qhp->rhp->db_state == NORMAL)
                t4_ring_sq_db(&qhp->wq, inc, NULL);
@@ -948,7 +948,7 @@ static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc)
                qhp->wq.sq.wq_pidx_inc += inc;
        }
        spin_unlock(&qhp->lock);
-       spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+       xa_unlock_irqrestore(&qhp->rhp->qps, flags);
        return 0;
 }
 
@@ -956,7 +956,7 @@ static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&qhp->rhp->lock, flags);
+       xa_lock_irqsave(&qhp->rhp->qps, flags);
        spin_lock(&qhp->lock);
        if (qhp->rhp->db_state == NORMAL)
                t4_ring_rq_db(&qhp->wq, inc, NULL);
@@ -965,7 +965,7 @@ static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
                qhp->wq.rq.wq_pidx_inc += inc;
        }
        spin_unlock(&qhp->lock);
-       spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+       xa_unlock_irqrestore(&qhp->rhp->qps, flags);
        return 0;
 }
 
@@ -1976,10 +1976,10 @@ int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
                        qhp->attr.layer_etype = attrs->layer_etype;
                        qhp->attr.ecode = attrs->ecode;
                        ep = qhp->ep;
+                       c4iw_get_ep(&ep->com);
+                       disconnect = 1;
                        if (!internal) {
-                               c4iw_get_ep(&qhp->ep->com);
                                terminate = 1;
-                               disconnect = 1;
                        } else {
                                terminate = qhp->attr.send_term;
                                ret = rdma_fini(rhp, qhp, ep);
@@ -2095,7 +2095,7 @@ out:
        return ret;
 }
 
-int c4iw_destroy_qp(struct ib_qp *ib_qp)
+int c4iw_destroy_qp(struct ib_qp *ib_qp, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_qp *qhp;
@@ -2111,12 +2111,11 @@ int c4iw_destroy_qp(struct ib_qp *ib_qp)
                c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
        wait_event(qhp->wait, !qhp->ep);
 
-       remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
-
-       spin_lock_irq(&rhp->lock);
+       xa_lock_irq(&rhp->qps);
+       __xa_erase(&rhp->qps, qhp->wq.sq.qid);
        if (!list_empty(&qhp->db_fc_entry))
                list_del_init(&qhp->db_fc_entry);
-       spin_unlock_irq(&rhp->lock);
+       xa_unlock_irq(&rhp->qps);
        free_ird(rhp, qhp->attr.max_ird);
 
        c4iw_qp_rem_ref(ib_qp);
@@ -2234,7 +2233,7 @@ struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
        kref_init(&qhp->kref);
        INIT_WORK(&qhp->free_work, free_qp_work);
 
-       ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+       ret = xa_insert_irq(&rhp->qps, qhp->wq.sq.qid, qhp, GFP_KERNEL);
        if (ret)
                goto err_destroy_qp;
 
@@ -2370,7 +2369,7 @@ err_free_rq_key:
 err_free_sq_key:
        kfree(sq_key_mm);
 err_remove_handle:
-       remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+       xa_erase_irq(&rhp->qps, qhp->wq.sq.qid);
 err_destroy_qp:
        destroy_qp(&rhp->rdev, &qhp->wq,
                   ucontext ? &ucontext->uctx : &rhp->rdev.uctx, !attrs->srq);
@@ -2684,11 +2683,12 @@ void c4iw_copy_wr_to_srq(struct t4_srq *srq, union t4_recv_wr *wqe, u8 len16)
        }
 }
 
-struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
+int c4iw_create_srq(struct ib_srq *ib_srq, struct ib_srq_init_attr *attrs,
                               struct ib_udata *udata)
 {
+       struct ib_pd *pd = ib_srq->pd;
        struct c4iw_dev *rhp;
-       struct c4iw_srq *srq;
+       struct c4iw_srq *srq = to_c4iw_srq(ib_srq);
        struct c4iw_pd *php;
        struct c4iw_create_srq_resp uresp;
        struct c4iw_ucontext *ucontext;
@@ -2703,11 +2703,11 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
        rhp = php->rhp;
 
        if (!rhp->rdev.lldi.vr->srq.size)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        if (attrs->attr.max_wr > rhp->rdev.hw_queue.t4_max_rq_size)
-               return ERR_PTR(-E2BIG);
+               return -E2BIG;
        if (attrs->attr.max_sge > T4_MAX_RECV_SGE)
-               return ERR_PTR(-E2BIG);
+               return -E2BIG;
 
        /*
         * SRQ RQT and RQ must be a power of 2 and at least 16 deep.
@@ -2718,15 +2718,9 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
        ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
                                             ibucontext);
 
-       srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
-
        srq->wr_waitp = c4iw_alloc_wr_wait(GFP_KERNEL);
-       if (!srq->wr_waitp) {
-               ret = -ENOMEM;
-               goto err_free_srq;
-       }
+       if (!srq->wr_waitp)
+               return -ENOMEM;
 
        srq->idx = c4iw_alloc_srq_idx(&rhp->rdev);
        if (srq->idx < 0) {
@@ -2760,7 +2754,7 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
        if (CHELSIO_CHIP_VERSION(rhp->rdev.lldi.adapter_type) > CHELSIO_T6)
                srq->flags = T4_SRQ_LIMIT_SUPPORT;
 
-       ret = insert_handle(rhp, &rhp->qpidr, srq, srq->wq.qid);
+       ret = xa_insert_irq(&rhp->qps, srq->wq.qid, srq, GFP_KERNEL);
        if (ret)
                goto err_free_queue;
 
@@ -2806,13 +2800,14 @@ struct ib_srq *c4iw_create_srq(struct ib_pd *pd, struct ib_srq_init_attr *attrs,
                        (unsigned long)srq->wq.memsize, attrs->attr.max_wr);
 
        spin_lock_init(&srq->lock);
-       return &srq->ibsrq;
+       return 0;
+
 err_free_srq_db_key_mm:
        kfree(srq_db_key_mm);
 err_free_srq_key_mm:
        kfree(srq_key_mm);
 err_remove_handle:
-       remove_handle(rhp, &rhp->qpidr, srq->wq.qid);
+       xa_erase_irq(&rhp->qps, srq->wq.qid);
 err_free_queue:
        free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
                       srq->wr_waitp);
@@ -2822,12 +2817,10 @@ err_free_srq_idx:
        c4iw_free_srq_idx(&rhp->rdev, srq->idx);
 err_free_wr_wait:
        c4iw_put_wr_wait(srq->wr_waitp);
-err_free_srq:
-       kfree(srq);
-       return ERR_PTR(ret);
+       return ret;
 }
 
-int c4iw_destroy_srq(struct ib_srq *ibsrq)
+void c4iw_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
        struct c4iw_dev *rhp;
        struct c4iw_srq *srq;
@@ -2838,13 +2831,11 @@ int c4iw_destroy_srq(struct ib_srq *ibsrq)
 
        pr_debug("%s id %d\n", __func__, srq->wq.qid);
 
-       remove_handle(rhp, &rhp->qpidr, srq->wq.qid);
-       ucontext = ibsrq->uobject ?
-               to_c4iw_ucontext(ibsrq->uobject->context) : NULL;
+       xa_erase_irq(&rhp->qps, srq->wq.qid);
+       ucontext = rdma_udata_to_drv_context(udata, struct c4iw_ucontext,
+                                            ibucontext);
        free_srq_queue(srq, ucontext ? &ucontext->uctx : &rhp->rdev.uctx,
                       srq->wr_waitp);
        c4iw_free_srq_idx(&rhp->rdev, srq->idx);
        c4iw_put_wr_wait(srq->wr_waitp);
-       kfree(srq);
-       return 0;
 }
diff --git a/drivers/infiniband/hw/efa/Kconfig b/drivers/infiniband/hw/efa/Kconfig
new file mode 100644 (file)
index 0000000..457e18b
--- /dev/null
@@ -0,0 +1,15 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# Amazon fabric device configuration
+#
+
+config INFINIBAND_EFA
+       tristate "Amazon Elastic Fabric Adapter (EFA) support"
+       depends on PCI_MSI && 64BIT && !CPU_BIG_ENDIAN
+       depends on INFINIBAND_USER_ACCESS
+       help
+         This driver supports Amazon Elastic Fabric Adapter (EFA).
+
+         To compile this driver as a module, choose M here.
+         The module will be called efa.
diff --git a/drivers/infiniband/hw/efa/Makefile b/drivers/infiniband/hw/efa/Makefile
new file mode 100644 (file)
index 0000000..6e83083
--- /dev/null
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+# Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+#
+# Makefile for Amazon Elastic Fabric Adapter (EFA) device driver.
+#
+
+obj-$(CONFIG_INFINIBAND_EFA) += efa.o
+
+efa-y := efa_com_cmd.o efa_com.o efa_main.o efa_verbs.o
diff --git a/drivers/infiniband/hw/efa/efa.h b/drivers/infiniband/hw/efa/efa.h
new file mode 100644 (file)
index 0000000..9e3cc32
--- /dev/null
@@ -0,0 +1,163 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_H_
+#define _EFA_H_
+
+#include <linux/bitops.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/sched.h>
+
+#include <rdma/efa-abi.h>
+#include <rdma/ib_verbs.h>
+
+#include "efa_com_cmd.h"
+
+#define DRV_MODULE_NAME         "efa"
+#define DEVICE_NAME             "Elastic Fabric Adapter (EFA)"
+
+#define EFA_IRQNAME_SIZE        40
+
+/* 1 for AENQ + ADMIN */
+#define EFA_NUM_MSIX_VEC                  1
+#define EFA_MGMNT_MSIX_VEC_IDX            0
+
+struct efa_irq {
+       irq_handler_t handler;
+       void *data;
+       int cpu;
+       u32 vector;
+       cpumask_t affinity_hint_mask;
+       char name[EFA_IRQNAME_SIZE];
+};
+
+struct efa_sw_stats {
+       atomic64_t alloc_pd_err;
+       atomic64_t create_qp_err;
+       atomic64_t create_cq_err;
+       atomic64_t reg_mr_err;
+       atomic64_t alloc_ucontext_err;
+       atomic64_t create_ah_err;
+};
+
+/* Don't use anything other than atomic64 */
+struct efa_stats {
+       struct efa_sw_stats sw_stats;
+       atomic64_t keep_alive_rcvd;
+};
+
+struct efa_dev {
+       struct ib_device ibdev;
+       struct efa_com_dev edev;
+       struct pci_dev *pdev;
+       struct efa_com_get_device_attr_result dev_attr;
+
+       u64 reg_bar_addr;
+       u64 reg_bar_len;
+       u64 mem_bar_addr;
+       u64 mem_bar_len;
+       u64 db_bar_addr;
+       u64 db_bar_len;
+       u8 addr[EFA_GID_SIZE];
+       u32 mtu;
+
+       int admin_msix_vector_idx;
+       struct efa_irq admin_irq;
+
+       struct efa_stats stats;
+};
+
+struct efa_ucontext {
+       struct ib_ucontext ibucontext;
+       struct xarray mmap_xa;
+       u32 mmap_xa_page;
+       u16 uarn;
+};
+
+struct efa_pd {
+       struct ib_pd ibpd;
+       u16 pdn;
+};
+
+struct efa_mr {
+       struct ib_mr ibmr;
+       struct ib_umem *umem;
+};
+
+struct efa_cq {
+       struct ib_cq ibcq;
+       struct efa_ucontext *ucontext;
+       dma_addr_t dma_addr;
+       void *cpu_addr;
+       size_t size;
+       u16 cq_idx;
+};
+
+struct efa_qp {
+       struct ib_qp ibqp;
+       dma_addr_t rq_dma_addr;
+       void *rq_cpu_addr;
+       size_t rq_size;
+       enum ib_qp_state state;
+       u32 qp_handle;
+       u32 max_send_wr;
+       u32 max_recv_wr;
+       u32 max_send_sge;
+       u32 max_recv_sge;
+       u32 max_inline_data;
+};
+
+struct efa_ah {
+       struct ib_ah ibah;
+       u16 ah;
+       /* dest_addr */
+       u8 id[EFA_GID_SIZE];
+};
+
+int efa_query_device(struct ib_device *ibdev,
+                    struct ib_device_attr *props,
+                    struct ib_udata *udata);
+int efa_query_port(struct ib_device *ibdev, u8 port,
+                  struct ib_port_attr *props);
+int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+                int qp_attr_mask,
+                struct ib_qp_init_attr *qp_init_attr);
+int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+                 union ib_gid *gid);
+int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                  u16 *pkey);
+int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
+struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
+                           struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata);
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
+struct ib_cq *efa_create_cq(struct ib_device *ibdev,
+                           const struct ib_cq_init_attr *attr,
+                           struct ib_udata *udata);
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+                        u64 virt_addr, int access_flags,
+                        struct ib_udata *udata);
+int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+                          struct ib_port_immutable *immutable);
+int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata);
+void efa_dealloc_ucontext(struct ib_ucontext *ibucontext);
+int efa_mmap(struct ib_ucontext *ibucontext,
+            struct vm_area_struct *vma);
+int efa_create_ah(struct ib_ah *ibah,
+                 struct rdma_ah_attr *ah_attr,
+                 u32 flags,
+                 struct ib_udata *udata);
+void efa_destroy_ah(struct ib_ah *ibah, u32 flags);
+int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_udata *udata);
+enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
+                                        u8 port_num);
+
+#endif /* _EFA_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h b/drivers/infiniband/hw/efa/efa_admin_cmds_defs.h
new file mode 100644 (file)
index 0000000..2be0469
--- /dev/null
@@ -0,0 +1,794 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_ADMIN_CMDS_H_
+#define _EFA_ADMIN_CMDS_H_
+
+#define EFA_ADMIN_API_VERSION_MAJOR          0
+#define EFA_ADMIN_API_VERSION_MINOR          1
+
+/* EFA admin queue opcodes */
+enum efa_admin_aq_opcode {
+       EFA_ADMIN_CREATE_QP                         = 1,
+       EFA_ADMIN_MODIFY_QP                         = 2,
+       EFA_ADMIN_QUERY_QP                          = 3,
+       EFA_ADMIN_DESTROY_QP                        = 4,
+       EFA_ADMIN_CREATE_AH                         = 5,
+       EFA_ADMIN_DESTROY_AH                        = 6,
+       EFA_ADMIN_REG_MR                            = 7,
+       EFA_ADMIN_DEREG_MR                          = 8,
+       EFA_ADMIN_CREATE_CQ                         = 9,
+       EFA_ADMIN_DESTROY_CQ                        = 10,
+       EFA_ADMIN_GET_FEATURE                       = 11,
+       EFA_ADMIN_SET_FEATURE                       = 12,
+       EFA_ADMIN_GET_STATS                         = 13,
+       EFA_ADMIN_ALLOC_PD                          = 14,
+       EFA_ADMIN_DEALLOC_PD                        = 15,
+       EFA_ADMIN_ALLOC_UAR                         = 16,
+       EFA_ADMIN_DEALLOC_UAR                       = 17,
+       EFA_ADMIN_MAX_OPCODE                        = 17,
+};
+
+enum efa_admin_aq_feature_id {
+       EFA_ADMIN_DEVICE_ATTR                       = 1,
+       EFA_ADMIN_AENQ_CONFIG                       = 2,
+       EFA_ADMIN_NETWORK_ATTR                      = 3,
+       EFA_ADMIN_QUEUE_ATTR                        = 4,
+       EFA_ADMIN_HW_HINTS                          = 5,
+       EFA_ADMIN_FEATURES_OPCODE_NUM               = 8,
+};
+
+/* QP transport type */
+enum efa_admin_qp_type {
+       /* Unreliable Datagram */
+       EFA_ADMIN_QP_TYPE_UD                        = 1,
+       /* Scalable Reliable Datagram */
+       EFA_ADMIN_QP_TYPE_SRD                       = 2,
+};
+
+/* QP state */
+enum efa_admin_qp_state {
+       EFA_ADMIN_QP_STATE_RESET                    = 0,
+       EFA_ADMIN_QP_STATE_INIT                     = 1,
+       EFA_ADMIN_QP_STATE_RTR                      = 2,
+       EFA_ADMIN_QP_STATE_RTS                      = 3,
+       EFA_ADMIN_QP_STATE_SQD                      = 4,
+       EFA_ADMIN_QP_STATE_SQE                      = 5,
+       EFA_ADMIN_QP_STATE_ERR                      = 6,
+};
+
+enum efa_admin_get_stats_type {
+       EFA_ADMIN_GET_STATS_TYPE_BASIC              = 0,
+};
+
+enum efa_admin_get_stats_scope {
+       EFA_ADMIN_GET_STATS_SCOPE_ALL               = 0,
+       EFA_ADMIN_GET_STATS_SCOPE_QUEUE             = 1,
+};
+
+enum efa_admin_modify_qp_mask_bits {
+       EFA_ADMIN_QP_STATE_BIT                      = 0,
+       EFA_ADMIN_CUR_QP_STATE_BIT                  = 1,
+       EFA_ADMIN_QKEY_BIT                          = 2,
+       EFA_ADMIN_SQ_PSN_BIT                        = 3,
+       EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT       = 4,
+};
+
+/*
+ * QP allocation sizes, converted by fabric QueuePair (QP) create command
+ * from QP capabilities.
+ */
+struct efa_admin_qp_alloc_size {
+       /* Send descriptor ring size in bytes */
+       u32 send_queue_ring_size;
+
+       /* Max number of WQEs that can be outstanding on send queue. */
+       u32 send_queue_depth;
+
+       /*
+        * Recv descriptor ring size in bytes, sufficient for user-provided
+        * number of WQEs
+        */
+       u32 recv_queue_ring_size;
+
+       /* Max number of WQEs that can be outstanding on recv queue */
+       u32 recv_queue_depth;
+};
+
+struct efa_admin_create_qp_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* Protection Domain associated with this QP */
+       u16 pd;
+
+       /* QP type */
+       u8 qp_type;
+
+       /*
+        * 0 : sq_virt - If set, SQ ring base address is
+        *    virtual (IOVA returned by MR registration)
+        * 1 : rq_virt - If set, RQ ring base address is
+        *    virtual (IOVA returned by MR registration)
+        * 7:2 : reserved - MBZ
+        */
+       u8 flags;
+
+       /*
+        * Send queue (SQ) ring base physical address. This field is not
+        * used if this is a Low Latency Queue(LLQ).
+        */
+       u64 sq_base_addr;
+
+       /* Receive queue (RQ) ring base address. */
+       u64 rq_base_addr;
+
+       /* Index of CQ to be associated with Send Queue completions */
+       u32 send_cq_idx;
+
+       /* Index of CQ to be associated with Recv Queue completions */
+       u32 recv_cq_idx;
+
+       /*
+        * Memory registration key for the SQ ring, used only when not in
+        * LLQ mode and base address is virtual
+        */
+       u32 sq_l_key;
+
+       /*
+        * Memory registration key for the RQ ring, used only when base
+        * address is virtual
+        */
+       u32 rq_l_key;
+
+       /* Requested QP allocation sizes */
+       struct efa_admin_qp_alloc_size qp_alloc_size;
+
+       /* UAR number */
+       u16 uar;
+
+       /* MBZ */
+       u16 reserved;
+
+       /* MBZ */
+       u32 reserved2;
+};
+
+struct efa_admin_create_qp_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /* Opaque handle to be used for subsequent operations on the QP */
+       u32 qp_handle;
+
+       /* QP number in the given EFA virtual device */
+       u16 qp_num;
+
+       /* MBZ */
+       u16 reserved;
+
+       /* Index of sub-CQ for Send Queue completions */
+       u16 send_sub_cq_idx;
+
+       /* Index of sub-CQ for Receive Queue completions */
+       u16 recv_sub_cq_idx;
+
+       /* SQ doorbell address, as offset to PCIe DB BAR */
+       u32 sq_db_offset;
+
+       /* RQ doorbell address, as offset to PCIe DB BAR */
+       u32 rq_db_offset;
+
+       /*
+        * Low latency send queue ring base address as an offset to PCIe
+        * MMIO LLQ_MEM BAR
+        */
+       u32 llq_descriptors_offset;
+};
+
+struct efa_admin_modify_qp_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /*
+        * Mask indicating which fields should be updated see enum
+        * efa_admin_modify_qp_mask_bits
+        */
+       u32 modify_mask;
+
+       /* QP handle returned by create_qp command */
+       u32 qp_handle;
+
+       /* QP state */
+       u32 qp_state;
+
+       /* Override current QP state (before applying the transition) */
+       u32 cur_qp_state;
+
+       /* QKey */
+       u32 qkey;
+
+       /* SQ PSN */
+       u32 sq_psn;
+
+       /* Enable async notification when SQ is drained */
+       u8 sq_drained_async_notify;
+
+       /* MBZ */
+       u8 reserved1;
+
+       /* MBZ */
+       u16 reserved2;
+};
+
+struct efa_admin_modify_qp_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_query_qp_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* QP handle returned by create_qp command */
+       u32 qp_handle;
+};
+
+struct efa_admin_query_qp_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /* QP state */
+       u32 qp_state;
+
+       /* QKey */
+       u32 qkey;
+
+       /* SQ PSN */
+       u32 sq_psn;
+
+       /* Indicates that draining is in progress */
+       u8 sq_draining;
+
+       /* MBZ */
+       u8 reserved1;
+
+       /* MBZ */
+       u16 reserved2;
+};
+
+struct efa_admin_destroy_qp_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* QP handle returned by create_qp command */
+       u32 qp_handle;
+};
+
+struct efa_admin_destroy_qp_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * Create Address Handle command parameters. Must not be called more than
+ * once for the same destination
+ */
+struct efa_admin_create_ah_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* Destination address in network byte order */
+       u8 dest_addr[16];
+
+       /* PD number */
+       u16 pd;
+
+       u16 reserved;
+};
+
+struct efa_admin_create_ah_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /* Target interface address handle (opaque) */
+       u16 ah;
+
+       u16 reserved;
+};
+
+struct efa_admin_destroy_ah_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* Target interface address handle (opaque) */
+       u16 ah;
+
+       /* PD number */
+       u16 pd;
+};
+
+struct efa_admin_destroy_ah_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * Registration of a Memory Region (MR), required for QPs working with
+ * virtual addresses. In standard verbs semantics, region length is limited
+ * to 2GB of space, but EFA offers larger MR support, to ease the burden on
+ * users working with very large datasets (e.g. full GPU memory mapping).
+ */
+struct efa_admin_reg_mr_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* Protection Domain */
+       u16 pd;
+
+       /* MBZ */
+       u16 reserved16_w1;
+
+       /* Physical Buffer List, each element is page-aligned. */
+       union {
+               /*
+                * Inline array of guest-physical page addresses of user
+                * memory pages (optimization for short region
+                * registrations)
+                */
+               u64 inline_pbl_array[4];
+
+               /* points to PBL (direct or indirect, chained if needed) */
+               struct efa_admin_ctrl_buff_info pbl;
+       } pbl;
+
+       /* Memory region length, in bytes. */
+       u64 mr_length;
+
+       /*
+        * flags and page size
+        * 4:0 : phys_page_size_shift - page size is (1 <<
+        *    phys_page_size_shift). Page size is used for
+        *    building the Virtual to Physical address mapping
+        * 6:5 : reserved - MBZ
+        * 7 : mem_addr_phy_mode_en - Enable bit for physical
+        *    memory registration (no translation), can be used
+        *    only by privileged clients. If set, PBL must
+        *    contain a single entry.
+        */
+       u8 flags;
+
+       /*
+        * permissions
+        * 0 : local_write_enable - Write permissions: value
+        *    of 1 needed for RQ buffers and for RDMA write
+        * 7:1 : reserved1 - remote access flags, etc
+        */
+       u8 permissions;
+
+       u16 reserved16_w5;
+
+       /* number of pages in PBL (redundant, could be calculated) */
+       u32 page_num;
+
+       /*
+        * IO Virtual Address associated with this MR. If
+        * mem_addr_phy_mode_en is set, contains the physical address of
+        * the region.
+        */
+       u64 iova;
+};
+
+struct efa_admin_reg_mr_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /*
+        * L_Key, to be used in conjunction with local buffer references in
+        * SQ and RQ WQE, or with virtual RQ/CQ rings
+        */
+       u32 l_key;
+
+       /*
+        * R_Key, to be used in RDMA messages to refer to remotely accessed
+        * memory region
+        */
+       u32 r_key;
+};
+
+struct efa_admin_dereg_mr_cmd {
+       /* Common Admin Queue descriptor */
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /* L_Key, memory region's l_key */
+       u32 l_key;
+};
+
+struct efa_admin_dereg_mr_resp {
+       /* Common Admin Queue completion descriptor */
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_create_cq_cmd {
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       /*
+        * 4:0 : reserved5
+        * 5 : interrupt_mode_enabled - if set, cq operates
+        *    in interrupt mode (i.e. CQ events and MSI-X are
+        *    generated), otherwise - polling
+        * 6 : virt - If set, ring base address is virtual
+        *    (IOVA returned by MR registration)
+        * 7 : reserved6
+        */
+       u8 cq_caps_1;
+
+       /*
+        * 4:0 : cq_entry_size_words - size of CQ entry in
+        *    32-bit words, valid values: 4, 8.
+        * 7:5 : reserved7
+        */
+       u8 cq_caps_2;
+
+       /* completion queue depth in # of entries. must be power of 2 */
+       u16 cq_depth;
+
+       /* msix vector assigned to this cq */
+       u32 msix_vector_idx;
+
+       /*
+        * CQ ring base address, virtual or physical depending on 'virt'
+        * flag
+        */
+       struct efa_common_mem_addr cq_ba;
+
+       /*
+        * Memory registration key for the ring, used only when base
+        * address is virtual
+        */
+       u32 l_key;
+
+       /*
+        * number of sub cqs - must be equal to sub_cqs_per_cq of queue
+        *    attributes.
+        */
+       u16 num_sub_cqs;
+
+       /* UAR number */
+       u16 uar;
+};
+
+struct efa_admin_create_cq_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       u16 cq_idx;
+
+       /* actual cq depth in number of entries */
+       u16 cq_actual_depth;
+};
+
+struct efa_admin_destroy_cq_cmd {
+       struct efa_admin_aq_common_desc aq_common_desc;
+
+       u16 cq_idx;
+
+       u16 reserved1;
+};
+
+struct efa_admin_destroy_cq_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/*
+ * EFA AQ Get Statistics command. Extended statistics are placed in the
+ * control buffer pointed to by the AQ entry.
+ */
+struct efa_admin_aq_get_stats_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       union {
+               /* command specific inline data */
+               u32 inline_data_w1[3];
+
+               struct efa_admin_ctrl_buff_info control_buffer;
+       } u;
+
+       /* stats type as defined in enum efa_admin_get_stats_type */
+       u8 type;
+
+       /* stats scope defined in enum efa_admin_get_stats_scope */
+       u8 scope;
+
+       u16 scope_modifier;
+};
+
+struct efa_admin_basic_stats {
+       u64 tx_bytes;
+
+       u64 tx_pkts;
+
+       u64 rx_bytes;
+
+       u64 rx_pkts;
+
+       u64 rx_drops;
+};
+
+struct efa_admin_acq_get_stats_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       struct efa_admin_basic_stats basic_stats;
+};
+
+struct efa_admin_get_set_feature_common_desc {
+       /*
+        * 1:0 : select - 0x1 - current value; 0x3 - default
+        *    value
+        * 7:3 : reserved3
+        */
+       u8 flags;
+
+       /* as appears in efa_admin_aq_feature_id */
+       u8 feature_id;
+
+       /* MBZ */
+       u16 reserved16;
+};
+
+struct efa_admin_feature_device_attr_desc {
+       /* Bitmap of efa_admin_aq_feature_id */
+       u64 supported_features;
+
+       /* Bitmap of supported page sizes in MR registrations */
+       u64 page_size_cap;
+
+       u32 fw_version;
+
+       u32 admin_api_version;
+
+       u32 device_version;
+
+       /* BAR used for SQ and RQ doorbells */
+       u16 db_bar;
+
+       /* Indicates how many bits are used for physical address access */
+       u8 phys_addr_width;
+
+       /* Indicates how many bits are used for virtual address access */
+       u8 virt_addr_width;
+};
+
+struct efa_admin_feature_queue_attr_desc {
+       /* The maximum number of queue pairs supported */
+       u32 max_qp;
+
+       u32 max_sq_depth;
+
+       /* max send wr used in inline-buf */
+       u32 inline_buf_size;
+
+       u32 max_rq_depth;
+
+       /* The maximum number of completion queues supported per VF */
+       u32 max_cq;
+
+       u32 max_cq_depth;
+
+       /* Number of sub-CQs to be created for each CQ */
+       u16 sub_cqs_per_cq;
+
+       u16 reserved;
+
+       /*
+        * Maximum number of SGEs (buffs) allowed for a single send work
+        *    queue element (WQE)
+        */
+       u16 max_wr_send_sges;
+
+       /* Maximum number of SGEs allowed for a single recv WQE */
+       u16 max_wr_recv_sges;
+
+       /* The maximum number of memory regions supported */
+       u32 max_mr;
+
+       /* The maximum number of pages that can be registered */
+       u32 max_mr_pages;
+
+       /* The maximum number of protection domains supported */
+       u32 max_pd;
+
+       /* The maximum number of address handles supported */
+       u32 max_ah;
+
+       /* The maximum size of LLQ in bytes */
+       u32 max_llq_size;
+};
+
+struct efa_admin_feature_aenq_desc {
+       /* bitmask for AENQ groups the device can report */
+       u32 supported_groups;
+
+       /* bitmask for AENQ groups to report */
+       u32 enabled_groups;
+};
+
+struct efa_admin_feature_network_attr_desc {
+       /* Raw address data in network byte order */
+       u8 addr[16];
+
+       u32 mtu;
+};
+
+/*
+ * When hint value is 0, hints capabilities are not supported or driver
+ * should use its own predefined value
+ */
+struct efa_admin_hw_hints {
+       /* value in ms */
+       u16 mmio_read_timeout;
+
+       /* value in ms */
+       u16 driver_watchdog_timeout;
+
+       /* value in ms */
+       u16 admin_completion_timeout;
+
+       /* poll interval in ms */
+       u16 poll_interval;
+};
+
+struct efa_admin_get_feature_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       struct efa_admin_ctrl_buff_info control_buffer;
+
+       struct efa_admin_get_set_feature_common_desc feature_common;
+
+       u32 raw[11];
+};
+
+struct efa_admin_get_feature_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       union {
+               u32 raw[14];
+
+               struct efa_admin_feature_device_attr_desc device_attr;
+
+               struct efa_admin_feature_aenq_desc aenq;
+
+               struct efa_admin_feature_network_attr_desc network_attr;
+
+               struct efa_admin_feature_queue_attr_desc queue_attr;
+
+               struct efa_admin_hw_hints hw_hints;
+       } u;
+};
+
+struct efa_admin_set_feature_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       struct efa_admin_ctrl_buff_info control_buffer;
+
+       struct efa_admin_get_set_feature_common_desc feature_common;
+
+       union {
+               u32 raw[11];
+
+               /* AENQ configuration */
+               struct efa_admin_feature_aenq_desc aenq;
+       } u;
+};
+
+struct efa_admin_set_feature_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       union {
+               u32 raw[14];
+       } u;
+};
+
+struct efa_admin_alloc_pd_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+};
+
+struct efa_admin_alloc_pd_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /* PD number */
+       u16 pd;
+
+       /* MBZ */
+       u16 reserved;
+};
+
+struct efa_admin_dealloc_pd_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       /* PD number */
+       u16 pd;
+
+       /* MBZ */
+       u16 reserved;
+};
+
+struct efa_admin_dealloc_pd_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+struct efa_admin_alloc_uar_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+};
+
+struct efa_admin_alloc_uar_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+
+       /* UAR number */
+       u16 uar;
+
+       /* MBZ */
+       u16 reserved;
+};
+
+struct efa_admin_dealloc_uar_cmd {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       /* UAR number */
+       u16 uar;
+
+       /* MBZ */
+       u16 reserved;
+};
+
+struct efa_admin_dealloc_uar_resp {
+       struct efa_admin_acq_common_desc acq_common_desc;
+};
+
+/* asynchronous event notification groups */
+enum efa_admin_aenq_group {
+       EFA_ADMIN_FATAL_ERROR                       = 1,
+       EFA_ADMIN_WARNING                           = 2,
+       EFA_ADMIN_NOTIFICATION                      = 3,
+       EFA_ADMIN_KEEP_ALIVE                        = 4,
+       EFA_ADMIN_AENQ_GROUPS_NUM                   = 5,
+};
+
+enum efa_admin_aenq_notification_syndrom {
+       EFA_ADMIN_SUSPEND                           = 0,
+       EFA_ADMIN_RESUME                            = 1,
+       EFA_ADMIN_UPDATE_HINTS                      = 2,
+};
+
+struct efa_admin_mmio_req_read_less_resp {
+       u16 req_id;
+
+       u16 reg_off;
+
+       /* value is valid when poll is cleared */
+       u32 reg_val;
+};
+
+/* create_qp_cmd */
+#define EFA_ADMIN_CREATE_QP_CMD_SQ_VIRT_MASK                BIT(0)
+#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_SHIFT               1
+#define EFA_ADMIN_CREATE_QP_CMD_RQ_VIRT_MASK                BIT(1)
+
+/* reg_mr_cmd */
+#define EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK      GENMASK(4, 0)
+#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_SHIFT     7
+#define EFA_ADMIN_REG_MR_CMD_MEM_ADDR_PHY_MODE_EN_MASK      BIT(7)
+#define EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK        BIT(0)
+
+/* create_cq_cmd */
+#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_SHIFT 5
+#define EFA_ADMIN_CREATE_CQ_CMD_INTERRUPT_MODE_ENABLED_MASK BIT(5)
+#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_SHIFT                  6
+#define EFA_ADMIN_CREATE_CQ_CMD_VIRT_MASK                   BIT(6)
+#define EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK    GENMASK(4, 0)
+
+/* get_set_feature_common_desc */
+#define EFA_ADMIN_GET_SET_FEATURE_COMMON_DESC_SELECT_MASK   GENMASK(1, 0)
+
+#endif /* _EFA_ADMIN_CMDS_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_admin_defs.h b/drivers/infiniband/hw/efa/efa_admin_defs.h
new file mode 100644 (file)
index 0000000..c8e0c8b
--- /dev/null
@@ -0,0 +1,136 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_ADMIN_H_
+#define _EFA_ADMIN_H_
+
+enum efa_admin_aq_completion_status {
+       EFA_ADMIN_SUCCESS                           = 0,
+       EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE       = 1,
+       EFA_ADMIN_BAD_OPCODE                        = 2,
+       EFA_ADMIN_UNSUPPORTED_OPCODE                = 3,
+       EFA_ADMIN_MALFORMED_REQUEST                 = 4,
+       /* Additional status is provided in ACQ entry extended_status */
+       EFA_ADMIN_ILLEGAL_PARAMETER                 = 5,
+       EFA_ADMIN_UNKNOWN_ERROR                     = 6,
+       EFA_ADMIN_RESOURCE_BUSY                     = 7,
+};
+
+struct efa_admin_aq_common_desc {
+       /*
+        * 11:0 : command_id
+        * 15:12 : reserved12
+        */
+       u16 command_id;
+
+       /* as appears in efa_admin_aq_opcode */
+       u8 opcode;
+
+       /*
+        * 0 : phase
+        * 1 : ctrl_data - control buffer address valid
+        * 2 : ctrl_data_indirect - control buffer address
+        *    points to list of pages with addresses of control
+        *    buffers
+        * 7:3 : reserved3
+        */
+       u8 flags;
+};
+
+/*
+ * Used in efa_admin_aq_entry. Can point directly to control data, or to a
+ * page list chunk. Also used at the end of indirect mode page list chunks,
+ * for chaining.
+ */
+struct efa_admin_ctrl_buff_info {
+       u32 length;
+
+       struct efa_common_mem_addr address;
+};
+
+struct efa_admin_aq_entry {
+       struct efa_admin_aq_common_desc aq_common_descriptor;
+
+       union {
+               u32 inline_data_w1[3];
+
+               struct efa_admin_ctrl_buff_info control_buffer;
+       } u;
+
+       u32 inline_data_w4[12];
+};
+
+struct efa_admin_acq_common_desc {
+       /*
+        * command identifier to associate it with the aq descriptor
+        * 11:0 : command_id
+        * 15:12 : reserved12
+        */
+       u16 command;
+
+       u8 status;
+
+       /*
+        * 0 : phase
+        * 7:1 : reserved1
+        */
+       u8 flags;
+
+       u16 extended_status;
+
+       /*
+        * indicates to the driver which AQ entry has been consumed by the
+        *    device and could be reused
+        */
+       u16 sq_head_indx;
+};
+
+struct efa_admin_acq_entry {
+       struct efa_admin_acq_common_desc acq_common_descriptor;
+
+       u32 response_specific_data[14];
+};
+
+struct efa_admin_aenq_common_desc {
+       u16 group;
+
+       u16 syndrom;
+
+       /*
+        * 0 : phase
+        * 7:1 : reserved - MBZ
+        */
+       u8 flags;
+
+       u8 reserved1[3];
+
+       u32 timestamp_low;
+
+       u32 timestamp_high;
+};
+
+struct efa_admin_aenq_entry {
+       struct efa_admin_aenq_common_desc aenq_common_desc;
+
+       /* command specific inline data */
+       u32 inline_data_w4[12];
+};
+
+/* aq_common_desc */
+#define EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK            GENMASK(11, 0)
+#define EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK                 BIT(0)
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_SHIFT            1
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK             BIT(1)
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_SHIFT   2
+#define EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK    BIT(2)
+
+/* acq_common_desc */
+#define EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK           GENMASK(11, 0)
+#define EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK                BIT(0)
+
+/* aenq_common_desc */
+#define EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK               BIT(0)
+
+#endif /* _EFA_ADMIN_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_com.c b/drivers/infiniband/hw/efa/efa_com.c
new file mode 100644 (file)
index 0000000..a5c7887
--- /dev/null
@@ -0,0 +1,1160 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "efa_com.h"
+#include "efa_regs_defs.h"
+
+/*
+ * Default admin command completion timeout; used when the device
+ * capabilities register reports no timeout of its own.
+ */
+#define ADMIN_CMD_TIMEOUT_US 30000000 /* usecs */
+
+/* How long efa_com_reg_read32() polls for the device's response */
+#define EFA_REG_READ_TIMEOUT_US 50000 /* usecs */
+/* Value returned by efa_com_reg_read32() when the register read fails */
+#define EFA_MMIO_READ_INVALID 0xffffffff
+
+/* Sleep between two completion queue polls in polling mode */
+#define EFA_POLL_INTERVAL_MS 100 /* msecs */
+
+/*
+ * Queue depths. Ring indices are derived with "& (depth - 1)", so both
+ * values must remain powers of two.
+ */
+#define EFA_ASYNC_QUEUE_DEPTH 16
+#define EFA_ADMIN_QUEUE_DEPTH 32
+
+/*
+ * Admin API version packed with the version register's major shift and
+ * minor mask. NOTE(review): presumably compared against the version the
+ * device reports; the user of this macro is not visible here.
+ */
+#define MIN_EFA_VER\
+       ((EFA_ADMIN_API_VERSION_MAJOR << EFA_REGS_VERSION_MAJOR_VERSION_SHIFT) | \
+        (EFA_ADMIN_API_VERSION_MINOR & EFA_REGS_VERSION_MINOR_VERSION_MASK))
+
+#define EFA_CTRL_MAJOR          0
+#define EFA_CTRL_MINOR          0
+#define EFA_CTRL_SUB_MINOR      1
+
+/*
+ * EFA_CTRL_* packed with the controller version register's major and
+ * minor shifts; the sub-minor part occupies the low bits unshifted.
+ */
+#define MIN_EFA_CTRL_VER \
+       (((EFA_CTRL_MAJOR) << \
+       (EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT)) | \
+       ((EFA_CTRL_MINOR) << \
+       (EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT)) | \
+       (EFA_CTRL_SUB_MINOR))
+
+/*
+ * Split a DMA address into 32-bit halves. The value is widened to u64
+ * first so the shift is well defined whatever the width of the argument.
+ */
+#define EFA_DMA_ADDR_TO_UINT32_LOW(x)   ((u32)((u64)(x)))
+#define EFA_DMA_ADDR_TO_UINT32_HIGH(x)  ((u32)(((u64)(x)) >> 32))
+
+/* Written to the interrupt mask register when polling mode is selected */
+#define EFA_REGS_ADMIN_INTR_MASK 1
+
+/* Driver-side state of an admin command tracked by a completion context */
+enum efa_cmd_status {
+       /* Set when the command is placed on the submission queue */
+       EFA_CMD_SUBMITTED,
+       /* Set when a completion entry for the command has been processed */
+       EFA_CMD_COMPLETED,
+       /* Abort - canceled by the driver */
+       EFA_CMD_ABORTED,
+};
+
+/*
+ * Per-command completion context. One exists for every admin queue slot;
+ * the slot index is used as the command ID.
+ */
+struct efa_comp_ctx {
+       /* signalled on completion (interrupt mode) and on abort */
+       struct completion wait_event;
+       /* caller's buffer that receives a copy of the completion entry */
+       struct efa_admin_acq_entry *user_cqe;
+       /* number of bytes copied into user_cqe */
+       u32 comp_size;
+       enum efa_cmd_status status;
+       /* status from the device */
+       u8 comp_status;
+       /* opcode of the submitted command, kept for log messages */
+       u8 cmd_opcode;
+       /* non-zero while the context is owned by an in-flight command */
+       u8 occupied;
+};
+
+/* Map an admin command opcode to its name, for use in log messages. */
+static const char *efa_com_cmd_str(u8 cmd)
+{
+       const char *name = "unknown command opcode";
+
+#define EFA_CMD_NAME(_op) \
+       case EFA_ADMIN_##_op: \
+               name = #_op; \
+               break
+
+       switch (cmd) {
+       EFA_CMD_NAME(CREATE_QP);
+       EFA_CMD_NAME(MODIFY_QP);
+       EFA_CMD_NAME(QUERY_QP);
+       EFA_CMD_NAME(DESTROY_QP);
+       EFA_CMD_NAME(CREATE_AH);
+       EFA_CMD_NAME(DESTROY_AH);
+       EFA_CMD_NAME(REG_MR);
+       EFA_CMD_NAME(DEREG_MR);
+       EFA_CMD_NAME(CREATE_CQ);
+       EFA_CMD_NAME(DESTROY_CQ);
+       EFA_CMD_NAME(GET_FEATURE);
+       EFA_CMD_NAME(SET_FEATURE);
+       EFA_CMD_NAME(GET_STATS);
+       EFA_CMD_NAME(ALLOC_PD);
+       EFA_CMD_NAME(DEALLOC_PD);
+       EFA_CMD_NAME(ALLOC_UAR);
+       EFA_CMD_NAME(DEALLOC_UAR);
+       default:
+               break;
+       }
+#undef EFA_CMD_NAME
+
+       return name;
+}
+
+/*
+ * Read a 32-bit device register indirectly: the register offset and a
+ * sequence number are written to the MMIO_REG_READ register, and the
+ * answer is polled for in the read_resp DMA buffer.
+ *
+ * Returns the register value, or EFA_MMIO_READ_INVALID when no matching
+ * response shows up within mmio_read_timeout usecs or when the response
+ * carries a different register offset.
+ */
+static u32 efa_com_reg_read32(struct efa_com_dev *edev, u16 offset)
+{
+       struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+       struct efa_admin_mmio_req_read_less_resp *read_resp;
+       unsigned long exp_time;
+       u32 mmio_read_reg;
+       /* holds the value to return (register content), not an errno */
+       u32 err;
+
+       read_resp = mmio_read->read_resp;
+
+       /* serialize readers: there is a single response buffer */
+       spin_lock(&mmio_read->lock);
+       mmio_read->seq_num++;
+
+       /* trash DMA req_id to identify when hardware is done */
+       read_resp->req_id = mmio_read->seq_num + 0x9aL;
+       /* request word: register offset field combined with the req id */
+       mmio_read_reg = (offset << EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT) &
+                       EFA_REGS_MMIO_REG_READ_REG_OFF_MASK;
+       mmio_read_reg |= mmio_read->seq_num &
+                        EFA_REGS_MMIO_REG_READ_REQ_ID_MASK;
+
+       writel(mmio_read_reg, edev->reg_bar + EFA_REGS_MMIO_REG_READ_OFF);
+
+       /* busy-wait until req_id matches our sequence number or we time out */
+       exp_time = jiffies + usecs_to_jiffies(mmio_read->mmio_read_timeout);
+       do {
+               if (READ_ONCE(read_resp->req_id) == mmio_read->seq_num)
+                       break;
+               udelay(1);
+       } while (time_is_after_jiffies(exp_time));
+
+       /* re-check: the loop also exits on timeout */
+       if (read_resp->req_id != mmio_read->seq_num) {
+               ibdev_err(edev->efa_dev,
+                         "Reading register timed out. expected: req id[%u] offset[%#x] actual: req id[%u] offset[%#x]\n",
+                         mmio_read->seq_num, offset, read_resp->req_id,
+                         read_resp->reg_off);
+               err = EFA_MMIO_READ_INVALID;
+               goto out;
+       }
+
+       /* the response must be for the register we asked about */
+       if (read_resp->reg_off != offset) {
+               ibdev_err(edev->efa_dev,
+                         "Reading register failed: wrong offset provided\n");
+               err = EFA_MMIO_READ_INVALID;
+               goto out;
+       }
+
+       err = read_resp->reg_val;
+out:
+       spin_unlock(&mmio_read->lock);
+       return err;
+}
+
+/*
+ * Allocate the admin submission queue ring, reset its software state and
+ * program the ring address and capabilities into the device.
+ *
+ * Returns 0 on success or -ENOMEM when the ring cannot be allocated.
+ */
+static int efa_com_admin_init_sq(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       struct efa_com_admin_sq *sq = &aq->sq;
+       u16 ring_bytes = aq->depth * sizeof(*sq->entries);
+       u32 caps;
+
+       sq->entries = dma_alloc_coherent(aq->dmadev, ring_bytes, &sq->dma_addr,
+                                        GFP_KERNEL);
+       if (!sq->entries)
+               return -ENOMEM;
+
+       spin_lock_init(&sq->lock);
+
+       sq->phase = 1;
+       sq->pc = 0;
+       sq->cc = 0;
+
+       sq->db_addr = (u32 __iomem *)(edev->reg_bar + EFA_REGS_AQ_PROD_DB_OFF);
+
+       /* Ring base address: low word, then high word */
+       writel(EFA_DMA_ADDR_TO_UINT32_LOW(sq->dma_addr),
+              edev->reg_bar + EFA_REGS_AQ_BASE_LO_OFF);
+       writel(EFA_DMA_ADDR_TO_UINT32_HIGH(sq->dma_addr),
+              edev->reg_bar + EFA_REGS_AQ_BASE_HI_OFF);
+
+       /* Capabilities: queue depth and size of a single entry */
+       caps = (sizeof(struct efa_admin_aq_entry) <<
+               EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT) &
+              EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK;
+       caps |= aq->depth & EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK;
+
+       writel(caps, edev->reg_bar + EFA_REGS_AQ_CAPS_OFF);
+
+       return 0;
+}
+
+/*
+ * Allocate the admin completion queue ring, reset its software state and
+ * program the ring address and capabilities (depth, entry size, MSI-X
+ * vector) into the device.
+ *
+ * Returns 0 on success or -ENOMEM when the ring cannot be allocated.
+ */
+static int efa_com_admin_init_cq(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       struct efa_com_admin_cq *cq = &aq->cq;
+       u16 ring_bytes = aq->depth * sizeof(*cq->entries);
+       u32 caps;
+
+       cq->entries = dma_alloc_coherent(aq->dmadev, ring_bytes, &cq->dma_addr,
+                                        GFP_KERNEL);
+       if (!cq->entries)
+               return -ENOMEM;
+
+       spin_lock_init(&cq->lock);
+
+       cq->phase = 1;
+       cq->cc = 0;
+
+       /* Ring base address: low word, then high word */
+       writel(EFA_DMA_ADDR_TO_UINT32_LOW(cq->dma_addr),
+              edev->reg_bar + EFA_REGS_ACQ_BASE_LO_OFF);
+       writel(EFA_DMA_ADDR_TO_UINT32_HIGH(cq->dma_addr),
+              edev->reg_bar + EFA_REGS_ACQ_BASE_HI_OFF);
+
+       caps = (aq->msix_vector_idx <<
+               EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT) &
+              EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK;
+       caps |= (sizeof(struct efa_admin_acq_entry) <<
+                EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT) &
+               EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK;
+       caps |= aq->depth & EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK;
+
+       writel(caps, edev->reg_bar + EFA_REGS_ACQ_CAPS_OFF);
+
+       return 0;
+}
+
+/*
+ * Allocate the asynchronous event notification queue, record the event
+ * handlers and program the ring address, capabilities and initial consumer
+ * index into the device.
+ *
+ * Returns 0 on success, -EINVAL when no handlers are supplied, or -ENOMEM
+ * when the ring cannot be allocated.
+ */
+static int efa_com_admin_init_aenq(struct efa_com_dev *edev,
+                                  struct efa_aenq_handlers *aenq_handlers)
+{
+       struct efa_com_aenq *aenq = &edev->aenq;
+       u16 ring_bytes = EFA_ASYNC_QUEUE_DEPTH * sizeof(*aenq->entries);
+       u32 caps;
+
+       if (!aenq_handlers) {
+               ibdev_err(edev->efa_dev, "aenq handlers pointer is NULL\n");
+               return -EINVAL;
+       }
+
+       aenq->entries = dma_alloc_coherent(edev->dmadev, ring_bytes,
+                                          &aenq->dma_addr, GFP_KERNEL);
+       if (!aenq->entries)
+               return -ENOMEM;
+
+       aenq->phase = 1;
+       aenq->cc = 0;
+       aenq->depth = EFA_ASYNC_QUEUE_DEPTH;
+       aenq->aenq_handlers = aenq_handlers;
+
+       /* Ring base address: low word, then high word */
+       writel(EFA_DMA_ADDR_TO_UINT32_LOW(aenq->dma_addr),
+              edev->reg_bar + EFA_REGS_AENQ_BASE_LO_OFF);
+       writel(EFA_DMA_ADDR_TO_UINT32_HIGH(aenq->dma_addr),
+              edev->reg_bar + EFA_REGS_AENQ_BASE_HI_OFF);
+
+       caps = aenq->depth & EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK;
+       caps |= (sizeof(struct efa_admin_aenq_entry) <<
+                EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT) &
+               EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK;
+       caps |= (aenq->msix_vector_idx <<
+                EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT) &
+               EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK;
+       writel(caps, edev->reg_bar + EFA_REGS_AENQ_CAPS_OFF);
+
+       /*
+        * Publish the initial consumer index so that every entry in the
+        * queue starts out available
+        */
+       writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF);
+
+       return 0;
+}
+
+/*
+ * Pop a free completion context ID off the pool; the result is meant to be
+ * passed to efa_com_get_comp_ctx.
+ */
+static u16 efa_com_alloc_ctx_id(struct efa_com_admin_queue *aq)
+{
+       u16 id;
+
+       spin_lock(&aq->comp_ctx_lock);
+       id = aq->comp_ctx_pool[aq->comp_ctx_pool_next++];
+       spin_unlock(&aq->comp_ctx_lock);
+
+       return id;
+}
+
+/* Push a completion context ID back onto the free pool. */
+static void efa_com_dealloc_ctx_id(struct efa_com_admin_queue *aq,
+                                  u16 ctx_id)
+{
+       spin_lock(&aq->comp_ctx_lock);
+       aq->comp_ctx_pool[--aq->comp_ctx_pool_next] = ctx_id;
+       spin_unlock(&aq->comp_ctx_lock);
+}
+
+/*
+ * Release a completion context and return its ID to the free pool.
+ *
+ * The ID is derived from the context's position in aq->comp_ctx instead of
+ * being read back from the user completion buffer: when a command times out
+ * or is aborted the device never filled that buffer, so its command field
+ * does not identify this context and a bogus ID would be recycled.
+ */
+static inline void efa_com_put_comp_ctx(struct efa_com_admin_queue *aq,
+                                       struct efa_comp_ctx *comp_ctx)
+{
+       u16 ctx_id = comp_ctx - aq->comp_ctx;
+
+       ibdev_dbg(aq->efa_dev, "Putting completion command_id %d\n", ctx_id);
+       comp_ctx->occupied = 0;
+       efa_com_dealloc_ctx_id(aq, ctx_id);
+}
+
+/*
+ * Look up the completion context that belongs to @command_id.
+ *
+ * With @capture set the context is additionally marked as occupied; the
+ * lookup then fails if it is already occupied. Returns NULL on failure or
+ * when @command_id is outside the queue.
+ */
+static struct efa_comp_ctx *efa_com_get_comp_ctx(struct efa_com_admin_queue *aq,
+                                                u16 command_id, bool capture)
+{
+       struct efa_comp_ctx *ctx;
+
+       if (command_id >= aq->depth) {
+               ibdev_err(aq->efa_dev,
+                         "command id is larger than the queue size. cmd_id: %u queue size %d\n",
+                         command_id, aq->depth);
+               return NULL;
+       }
+
+       ctx = &aq->comp_ctx[command_id];
+       if (!capture)
+               return ctx;
+
+       if (ctx->occupied) {
+               ibdev_err(aq->efa_dev, "Completion context is occupied\n");
+               return NULL;
+       }
+
+       ctx->occupied = 1;
+       ibdev_dbg(aq->efa_dev, "Taking completion ctxt command_id %d\n",
+                 command_id);
+
+       return ctx;
+}
+
+/*
+ * Place one command on the admin submission queue and ring the doorbell.
+ * The only caller visible here, efa_com_submit_admin_cmd(), invokes this
+ * with aq->sq.lock held.
+ *
+ * @cmd is stamped with the current phase bit and a freshly allocated
+ * command ID, then copied into the ring. @comp/@comp_size_in_bytes are
+ * stored in the completion context so the completion handler can copy the
+ * device's answer back.
+ *
+ * Returns the completion context tracking the command, or ERR_PTR(-EINVAL)
+ * when no context could be captured for the allocated ID.
+ */
+static struct efa_comp_ctx *__efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
+                                                      struct efa_admin_aq_entry *cmd,
+                                                      size_t cmd_size_in_bytes,
+                                                      struct efa_admin_acq_entry *comp,
+                                                      size_t comp_size_in_bytes)
+{
+       struct efa_comp_ctx *comp_ctx;
+       u16 queue_size_mask;
+       u16 ctx_id;
+       u16 pi;
+
+       /* ring slot of the producer counter */
+       queue_size_mask = aq->depth - 1;
+       pi = aq->sq.pc & queue_size_mask;
+
+       ctx_id = efa_com_alloc_ctx_id(aq);
+
+       /* both fields are OR-ed in, so they are expected to arrive zeroed */
+       cmd->aq_common_descriptor.flags |= aq->sq.phase &
+               EFA_ADMIN_AQ_COMMON_DESC_PHASE_MASK;
+
+       cmd->aq_common_descriptor.command_id |= ctx_id &
+               EFA_ADMIN_AQ_COMMON_DESC_COMMAND_ID_MASK;
+
+       comp_ctx = efa_com_get_comp_ctx(aq, ctx_id, true);
+       if (!comp_ctx) {
+               efa_com_dealloc_ctx_id(aq, ctx_id);
+               return ERR_PTR(-EINVAL);
+       }
+
+       comp_ctx->status = EFA_CMD_SUBMITTED;
+       comp_ctx->comp_size = comp_size_in_bytes;
+       comp_ctx->user_cqe = comp;
+       comp_ctx->cmd_opcode = cmd->aq_common_descriptor.opcode;
+
+       /* the context is reused across commands; re-arm its completion */
+       reinit_completion(&comp_ctx->wait_event);
+
+       memcpy(&aq->sq.entries[pi], cmd, cmd_size_in_bytes);
+
+       aq->sq.pc++;
+       atomic64_inc(&aq->stats.submitted_cmd);
+
+       /* the producer wrapped around the ring: flip the phase bit */
+       if ((aq->sq.pc & queue_size_mask) == 0)
+               aq->sq.phase = !aq->sq.phase;
+
+       /* barrier not needed in case of writel */
+       writel(aq->sq.pc, aq->sq.db_addr);
+
+       return comp_ctx;
+}
+
+/*
+ * Allocate and initialise the per-command completion contexts and the pool
+ * of free context IDs (initially 0 .. depth - 1, all free).
+ *
+ * Returns 0 on success or -ENOMEM when either array cannot be allocated;
+ * in that case nothing stays allocated and both pointers are reset to NULL.
+ */
+static inline int efa_com_init_comp_ctxt(struct efa_com_admin_queue *aq)
+{
+       struct efa_comp_ctx *comp_ctx;
+       u16 i;
+
+       aq->comp_ctx = devm_kcalloc(aq->dmadev, aq->depth,
+                                   sizeof(*aq->comp_ctx), GFP_KERNEL);
+       aq->comp_ctx_pool = devm_kcalloc(aq->dmadev, aq->depth,
+                                        sizeof(*aq->comp_ctx_pool),
+                                        GFP_KERNEL);
+       if (!aq->comp_ctx || !aq->comp_ctx_pool) {
+               /*
+                * Only hand back what was really allocated: devm_kfree()
+                * may WARN when given a pointer devres does not track,
+                * which includes NULL.
+                */
+               if (aq->comp_ctx_pool)
+                       devm_kfree(aq->dmadev, aq->comp_ctx_pool);
+               if (aq->comp_ctx)
+                       devm_kfree(aq->dmadev, aq->comp_ctx);
+               aq->comp_ctx_pool = NULL;
+               aq->comp_ctx = NULL;
+               return -ENOMEM;
+       }
+
+       for (i = 0; i < aq->depth; i++) {
+               comp_ctx = efa_com_get_comp_ctx(aq, i, false);
+               if (comp_ctx)
+                       init_completion(&comp_ctx->wait_event);
+
+               aq->comp_ctx_pool[i] = i;
+       }
+
+       spin_lock_init(&aq->comp_ctx_lock);
+
+       aq->comp_ctx_pool_next = 0;
+
+       return 0;
+}
+
+/*
+ * Locked wrapper around __efa_com_submit_admin_cmd().
+ *
+ * Refuses to submit (ERR_PTR(-ENODEV)) when the admin queue is not in the
+ * running state, and takes the queue out of the running state when the
+ * submission itself fails.
+ */
+static struct efa_comp_ctx *efa_com_submit_admin_cmd(struct efa_com_admin_queue *aq,
+                                                    struct efa_admin_aq_entry *cmd,
+                                                    size_t cmd_size_in_bytes,
+                                                    struct efa_admin_acq_entry *comp,
+                                                    size_t comp_size_in_bytes)
+{
+       struct efa_comp_ctx *ctx;
+
+       spin_lock(&aq->sq.lock);
+       if (test_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state)) {
+               ctx = __efa_com_submit_admin_cmd(aq, cmd, cmd_size_in_bytes,
+                                                comp, comp_size_in_bytes);
+               spin_unlock(&aq->sq.lock);
+
+               if (IS_ERR(ctx))
+                       clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+
+               return ctx;
+       }
+
+       ibdev_err(aq->efa_dev, "Admin queue is closed\n");
+       spin_unlock(&aq->sq.lock);
+
+       return ERR_PTR(-ENODEV);
+}
+
+/*
+ * Process one completion entry: find the matching completion context, mark
+ * it completed, record the device status, copy the entry to the caller's
+ * buffer and, unless the queue is in polling mode, wake the waiter.
+ */
+static void efa_com_handle_single_admin_completion(struct efa_com_admin_queue *aq,
+                                                  struct efa_admin_acq_entry *cqe)
+{
+       u16 id = cqe->acq_common_descriptor.command &
+                EFA_ADMIN_ACQ_COMMON_DESC_COMMAND_ID_MASK;
+       struct efa_comp_ctx *ctx = efa_com_get_comp_ctx(aq, id, false);
+
+       if (!ctx) {
+               ibdev_err(aq->efa_dev,
+                         "comp_ctx is NULL. Changing the admin queue running state\n");
+               clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+               return;
+       }
+
+       ctx->status = EFA_CMD_COMPLETED;
+       ctx->comp_status = cqe->acq_common_descriptor.status;
+       if (ctx->user_cqe)
+               memcpy(ctx->user_cqe, cqe, ctx->comp_size);
+
+       if (test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state))
+               return;
+
+       complete(&ctx->wait_event);
+}
+
+/*
+ * Drain the admin completion queue: handle every entry whose phase bit
+ * matches the expected phase, then advance the consumer counters and the
+ * completed-commands statistic.
+ */
+static void efa_com_handle_admin_completion(struct efa_com_admin_queue *aq)
+{
+       u16 idx = aq->cq.cc & (aq->depth - 1);
+       u8 phase = aq->cq.phase;
+       struct efa_admin_acq_entry *cqe;
+       u16 handled = 0;
+
+       for (cqe = &aq->cq.entries[idx];
+            (READ_ONCE(cqe->acq_common_descriptor.flags) &
+             EFA_ADMIN_ACQ_COMMON_DESC_PHASE_MASK) == phase;
+            cqe = &aq->cq.entries[idx]) {
+               /*
+                * The remainder of the entry must not be read before the
+                * phase bit has been validated
+                */
+               dma_rmb();
+               efa_com_handle_single_admin_completion(aq, cqe);
+
+               handled++;
+               if (++idx == aq->depth) {
+                       /* wrapped: the expected phase flips */
+                       idx = 0;
+                       phase = !phase;
+               }
+       }
+
+       aq->cq.phase = phase;
+       aq->cq.cc += handled;
+       aq->sq.cc += handled;
+       atomic64_add(handled, &aq->stats.completed_cmd);
+}
+
+/* Translate a device completion status into a negative errno (0 on success). */
+static int efa_com_comp_status_to_errno(u8 comp_status)
+{
+       if (comp_status == EFA_ADMIN_SUCCESS)
+               return 0;
+
+       if (comp_status == EFA_ADMIN_RESOURCE_ALLOCATION_FAILURE)
+               return -ENOMEM;
+
+       if (comp_status == EFA_ADMIN_UNSUPPORTED_OPCODE)
+               return -EOPNOTSUPP;
+
+       /*
+        * EFA_ADMIN_BAD_OPCODE, EFA_ADMIN_MALFORMED_REQUEST,
+        * EFA_ADMIN_ILLEGAL_PARAMETER, EFA_ADMIN_UNKNOWN_ERROR and every
+        * status not listed above
+        */
+       return -EINVAL;
+}
+
+/*
+ * Polling-mode wait: repeatedly drain the completion queue, sleeping
+ * aq->poll_interval msecs between rounds, until the command leaves the
+ * submitted state or aq->completion_timeout usecs have elapsed.
+ *
+ * Returns the errno matching the device status, -ENODEV when the command
+ * was aborted, or -ETIME on timeout (the queue is then taken out of the
+ * running state). The completion context is released in every case.
+ */
+static int efa_com_wait_and_process_admin_cq_polling(struct efa_comp_ctx *comp_ctx,
+                                                    struct efa_com_admin_queue *aq)
+{
+       unsigned long deadline = jiffies +
+                                usecs_to_jiffies(aq->completion_timeout);
+       unsigned long flags;
+       int err;
+
+       for (;;) {
+               spin_lock_irqsave(&aq->cq.lock, flags);
+               efa_com_handle_admin_completion(aq);
+               spin_unlock_irqrestore(&aq->cq.lock, flags);
+
+               if (comp_ctx->status != EFA_CMD_SUBMITTED)
+                       break;
+
+               if (time_is_before_jiffies(deadline)) {
+                       ibdev_err(aq->efa_dev,
+                                 "Wait for completion (polling) timeout\n");
+                       /* the device produced no completion at all */
+                       atomic64_inc(&aq->stats.no_completion);
+
+                       clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+                       err = -ETIME;
+                       goto release;
+               }
+
+               msleep(aq->poll_interval);
+       }
+
+       if (comp_ctx->status != EFA_CMD_ABORTED) {
+               WARN_ONCE(comp_ctx->status != EFA_CMD_COMPLETED,
+                         "Invalid completion status %d\n", comp_ctx->status);
+
+               err = efa_com_comp_status_to_errno(comp_ctx->comp_status);
+       } else {
+               ibdev_err(aq->efa_dev, "Command was aborted\n");
+               atomic64_inc(&aq->stats.aborted_cmd);
+               err = -ENODEV;
+       }
+
+release:
+       efa_com_put_comp_ctx(aq, comp_ctx);
+       return err;
+}
+
+/*
+ * Interrupt-mode wait: sleep on the command's completion for at most
+ * aq->completion_timeout usecs.
+ *
+ * Returns the errno matching the device status, -ENODEV when the command
+ * was aborted by the driver, or -ETIME when it was still marked submitted
+ * after the wait (the queue is then taken out of the running state). The
+ * completion context is released in every case.
+ */
+static int efa_com_wait_and_process_admin_cq_interrupts(struct efa_comp_ctx *comp_ctx,
+                                                       struct efa_com_admin_queue *aq)
+{
+       unsigned long flags;
+       int err;
+
+       wait_for_completion_timeout(&comp_ctx->wait_event,
+                                   usecs_to_jiffies(aq->completion_timeout));
+
+       /*
+        * In case the command wasn't completed find out the root cause.
+        * There might be 2 kinds of errors
+        * 1) No completion (timeout reached)
+        * 2) There is completion but the device didn't get any msi-x interrupt.
+        */
+       if (comp_ctx->status == EFA_CMD_SUBMITTED) {
+               /* drain the queue by hand to tell the two cases apart */
+               spin_lock_irqsave(&aq->cq.lock, flags);
+               efa_com_handle_admin_completion(aq);
+               spin_unlock_irqrestore(&aq->cq.lock, flags);
+
+               atomic64_inc(&aq->stats.no_completion);
+
+               if (comp_ctx->status == EFA_CMD_COMPLETED)
+                       ibdev_err(aq->efa_dev,
+                                 "The device sent a completion but the driver didn't receive any MSI-X interrupt for admin cmd %s(%d) status %d (ctx: 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n",
+                                 efa_com_cmd_str(comp_ctx->cmd_opcode),
+                                 comp_ctx->cmd_opcode, comp_ctx->status,
+                                 comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc);
+               else
+                       ibdev_err(aq->efa_dev,
+                                 "The device didn't send any completion for admin cmd %s(%d) status %d (ctx 0x%p, sq producer: %d, sq consumer: %d, cq consumer: %d)\n",
+                                 efa_com_cmd_str(comp_ctx->cmd_opcode),
+                                 comp_ctx->cmd_opcode, comp_ctx->status,
+                                 comp_ctx, aq->sq.pc, aq->sq.cc, aq->cq.cc);
+
+               clear_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+               err = -ETIME;
+               goto out;
+       }
+
+       /*
+        * An aborted command never received a device status; comp_status
+        * still holds whatever an earlier user of this context left there,
+        * so it must not be translated into a return code.
+        */
+       if (comp_ctx->status == EFA_CMD_ABORTED) {
+               ibdev_err(aq->efa_dev, "Command was aborted\n");
+               atomic64_inc(&aq->stats.aborted_cmd);
+               err = -ENODEV;
+               goto out;
+       }
+
+       err = efa_com_comp_status_to_errno(comp_ctx->comp_status);
+out:
+       efa_com_put_comp_ctx(aq, comp_ctx);
+       return err;
+}
+
+/*
+ * Wait for an admin command to finish, using whichever mechanism matches
+ * the queue's current mode:
+ * Polling mode - the completion queue is drained periodically until the
+ * command completes or the timeout expires.
+ * Interrupt mode - sleep on the command's completion until it is signalled
+ * or the timeout expires; the interrupt handler is expected to call
+ * efa_com_handle_admin_completion to mark the completions.
+ */
+static int efa_com_wait_and_process_admin_cq(struct efa_comp_ctx *comp_ctx,
+                                            struct efa_com_admin_queue *aq)
+{
+       bool polling = test_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state);
+
+       return polling ?
+               efa_com_wait_and_process_admin_cq_polling(comp_ctx, aq) :
+               efa_com_wait_and_process_admin_cq_interrupts(comp_ctx, aq);
+}
+
+/**
+ * efa_com_cmd_exec - Execute admin command
+ * @aq: admin queue.
+ * @cmd: the admin command to execute.
+ * @cmd_size: the command size.
+ * @comp: command completion return entry.
+ * @comp_size: command completion size.
+ *
+ * Submit an admin command and sleep until the device returns a completion
+ * for it, which is copied into @comp. The number of commands in flight is
+ * bounded by the avail_cmds semaphore, so the call may block before
+ * submitting when the queue is full.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
+                    struct efa_admin_aq_entry *cmd,
+                    size_t cmd_size,
+                    struct efa_admin_acq_entry *comp,
+                    size_t comp_size)
+{
+       struct efa_comp_ctx *comp_ctx;
+       int err;
+
+       might_sleep();
+
+       /* Blocks while the queue is full */
+       down(&aq->avail_cmds);
+
+       ibdev_dbg(aq->efa_dev, "%s (opcode %d)\n",
+                 efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+                 cmd->aq_common_descriptor.opcode);
+       comp_ctx = efa_com_submit_admin_cmd(aq, cmd, cmd_size, comp, comp_size);
+       if (!IS_ERR(comp_ctx)) {
+               err = efa_com_wait_and_process_admin_cq(comp_ctx, aq);
+               if (err)
+                       ibdev_err(aq->efa_dev,
+                                 "Failed to process command %s (opcode %u) comp_status %d err %d\n",
+                                 efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+                                 cmd->aq_common_descriptor.opcode,
+                                 comp_ctx->comp_status, err);
+       } else {
+               ibdev_err(aq->efa_dev,
+                         "Failed to submit command %s (opcode %u) err %ld\n",
+                         efa_com_cmd_str(cmd->aq_common_descriptor.opcode),
+                         cmd->aq_common_descriptor.opcode, PTR_ERR(comp_ctx));
+               err = PTR_ERR(comp_ctx);
+       }
+
+       up(&aq->avail_cmds);
+
+       return err;
+}
+
+/**
+ * efa_com_abort_admin_commands - Abort all the outstanding admin commands.
+ * @edev: EFA communication layer struct
+ *
+ * Mark every completion context as aborted and wake its waiter, with both
+ * the submission and the completion queue locks held.
+ * The caller should then call efa_com_wait_for_abort_completion to make sure
+ * all the commands were completed.
+ */
+static void efa_com_abort_admin_commands(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       unsigned long irq_flags;
+       u16 id;
+
+       spin_lock(&aq->sq.lock);
+       spin_lock_irqsave(&aq->cq.lock, irq_flags);
+       for (id = 0; id < aq->depth; id++) {
+               struct efa_comp_ctx *ctx = efa_com_get_comp_ctx(aq, id, false);
+
+               if (!ctx)
+                       break;
+
+               ctx->status = EFA_CMD_ABORTED;
+
+               complete(&ctx->wait_event);
+       }
+       spin_unlock_irqrestore(&aq->cq.lock, irq_flags);
+       spin_unlock(&aq->sq.lock);
+}
+
+/**
+ * efa_com_wait_for_abort_completion - Wait for admin commands abort.
+ * @edev: EFA communication layer struct
+ *
+ * Wait until no admin command is in flight any more, by acquiring every
+ * slot of the avail_cmds semaphore and then handing them all back.
+ */
+static void efa_com_wait_for_abort_completion(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       int slot;
+
+       /* take every slot: each in-flight command holds one until it ends */
+       for (slot = 0; slot < aq->depth; slot++)
+               down(&aq->avail_cmds);
+
+       /* and release them again */
+       for (slot = 0; slot < aq->depth; slot++)
+               up(&aq->avail_cmds);
+}
+
+/*
+ * Stop the admin queue: refuse new submissions, abort whatever is
+ * outstanding and wait until every command has been released.
+ */
+static void efa_com_admin_flush(struct efa_com_dev *edev)
+{
+       clear_bit(EFA_AQ_STATE_RUNNING_BIT, &edev->aq.state);
+
+       efa_com_abort_admin_commands(edev);
+       efa_com_wait_for_abort_completion(edev);
+}
+
+/**
+ * efa_com_admin_destroy - Destroy the admin and the async events queues.
+ * @edev: EFA communication layer struct
+ *
+ * Flush the admin queue, then free the completion contexts, the context ID
+ * pool and the submission, completion and async event rings.
+ */
+void efa_com_admin_destroy(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       struct efa_com_aenq *aenq = &edev->aenq;
+       struct efa_com_admin_sq *sq = &aq->sq;
+       struct efa_com_admin_cq *cq = &aq->cq;
+       u16 sq_bytes = aq->depth * sizeof(*sq->entries);
+       u16 cq_bytes = aq->depth * sizeof(*cq->entries);
+       u16 aenq_bytes = aenq->depth * sizeof(*aenq->entries);
+
+       efa_com_admin_flush(edev);
+
+       devm_kfree(edev->dmadev, aq->comp_ctx_pool);
+       devm_kfree(edev->dmadev, aq->comp_ctx);
+
+       dma_free_coherent(edev->dmadev, sq_bytes, sq->entries, sq->dma_addr);
+       dma_free_coherent(edev->dmadev, cq_bytes, cq->entries, cq->dma_addr);
+       dma_free_coherent(edev->dmadev, aenq_bytes, aenq->entries,
+                         aenq->dma_addr);
+}
+
+/**
+ * efa_com_set_admin_polling_mode - Set the admin completion queue polling mode
+ * @edev: EFA communication layer struct
+ * @polling: Enable/Disable polling mode
+ *
+ * Program the interrupt mask register accordingly and record the selected
+ * mode in the admin queue state.
+ */
+void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling)
+{
+       u32 intr_mask = polling ? EFA_REGS_ADMIN_INTR_MASK : 0;
+
+       writel(intr_mask, edev->reg_bar + EFA_REGS_INTR_MASK_OFF);
+
+       if (!polling) {
+               clear_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state);
+               return;
+       }
+
+       set_bit(EFA_AQ_STATE_POLLING_BIT, &edev->aq.state);
+}
+
+/*
+ * Zero all admin queue statistics. The stats structure is walked as a
+ * plain array of atomic64_t counters.
+ */
+static void efa_com_stats_init(struct efa_com_dev *edev)
+{
+       atomic64_t *counter = (atomic64_t *)&edev->aq.stats;
+       atomic64_t *end = counter + sizeof(edev->aq.stats) / sizeof(*counter);
+
+       while (counter < end)
+               atomic64_set(counter++, 0);
+}
+
+/**
+ * efa_com_admin_init - Init the admin and the async queues
+ * @edev: EFA communication layer struct
+ * @aenq_handlers: Those handlers to be called upon event.
+ *
+ * Initialize the admin submission and completion queues.
+ * Initialize the asynchronous events notification queues.
+ * On failure everything allocated so far is released again.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_admin_init(struct efa_com_dev *edev,
+                      struct efa_aenq_handlers *aenq_handlers)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       u32 timeout;
+       u32 dev_sts;
+       u32 cap;
+       int err;
+
+       dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+       if (!(dev_sts & EFA_REGS_DEV_STS_READY_MASK)) {
+               ibdev_err(edev->efa_dev,
+                         "Device isn't ready, abort com init %#x\n", dev_sts);
+               return -ENODEV;
+       }
+
+       aq->depth = EFA_ADMIN_QUEUE_DEPTH;
+
+       aq->dmadev = edev->dmadev;
+       aq->efa_dev = edev->efa_dev;
+       set_bit(EFA_AQ_STATE_POLLING_BIT, &aq->state);
+
+       /* one semaphore slot per admin queue entry */
+       sema_init(&aq->avail_cmds, aq->depth);
+
+       efa_com_stats_init(edev);
+
+       err = efa_com_init_comp_ctxt(aq);
+       if (err)
+               return err;
+
+       err = efa_com_admin_init_sq(edev);
+       if (err)
+               goto err_destroy_comp_ctxt;
+
+       err = efa_com_admin_init_cq(edev);
+       if (err)
+               goto err_destroy_sq;
+
+       efa_com_set_admin_polling_mode(edev, false);
+
+       err = efa_com_admin_init_aenq(edev, aenq_handlers);
+       if (err)
+               goto err_destroy_cq;
+
+       cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+       timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
+                 EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+       if (timeout)
+               /* the resolution of timeout reg is 100ms */
+               aq->completion_timeout = timeout * 100000;
+       else
+               aq->completion_timeout = ADMIN_CMD_TIMEOUT_US;
+
+       aq->poll_interval = EFA_POLL_INTERVAL_MS;
+
+       set_bit(EFA_AQ_STATE_RUNNING_BIT, &aq->state);
+
+       return 0;
+
+err_destroy_cq:
+       dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->cq.entries),
+                         aq->cq.entries, aq->cq.dma_addr);
+err_destroy_sq:
+       dma_free_coherent(edev->dmadev, aq->depth * sizeof(*aq->sq.entries),
+                         aq->sq.entries, aq->sq.dma_addr);
+err_destroy_comp_ctxt:
+       /* both arrays were set up by efa_com_init_comp_ctxt(); undo both */
+       devm_kfree(edev->dmadev, aq->comp_ctx_pool);
+       devm_kfree(edev->dmadev, aq->comp_ctx);
+
+       return err;
+}
+
+/**
+ * efa_com_admin_q_comp_intr_handler - admin queue interrupt handler
+ * @edev: EFA communication layer struct
+ *
+ * Drain the admin completion queue under its lock, which wakes up every
+ * thread waiting on a command that has completed.
+ *
+ * @note: Should be called after MSI-X interrupt.
+ */
+void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev)
+{
+       struct efa_com_admin_queue *aq = &edev->aq;
+       unsigned long irq_flags;
+
+       spin_lock_irqsave(&aq->cq.lock, irq_flags);
+       efa_com_handle_admin_completion(aq);
+       spin_unlock_irqrestore(&aq->cq.lock, irq_flags);
+}
+
+/*
+ * efa_com_get_specific_aenq_cb:
+ * return the handler registered for the given event group, or the
+ * unimplemented handler when the group is out of range or has none
+ */
+static efa_aenq_handler efa_com_get_specific_aenq_cb(struct efa_com_dev *edev,
+                                                    u16 group)
+{
+       struct efa_aenq_handlers *table = edev->aenq.aenq_handlers;
+       efa_aenq_handler cb = NULL;
+
+       if (group < EFA_MAX_HANDLERS)
+               cb = table->handlers[group];
+
+       return cb ? cb : table->unimplemented_handler;
+}
+
+/**
+ * efa_com_aenq_intr_handler - AENQ interrupt handler
+ * @edev: EFA communication layer struct
+ * @data: Data of interrupt handler.
+ *
+ * Walk the async event notification queue, dispatch each new event to the
+ * handler of its group and, if anything was processed, report the new
+ * consumer index to the device.
+ */
+void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data)
+{
+       struct efa_com_aenq *aenq = &edev->aenq;
+       struct efa_admin_aenq_entry *entry;
+       u32 idx = aenq->cc & (aenq->depth - 1);
+       u8 phase = aenq->phase;
+       u32 handled = 0;
+
+       /* An entry is new while its phase bit matches the expected phase */
+       for (entry = &aenq->entries[idx];
+            (READ_ONCE(entry->aenq_common_desc.flags) &
+             EFA_ADMIN_AENQ_COMMON_DESC_PHASE_MASK) == phase;
+            entry = &aenq->entries[idx]) {
+               efa_aenq_handler cb;
+
+               /*
+                * The remainder of the entry must not be read before the
+                * phase bit has been validated
+                */
+               dma_rmb();
+
+               cb = efa_com_get_specific_aenq_cb(edev,
+                                                 entry->aenq_common_desc.group);
+               cb(data, entry);
+
+               handled++;
+               if (++idx == aenq->depth) {
+                       /* wrapped: the expected phase flips */
+                       idx = 0;
+                       phase = !phase;
+               }
+       }
+
+       aenq->cc += handled;
+       aenq->phase = phase;
+
+       /* Leave the doorbell alone when no event was processed */
+       if (handled)
+               /* barrier not needed in case of writel */
+               writel(aenq->cc, edev->reg_bar + EFA_REGS_AENQ_CONS_DB_OFF);
+}
+
+/*
+ * Program the device with the DMA address of the MMIO read response buffer.
+ *
+ * The address is split into its upper and lower 32 bits, which are written to
+ * the MMIO_RESP_HI and MMIO_RESP_LO registers, in that order.
+ */
+static void efa_com_mmio_reg_read_resp_addr_init(struct efa_com_dev *edev)
+{
+       struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+       u32 addr_high;
+       u32 addr_low;
+
+       /*
+        * dma_addr_bits is unknown at this point, so the raw address is used.
+        * upper_32_bits()/lower_32_bits() stay well defined whatever the
+        * width of dma_addr_t, unlike an open-coded ">> 32".
+        */
+       addr_high = upper_32_bits(mmio_read->read_resp_dma_addr);
+       addr_low = lower_32_bits(mmio_read->read_resp_dma_addr);
+
+       writel(addr_high, edev->reg_bar + EFA_REGS_MMIO_RESP_HI_OFF);
+       writel(addr_low, edev->reg_bar + EFA_REGS_MMIO_RESP_LO_OFF);
+}
+
+/*
+ * Prepare the MMIO read state: initialize its lock, allocate the coherent
+ * response buffer, program the buffer address into the device and reset the
+ * request id, sequence number and read timeout.
+ *
+ * Returns 0 on success or -ENOMEM if the response buffer cannot be allocated.
+ */
+int efa_com_mmio_reg_read_init(struct efa_com_dev *edev)
+{
+       struct efa_com_mmio_read *mmio_read = &edev->mmio_read;
+       size_t resp_size = sizeof(*mmio_read->read_resp);
+
+       spin_lock_init(&mmio_read->lock);
+
+       mmio_read->read_resp = dma_alloc_coherent(edev->dmadev, resp_size,
+                                                 &mmio_read->read_resp_dma_addr,
+                                                 GFP_KERNEL);
+       if (!mmio_read->read_resp)
+               return -ENOMEM;
+
+       /* Tell the device where the response buffer lives */
+       efa_com_mmio_reg_read_resp_addr_init(edev);
+
+       mmio_read->read_resp->req_id = 0;
+       mmio_read->seq_num = 0;
+       mmio_read->mmio_read_timeout = EFA_REG_READ_TIMEOUT_US;
+
+       return 0;
+}
+
+/* Free the coherent response buffer held in the device's MMIO read state. */
+void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev)
+{
+       struct efa_com_mmio_read *rd = &edev->mmio_read;
+       dma_addr_t resp_dma = rd->read_resp_dma_addr;
+       void *resp = rd->read_resp;
+
+       dma_free_coherent(edev->dmadev, sizeof(*rd->read_resp), resp, resp_dma);
+}
+
+/*
+ * Check that the device version and the controller version read from the
+ * registers are not lower than the minimum the driver expects.
+ *
+ * Returns 0 when both versions are acceptable and -EOPNOTSUPP otherwise.
+ */
+int efa_com_validate_version(struct efa_com_dev *edev)
+{
+       u32 dev_ver, ctrl_ver, ctrl_ver_no_impl;
+
+       /* Both version registers are read before anything is validated */
+       dev_ver = efa_com_reg_read32(edev, EFA_REGS_VERSION_OFF);
+       ctrl_ver = efa_com_reg_read32(edev, EFA_REGS_CONTROLLER_VERSION_OFF);
+
+       ibdev_dbg(edev->efa_dev, "efa device version: %d.%d\n",
+                 (dev_ver & EFA_REGS_VERSION_MAJOR_VERSION_MASK) >>
+                         EFA_REGS_VERSION_MAJOR_VERSION_SHIFT,
+                 dev_ver & EFA_REGS_VERSION_MINOR_VERSION_MASK);
+
+       if (dev_ver < MIN_EFA_VER) {
+               ibdev_err(edev->efa_dev,
+                         "EFA version is lower than the minimal version the driver supports\n");
+               return -EOPNOTSUPP;
+       }
+
+       ibdev_dbg(edev->efa_dev,
+                 "efa controller version: %d.%d.%d implementation version %d\n",
+                 (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK) >>
+                         EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT,
+                 (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK) >>
+                         EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT,
+                 (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK),
+                 (ctrl_ver & EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK) >>
+                         EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT);
+
+       /* Keep major, minor and subminor; the implementation ID is dropped */
+       ctrl_ver_no_impl = ctrl_ver &
+               (EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK |
+                EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK |
+                EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK);
+
+       if (ctrl_ver_no_impl >= MIN_EFA_CTRL_VER)
+               return 0;
+
+       ibdev_err(edev->efa_dev,
+                 "EFA ctrl version is lower than the minimal ctrl version the driver supports\n");
+       return -EOPNOTSUPP;
+}
+
+/**
+ * efa_com_get_dma_width - Retrieve physical dma address width the device
+ * supports.
+ * @edev: EFA communication layer struct
+ *
+ * Retrieve the maximum physical address bits the device can handle.
+ *
+ * @return: > 0 on Success and negative value otherwise.
+ */
+int efa_com_get_dma_width(struct efa_com_dev *edev)
+{
+       u32 caps_reg = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+       int dma_bits = (caps_reg & EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK) >>
+                      EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT;
+
+       ibdev_dbg(edev->efa_dev, "DMA width: %d\n", dma_bits);
+
+       /* Only widths from 32 to 64 bits (inclusive) are accepted */
+       if (dma_bits >= 32 && dma_bits <= 64) {
+               edev->dma_addr_bits = dma_bits;
+               return dma_bits;
+       }
+
+       ibdev_err(edev->efa_dev, "DMA width illegal value: %d\n", dma_bits);
+       return -EINVAL;
+}
+
+/*
+ * Poll the device status register until its reset-in-progress bit equals
+ * @exp_state. The register is sampled at most @timeout times, sleeping
+ * EFA_POLL_INTERVAL_MS after every sample that does not match.
+ *
+ * Returns 0 once the expected state is seen and -ETIME otherwise.
+ */
+static int wait_for_reset_state(struct efa_com_dev *edev, u32 timeout,
+                               u16 exp_state)
+{
+       u32 attempts_left = timeout;
+       u32 dev_sts;
+
+       while (attempts_left--) {
+               dev_sts = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+               if ((dev_sts & EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK) ==
+                   exp_state)
+                       return 0;
+
+               ibdev_dbg(edev->efa_dev, "Reset indication val %d\n", dev_sts);
+               msleep(EFA_POLL_INTERVAL_MS);
+       }
+
+       return -ETIME;
+}
+
+/**
+ * efa_com_dev_reset - Perform device FLR to the device.
+ * @edev: EFA communication layer struct
+ * @reset_reason: Specify what is the trigger for the reset in case of an error.
+ *
+ * The reset is requested by writing the reset bit together with @reset_reason
+ * to the DEV_CTL register. The status register is then polled until the
+ * reset-in-progress bit turns on, DEV_CTL is cleared, and the status register
+ * is polled again until the bit turns off. Finally the admin queue completion
+ * timeout is derived from the CAPS register.
+ *
+ * @return - 0 on success, negative value on failure.
+ */
+int efa_com_dev_reset(struct efa_com_dev *edev,
+                     enum efa_regs_reset_reason_types reset_reason)
+{
+       u32 stat, timeout, cap, reset_val;
+       int err;
+
+       /* Status and capabilities are sampled once, before the reset starts */
+       stat = efa_com_reg_read32(edev, EFA_REGS_DEV_STS_OFF);
+       cap = efa_com_reg_read32(edev, EFA_REGS_CAPS_OFF);
+
+       if (!(stat & EFA_REGS_DEV_STS_READY_MASK)) {
+               ibdev_err(edev->efa_dev,
+                         "Device isn't ready, can't reset device\n");
+               return -EINVAL;
+       }
+
+       /* Reset timeout from CAPS; passed as the poll budget to both waits */
+       timeout = (cap & EFA_REGS_CAPS_RESET_TIMEOUT_MASK) >>
+                 EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT;
+       if (!timeout) {
+               ibdev_err(edev->efa_dev, "Invalid timeout value\n");
+               return -EINVAL;
+       }
+
+       /* start reset */
+       reset_val = EFA_REGS_DEV_CTL_DEV_RESET_MASK;
+       reset_val |= (reset_reason << EFA_REGS_DEV_CTL_RESET_REASON_SHIFT) &
+                    EFA_REGS_DEV_CTL_RESET_REASON_MASK;
+       writel(reset_val, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
+
+       /* reset clears the mmio readless address, restore it */
+       efa_com_mmio_reg_read_resp_addr_init(edev);
+
+       /* Wait for the device to report that the reset is in progress */
+       err = wait_for_reset_state(edev, timeout,
+                                  EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK);
+       if (err) {
+               ibdev_err(edev->efa_dev, "Reset indication didn't turn on\n");
+               return err;
+       }
+
+       /* reset done: clear DEV_CTL and wait for the in-progress bit to drop */
+       writel(0, edev->reg_bar + EFA_REGS_DEV_CTL_OFF);
+       err = wait_for_reset_state(edev, timeout, 0);
+       if (err) {
+               ibdev_err(edev->efa_dev, "Reset indication didn't turn off\n");
+               return err;
+       }
+
+       /* A zero admin command timeout in CAPS selects the driver default */
+       timeout = (cap & EFA_REGS_CAPS_ADMIN_CMD_TO_MASK) >>
+                 EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT;
+       if (timeout)
+               /* the resolution of timeout reg is 100ms */
+               edev->aq.completion_timeout = timeout * 100000;
+       else
+               edev->aq.completion_timeout = ADMIN_CMD_TIMEOUT_US;
+
+       return 0;
+}
diff --git a/drivers/infiniband/hw/efa/efa_com.h b/drivers/infiniband/hw/efa/efa_com.h
new file mode 100644 (file)
index 0000000..84d9672
--- /dev/null
@@ -0,0 +1,144 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COM_H_
+#define _EFA_COM_H_
+
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/dma-mapping.h>
+#include <linux/semaphore.h>
+#include <linux/sched.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "efa_common_defs.h"
+#include "efa_admin_defs.h"
+#include "efa_admin_cmds_defs.h"
+#include "efa_regs_defs.h"
+
+#define EFA_MAX_HANDLERS 256
+
+/* Admin completion queue (ACQ) state */
+struct efa_com_admin_cq {
+       struct efa_admin_acq_entry *entries; /* completion entries */
+       dma_addr_t dma_addr; /* presumably the DMA address of entries - TODO confirm */
+       spinlock_t lock; /* Protects ACQ */
+
+       u16 cc; /* consumer counter */
+       u8 phase; /* NOTE(review): likely the expected phase bit, as in the AENQ - confirm */
+};
+
+/* Admin submission queue (ASQ) state */
+struct efa_com_admin_sq {
+       struct efa_admin_aq_entry *entries; /* submission entries */
+       dma_addr_t dma_addr; /* presumably the DMA address of entries - TODO confirm */
+       spinlock_t lock; /* Protects ASQ */
+
+       u32 __iomem *db_addr; /* MMIO pointer; presumably the ASQ doorbell - TODO confirm */
+
+       u16 cc; /* consumer counter */
+       u16 pc; /* producer counter */
+       u8 phase;
+
+};
+
+/*
+ * Admin queue statistics counters.
+ * Don't use anything other than atomic64
+ */
+struct efa_com_stats_admin {
+       atomic64_t aborted_cmd;
+       atomic64_t submitted_cmd;
+       atomic64_t completed_cmd;
+       atomic64_t no_completion;
+};
+
+/*
+ * Admin queue state bits.
+ * NOTE(review): presumably bit numbers within efa_com_admin_queue.state -
+ * confirm against the users of that field.
+ */
+enum {
+       EFA_AQ_STATE_RUNNING_BIT = 0,
+       EFA_AQ_STATE_POLLING_BIT = 1,
+};
+
+/* Admin queue: the submission/completion queue pair and command bookkeeping */
+struct efa_com_admin_queue {
+       void *dmadev;
+       void *efa_dev;
+       struct efa_comp_ctx *comp_ctx;
+       /* Set by efa_com_dev_reset() from the CAPS register or the default */
+       u32 completion_timeout; /* usecs */
+       u16 poll_interval; /* msecs */
+       u16 depth;
+       struct efa_com_admin_cq cq; /* completion queue */
+       struct efa_com_admin_sq sq; /* submission queue */
+       u16 msix_vector_idx;
+
+       /* NOTE(review): presumably holds the EFA_AQ_STATE_*_BIT flags - confirm */
+       unsigned long state;
+
+       /* Count the number of available admin commands */
+       struct semaphore avail_cmds;
+
+       struct efa_com_stats_admin stats;
+
+       spinlock_t comp_ctx_lock; /* Protects completion context pool */
+       u32 *comp_ctx_pool;
+       u16 comp_ctx_pool_next;
+};
+
+struct efa_aenq_handlers;
+
+/* Async event notification queue (AENQ) state */
+struct efa_com_aenq {
+       struct efa_admin_aenq_entry *entries; /* event entries ring */
+       /* Handler table consulted for each event, indexed by event group */
+       struct efa_aenq_handlers *aenq_handlers;
+       dma_addr_t dma_addr;
+       u32 cc; /* consumer counter */
+       u16 msix_vector_idx;
+       u16 depth; /* number of entries; cc is masked with depth - 1 */
+       u8 phase; /* phase bit value expected in the next valid entry */
+};
+
+/* State for MMIO register reads whose response is delivered via read_resp */
+struct efa_com_mmio_read {
+       /* Coherent buffer allocated in efa_com_mmio_reg_read_init() */
+       struct efa_admin_mmio_req_read_less_resp *read_resp;
+       /* Programmed into the MMIO_RESP_HI/LO registers */
+       dma_addr_t read_resp_dma_addr;
+       u16 seq_num; /* reset to 0 at init */
+       u16 mmio_read_timeout; /* usecs */
+       /* serializes mmio reads */
+       spinlock_t lock;
+};
+
+/* EFA communication layer device state */
+struct efa_com_dev {
+       struct efa_com_admin_queue aq; /* admin queue */
+       struct efa_com_aenq aenq; /* async event notification queue */
+       u8 __iomem *reg_bar; /* base for the EFA_REGS_* register offsets */
+       void *dmadev; /* device handle passed to the DMA allocation calls */
+       void *efa_dev; /* device handle passed to the ibdev_* log calls */
+       /* Feature bitmask filled in by efa_com_get_device_attr() */
+       u32 supported_features;
+       /* DMA address width stored by efa_com_get_dma_width() */
+       u32 dma_addr_bits;
+
+       struct efa_com_mmio_read mmio_read;
+};
+
+/*
+ * AENQ event callback: receives the opaque data given to
+ * efa_com_aenq_intr_handler() and the event entry being processed.
+ */
+typedef void (*efa_aenq_handler)(void *data,
+             struct efa_admin_aenq_entry *aenq_e);
+
+/*
+ * Holds aenq handlers. Indexed by AENQ event group.
+ * unimplemented_handler is used for groups that are out of range or have no
+ * entry in handlers[].
+ */
+struct efa_aenq_handlers {
+       efa_aenq_handler handlers[EFA_MAX_HANDLERS];
+       efa_aenq_handler unimplemented_handler;
+};
+
+/* Admin queue setup and teardown */
+int efa_com_admin_init(struct efa_com_dev *edev,
+                      struct efa_aenq_handlers *aenq_handlers);
+void efa_com_admin_destroy(struct efa_com_dev *edev);
+/* Reset the device; returns 0 on success, negative value on failure */
+int efa_com_dev_reset(struct efa_com_dev *edev,
+                     enum efa_regs_reset_reason_types reset_reason);
+void efa_com_set_admin_polling_mode(struct efa_com_dev *edev, bool polling);
+/* Handle admin completions; takes the ACQ lock internally */
+void efa_com_admin_q_comp_intr_handler(struct efa_com_dev *edev);
+/* Allocate / free the MMIO read response buffer */
+int efa_com_mmio_reg_read_init(struct efa_com_dev *edev);
+void efa_com_mmio_reg_read_destroy(struct efa_com_dev *edev);
+
+/* Returns 0 if the device and controller versions are supported */
+int efa_com_validate_version(struct efa_com_dev *edev);
+/* Returns the DMA address width in bits, or a negative value on error */
+int efa_com_get_dma_width(struct efa_com_dev *edev);
+
+/* Execute an admin command; cmd_size and comp_size are the buffer sizes */
+int efa_com_cmd_exec(struct efa_com_admin_queue *aq,
+                    struct efa_admin_aq_entry *cmd,
+                    size_t cmd_size,
+                    struct efa_admin_acq_entry *comp,
+                    size_t comp_size);
+/* Process pending AENQ entries; data is passed on to the event handlers */
+void efa_com_aenq_intr_handler(struct efa_com_dev *edev, void *data);
+
+#endif /* _EFA_COM_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.c b/drivers/infiniband/hw/efa/efa_com_cmd.c
new file mode 100644 (file)
index 0000000..1422772
--- /dev/null
@@ -0,0 +1,692 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include "efa.h"
+#include "efa_com.h"
+#include "efa_com_cmd.h"
+
+/* Split @addr into its lower and upper 32-bit halves. */
+void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low)
+{
+       u32 lo = lower_32_bits(addr);
+       u32 hi = upper_32_bits(addr);
+
+       *addr_low = lo;
+       *addr_high = hi;
+}
+
+/*
+ * Create a QP through the admin queue.
+ *
+ * The command is built from @params; on success @res is filled in from the
+ * device's completion.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_create_qp(struct efa_com_dev *edev,
+                     struct efa_com_create_qp_params *params,
+                     struct efa_com_create_qp_result *res)
+{
+       struct efa_admin_create_qp_cmd cmd = {};
+       struct efa_admin_create_qp_resp resp;
+       int rc;
+
+       cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_QP;
+       cmd.pd = params->pd;
+       cmd.qp_type = params->qp_type;
+       cmd.uar = params->uarn;
+       cmd.rq_base_addr = params->rq_base_addr;
+       cmd.send_cq_idx = params->send_cq_idx;
+       cmd.recv_cq_idx = params->recv_cq_idx;
+
+       /* Send and receive queue dimensions */
+       cmd.qp_alloc_size.send_queue_ring_size = params->sq_ring_size_in_bytes;
+       cmd.qp_alloc_size.send_queue_depth = params->sq_depth;
+       cmd.qp_alloc_size.recv_queue_ring_size = params->rq_ring_size_in_bytes;
+       cmd.qp_alloc_size.recv_queue_depth = params->rq_depth;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+                             (struct efa_admin_acq_entry *)&resp, sizeof(resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to create qp [%d]\n", rc);
+               return rc;
+       }
+
+       res->qp_handle = resp.qp_handle;
+       res->qp_num = resp.qp_num;
+       res->sq_db_offset = resp.sq_db_offset;
+       res->rq_db_offset = resp.rq_db_offset;
+       res->llq_descriptors_offset = resp.llq_descriptors_offset;
+       res->send_sub_cq_idx = resp.send_sub_cq_idx;
+       res->recv_sub_cq_idx = resp.recv_sub_cq_idx;
+
+       return 0;
+}
+
+/*
+ * Modify a QP through the admin queue using the attributes in @params.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_modify_qp(struct efa_com_dev *edev,
+                     struct efa_com_modify_qp_params *params)
+{
+       struct efa_admin_modify_qp_cmd cmd = {};
+       struct efa_admin_modify_qp_resp resp;
+       int rc;
+
+       cmd.aq_common_desc.opcode = EFA_ADMIN_MODIFY_QP;
+       cmd.qp_handle = params->qp_handle;
+       cmd.modify_mask = params->modify_mask;
+       cmd.cur_qp_state = params->cur_qp_state;
+       cmd.qp_state = params->qp_state;
+       cmd.qkey = params->qkey;
+       cmd.sq_psn = params->sq_psn;
+       cmd.sq_drained_async_notify = params->sq_drained_async_notify;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+                             (struct efa_admin_acq_entry *)&resp, sizeof(resp));
+       if (!rc)
+               return 0;
+
+       ibdev_err(edev->efa_dev,
+                 "Failed to modify qp-%u modify_mask[%#x] [%d]\n",
+                 cmd.qp_handle, cmd.modify_mask, rc);
+       return rc;
+}
+
+/*
+ * Query a QP through the admin queue.
+ *
+ * On success @result receives the QP state, qkey, SQ draining flag and SQ PSN
+ * reported by the device.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_query_qp(struct efa_com_dev *edev,
+                    struct efa_com_query_qp_params *params,
+                    struct efa_com_query_qp_result *result)
+{
+       struct efa_admin_query_qp_cmd query_cmd = {};
+       struct efa_admin_query_qp_resp query_resp;
+       int rc;
+
+       query_cmd.aq_common_desc.opcode = EFA_ADMIN_QUERY_QP;
+       query_cmd.qp_handle = params->qp_handle;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&query_cmd,
+                             sizeof(query_cmd),
+                             (struct efa_admin_acq_entry *)&query_resp,
+                             sizeof(query_resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to query qp-%u [%d]\n",
+                         query_cmd.qp_handle, rc);
+               return rc;
+       }
+
+       result->qp_state = query_resp.qp_state;
+       result->qkey = query_resp.qkey;
+       result->sq_draining = query_resp.sq_draining;
+       result->sq_psn = query_resp.sq_psn;
+
+       return 0;
+}
+
+/*
+ * Destroy the QP identified by @params->qp_handle through the admin queue.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec(). The error
+ * is propagated so callers do not treat a QP the device failed to destroy as
+ * released.
+ */
+int efa_com_destroy_qp(struct efa_com_dev *edev,
+                      struct efa_com_destroy_qp_params *params)
+{
+       struct efa_admin_destroy_qp_resp cmd_completion;
+       struct efa_admin_destroy_qp_cmd qp_cmd = {};
+       struct efa_com_admin_queue *aq = &edev->aq;
+       int err;
+
+       qp_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_QP;
+       qp_cmd.qp_handle = params->qp_handle;
+
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)&qp_cmd,
+                              sizeof(qp_cmd),
+                              (struct efa_admin_acq_entry *)&cmd_completion,
+                              sizeof(cmd_completion));
+       if (err) {
+               ibdev_err(edev->efa_dev, "Failed to destroy qp-%u [%d]\n",
+                         qp_cmd.qp_handle, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Create a CQ through the admin queue.
+ *
+ * On success @result receives the CQ index returned by the device, and the
+ * actual depth is reported as the requested depth.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_create_cq(struct efa_com_dev *edev,
+                     struct efa_com_create_cq_params *params,
+                     struct efa_com_create_cq_result *result)
+{
+       struct efa_admin_create_cq_cmd cmd = {};
+       struct efa_admin_create_cq_resp resp;
+       int rc;
+
+       cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_CQ;
+       /* The entry size is passed to the device in 4-byte words */
+       cmd.cq_caps_2 = (params->entry_size_in_bytes / 4) &
+                       EFA_ADMIN_CREATE_CQ_CMD_CQ_ENTRY_SIZE_WORDS_MASK;
+       cmd.cq_depth = params->cq_depth;
+       cmd.num_sub_cqs = params->num_sub_cqs;
+       cmd.uar = params->uarn;
+
+       efa_com_set_dma_addr(params->dma_addr,
+                            &cmd.cq_ba.mem_addr_high,
+                            &cmd.cq_ba.mem_addr_low);
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+                             (struct efa_admin_acq_entry *)&resp, sizeof(resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to create cq[%d]\n", rc);
+               return rc;
+       }
+
+       result->cq_idx = resp.cq_idx;
+       result->actual_depth = params->cq_depth;
+
+       return 0;
+}
+
+/*
+ * Destroy the CQ identified by @params->cq_idx through the admin queue.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec(). The error
+ * is propagated rather than being logged and dropped.
+ */
+int efa_com_destroy_cq(struct efa_com_dev *edev,
+                      struct efa_com_destroy_cq_params *params)
+{
+       struct efa_admin_destroy_cq_cmd destroy_cmd = {};
+       struct efa_admin_destroy_cq_resp destroy_resp;
+       struct efa_com_admin_queue *aq = &edev->aq;
+       int err;
+
+       destroy_cmd.cq_idx = params->cq_idx;
+       destroy_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_CQ;
+
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)&destroy_cmd,
+                              sizeof(destroy_cmd),
+                              (struct efa_admin_acq_entry *)&destroy_resp,
+                              sizeof(destroy_resp));
+       if (err) {
+               ibdev_err(edev->efa_dev, "Failed to destroy CQ-%u [%d]\n",
+                         params->cq_idx, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Register a memory region through the admin queue.
+ *
+ * The page list is either copied inline into the command or passed by address
+ * and length, depending on @params->inline_pbl. On success @result receives
+ * the l_key and r_key returned by the device.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_register_mr(struct efa_com_dev *edev,
+                       struct efa_com_reg_mr_params *params,
+                       struct efa_com_reg_mr_result *result)
+{
+       struct efa_admin_reg_mr_cmd cmd = {};
+       struct efa_admin_reg_mr_resp resp;
+       int rc;
+
+       cmd.aq_common_desc.opcode = EFA_ADMIN_REG_MR;
+       cmd.pd = params->pd;
+       cmd.mr_length = params->mr_length_in_bytes;
+       cmd.flags |= params->page_shift &
+                    EFA_ADMIN_REG_MR_CMD_PHYS_PAGE_SIZE_SHIFT_MASK;
+       cmd.iova = params->iova;
+       cmd.permissions |= params->permissions &
+                          EFA_ADMIN_REG_MR_CMD_LOCAL_WRITE_ENABLE_MASK;
+
+       if (!params->inline_pbl) {
+               /* Page list passed by address and length, flagged as control data */
+               cmd.pbl.pbl.length = params->pbl.pbl.length;
+               cmd.pbl.pbl.address.mem_addr_low =
+                       params->pbl.pbl.address.mem_addr_low;
+               cmd.pbl.pbl.address.mem_addr_high =
+                       params->pbl.pbl.address.mem_addr_high;
+               cmd.aq_common_desc.flags |=
+                       EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_MASK;
+               if (params->indirect)
+                       cmd.aq_common_desc.flags |=
+                               EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+       } else {
+               /* Page list copied into the command itself */
+               memcpy(cmd.pbl.inline_pbl_array,
+                      params->pbl.inline_pbl_array,
+                      sizeof(cmd.pbl.inline_pbl_array));
+       }
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+                             (struct efa_admin_acq_entry *)&resp, sizeof(resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to register mr [%d]\n", rc);
+               return rc;
+       }
+
+       result->l_key = resp.l_key;
+       result->r_key = resp.r_key;
+
+       return 0;
+}
+
+/*
+ * De-register the memory region identified by @params->l_key through the
+ * admin queue.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec(). The error
+ * is propagated rather than being logged and dropped.
+ */
+int efa_com_dereg_mr(struct efa_com_dev *edev,
+                    struct efa_com_dereg_mr_params *params)
+{
+       struct efa_admin_dereg_mr_resp cmd_completion;
+       struct efa_com_admin_queue *aq = &edev->aq;
+       struct efa_admin_dereg_mr_cmd mr_cmd = {};
+       int err;
+
+       mr_cmd.aq_common_desc.opcode = EFA_ADMIN_DEREG_MR;
+       mr_cmd.l_key = params->l_key;
+
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)&mr_cmd,
+                              sizeof(mr_cmd),
+                              (struct efa_admin_acq_entry *)&cmd_completion,
+                              sizeof(cmd_completion));
+       if (err) {
+               ibdev_err(edev->efa_dev,
+                         "Failed to de-register mr(lkey-%u) [%d]\n",
+                         mr_cmd.l_key, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Create an address handle through the admin queue.
+ *
+ * On success @result->ah receives the handle returned by the device.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec().
+ */
+int efa_com_create_ah(struct efa_com_dev *edev,
+                     struct efa_com_create_ah_params *params,
+                     struct efa_com_create_ah_result *result)
+{
+       struct efa_admin_create_ah_cmd cmd = {};
+       struct efa_admin_create_ah_resp resp;
+       int rc;
+
+       cmd.aq_common_desc.opcode = EFA_ADMIN_CREATE_AH;
+       cmd.pd = params->pdn;
+       memcpy(cmd.dest_addr, params->dest_addr, sizeof(cmd.dest_addr));
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&cmd, sizeof(cmd),
+                             (struct efa_admin_acq_entry *)&resp, sizeof(resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to create ah [%d]\n", rc);
+               return rc;
+       }
+
+       result->ah = resp.ah;
+
+       return 0;
+}
+
+/*
+ * Destroy the address handle @params->ah of PD @params->pdn through the
+ * admin queue.
+ *
+ * Returns 0 on success or the error returned by efa_com_cmd_exec(). The error
+ * is propagated rather than being logged and dropped.
+ */
+int efa_com_destroy_ah(struct efa_com_dev *edev,
+                      struct efa_com_destroy_ah_params *params)
+{
+       struct efa_admin_destroy_ah_resp cmd_completion;
+       struct efa_admin_destroy_ah_cmd ah_cmd = {};
+       struct efa_com_admin_queue *aq = &edev->aq;
+       int err;
+
+       ah_cmd.aq_common_desc.opcode = EFA_ADMIN_DESTROY_AH;
+       ah_cmd.ah = params->ah;
+       ah_cmd.pd = params->pdn;
+
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)&ah_cmd,
+                              sizeof(ah_cmd),
+                              (struct efa_admin_acq_entry *)&cmd_completion,
+                              sizeof(cmd_completion));
+       if (err) {
+               ibdev_err(edev->efa_dev, "Failed to destroy ah-%d pd-%d [%d]\n",
+                         ah_cmd.ah, ah_cmd.pd, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * Report whether @feature_id may be used with the get/set feature commands.
+ *
+ * Device attributes are always supported; any other feature must have its bit
+ * set in edev->supported_features.
+ */
+static bool
+efa_com_check_supported_feature_id(struct efa_com_dev *edev,
+                                  enum efa_admin_aq_feature_id feature_id)
+{
+       /* Device attributes is always supported */
+       if (feature_id == EFA_ADMIN_DEVICE_ATTR)
+               return true;
+
+       /*
+        * An id beyond the width of the bitmask can never be advertised, and
+        * shifting by that much would be undefined.
+        */
+       if (feature_id >= 8 * sizeof(edev->supported_features))
+               return false;
+
+       /* Unsigned shift: bit 31 must not go through a signed int */
+       return !!(edev->supported_features & (1U << feature_id));
+}
+
+/*
+ * Issue a GET_FEATURE admin command for @feature_id and store the device's
+ * completion in @get_resp.
+ *
+ * @control_buf_dma_addr and @control_buff_size describe an optional control
+ * buffer; a non-zero size marks the command as carrying indirect control
+ * data.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the feature is not supported, or the
+ * error returned by efa_com_cmd_exec(). The error must be propagated:
+ * @get_resp is not valid after a failed command and callers read it as soon
+ * as this function reports success.
+ */
+static int efa_com_get_feature_ex(struct efa_com_dev *edev,
+                                 struct efa_admin_get_feature_resp *get_resp,
+                                 enum efa_admin_aq_feature_id feature_id,
+                                 dma_addr_t control_buf_dma_addr,
+                                 u32 control_buff_size)
+{
+       struct efa_admin_get_feature_cmd get_cmd = {};
+       struct efa_com_admin_queue *aq;
+       int err;
+
+       if (!efa_com_check_supported_feature_id(edev, feature_id)) {
+               ibdev_err(edev->efa_dev, "Feature %d isn't supported\n",
+                         feature_id);
+               return -EOPNOTSUPP;
+       }
+
+       aq = &edev->aq;
+
+       get_cmd.aq_common_descriptor.opcode = EFA_ADMIN_GET_FEATURE;
+
+       if (control_buff_size)
+               get_cmd.aq_common_descriptor.flags =
+                       EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+
+       efa_com_set_dma_addr(control_buf_dma_addr,
+                            &get_cmd.control_buffer.address.mem_addr_high,
+                            &get_cmd.control_buffer.address.mem_addr_low);
+
+       get_cmd.control_buffer.length = control_buff_size;
+       get_cmd.feature_common.feature_id = feature_id;
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)
+                              &get_cmd,
+                              sizeof(get_cmd),
+                              (struct efa_admin_acq_entry *)
+                              get_resp,
+                              sizeof(*get_resp));
+       if (err) {
+               ibdev_err(edev->efa_dev,
+                         "Failed to submit get_feature command %d [%d]\n",
+                         feature_id, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/* GET_FEATURE without a control buffer (zero address, zero length). */
+static int efa_com_get_feature(struct efa_com_dev *edev,
+                              struct efa_admin_get_feature_resp *get_resp,
+                              enum efa_admin_aq_feature_id feature_id)
+{
+       const dma_addr_t no_ctrl_buf = 0;
+       const u32 no_ctrl_len = 0;
+
+       return efa_com_get_feature_ex(edev, get_resp, feature_id,
+                                     no_ctrl_buf, no_ctrl_len);
+}
+
+/*
+ * Fetch the network attributes feature and copy the address and MTU it
+ * reports into @result.
+ *
+ * Returns 0 on success or the error returned by efa_com_get_feature().
+ */
+int efa_com_get_network_attr(struct efa_com_dev *edev,
+                            struct efa_com_get_network_attr_result *result)
+{
+       struct efa_admin_get_feature_resp feature_resp;
+       int rc;
+
+       rc = efa_com_get_feature(edev, &feature_resp, EFA_ADMIN_NETWORK_ATTR);
+       if (rc) {
+               ibdev_err(edev->efa_dev,
+                         "Failed to get network attributes %d\n", rc);
+               return rc;
+       }
+
+       memcpy(result->addr, feature_resp.u.network_attr.addr,
+              sizeof(feature_resp.u.network_attr.addr));
+       result->mtu = feature_resp.u.network_attr.mtu;
+
+       return 0;
+}
+
+/*
+ * Fetch the device attributes and then the queue attributes, copying both
+ * into @result.
+ *
+ * The supported-features mask reported by the device is cached in
+ * edev->supported_features before the queue attributes are requested, because
+ * the feature check for that second query relies on it.
+ *
+ * Returns 0 on success, -EINVAL if the admin API version is below 1, or the
+ * error returned by efa_com_get_feature().
+ */
+int efa_com_get_device_attr(struct efa_com_dev *edev,
+                           struct efa_com_get_device_attr_result *result)
+{
+       struct efa_admin_get_feature_resp resp;
+       int err;
+
+       err = efa_com_get_feature(edev, &resp, EFA_ADMIN_DEVICE_ATTR);
+       if (err) {
+               ibdev_err(edev->efa_dev, "Failed to get device attributes %d\n",
+                         err);
+               return err;
+       }
+
+       result->page_size_cap = resp.u.device_attr.page_size_cap;
+       result->fw_version = resp.u.device_attr.fw_version;
+       result->admin_api_version = resp.u.device_attr.admin_api_version;
+       result->device_version = resp.u.device_attr.device_version;
+       result->supported_features = resp.u.device_attr.supported_features;
+       result->phys_addr_width = resp.u.device_attr.phys_addr_width;
+       result->virt_addr_width = resp.u.device_attr.virt_addr_width;
+       result->db_bar = resp.u.device_attr.db_bar;
+
+       if (result->admin_api_version < 1) {
+               ibdev_err(edev->efa_dev,
+                         "Failed to get device attr api version [%u < 1]\n",
+                         result->admin_api_version);
+               return -EINVAL;
+       }
+
+       /* Must be set before querying any feature other than device attr */
+       edev->supported_features = resp.u.device_attr.supported_features;
+       err = efa_com_get_feature(edev, &resp,
+                                 EFA_ADMIN_QUEUE_ATTR);
+       if (err) {
+               /* Name the feature that actually failed (was "network") */
+               ibdev_err(edev->efa_dev,
+                         "Failed to get queue attributes %d\n", err);
+               return err;
+       }
+
+       result->max_qp = resp.u.queue_attr.max_qp;
+       result->max_sq_depth = resp.u.queue_attr.max_sq_depth;
+       result->max_rq_depth = resp.u.queue_attr.max_rq_depth;
+       result->max_cq = resp.u.queue_attr.max_cq;
+       result->max_cq_depth = resp.u.queue_attr.max_cq_depth;
+       result->inline_buf_size = resp.u.queue_attr.inline_buf_size;
+       result->max_sq_sge = resp.u.queue_attr.max_wr_send_sges;
+       result->max_rq_sge = resp.u.queue_attr.max_wr_recv_sges;
+       result->max_mr = resp.u.queue_attr.max_mr;
+       result->max_mr_pages = resp.u.queue_attr.max_mr_pages;
+       result->max_pd = resp.u.queue_attr.max_pd;
+       result->max_ah = resp.u.queue_attr.max_ah;
+       result->max_llq_size = resp.u.queue_attr.max_llq_size;
+       result->sub_cqs_per_cq = resp.u.queue_attr.sub_cqs_per_cq;
+
+       return 0;
+}
+
+/*
+ * Fetch the hardware hints feature and copy the reported timeouts and poll
+ * interval into @result.
+ *
+ * Returns 0 on success or the error returned by efa_com_get_feature().
+ */
+int efa_com_get_hw_hints(struct efa_com_dev *edev,
+                        struct efa_com_get_hw_hints_result *result)
+{
+       struct efa_admin_get_feature_resp feature_resp;
+       int rc = efa_com_get_feature(edev, &feature_resp, EFA_ADMIN_HW_HINTS);
+
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to get hw hints %d\n", rc);
+               return rc;
+       }
+
+       result->admin_completion_timeout =
+               feature_resp.u.hw_hints.admin_completion_timeout;
+       result->driver_watchdog_timeout =
+               feature_resp.u.hw_hints.driver_watchdog_timeout;
+       result->mmio_read_timeout = feature_resp.u.hw_hints.mmio_read_timeout;
+       result->poll_interval = feature_resp.u.hw_hints.poll_interval;
+
+       return 0;
+}
+
+/*
+ * Issue a SET_FEATURE admin command for @feature_id.
+ *
+ * @set_cmd is completed here (opcode, feature id, control buffer) and then
+ * submitted; the device's completion is stored in @set_resp. A non-zero
+ * @control_buff_size marks the command as carrying indirect control data
+ * located at @control_buf_dma_addr.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the feature is not supported, or the
+ * error returned by efa_com_cmd_exec(). The error is propagated so callers do
+ * not assume the feature was applied.
+ */
+static int efa_com_set_feature_ex(struct efa_com_dev *edev,
+                                 struct efa_admin_set_feature_resp *set_resp,
+                                 struct efa_admin_set_feature_cmd *set_cmd,
+                                 enum efa_admin_aq_feature_id feature_id,
+                                 dma_addr_t control_buf_dma_addr,
+                                 u32 control_buff_size)
+{
+       struct efa_com_admin_queue *aq;
+       int err;
+
+       if (!efa_com_check_supported_feature_id(edev, feature_id)) {
+               ibdev_err(edev->efa_dev, "Feature %d isn't supported\n",
+                         feature_id);
+               return -EOPNOTSUPP;
+       }
+
+       aq = &edev->aq;
+
+       set_cmd->aq_common_descriptor.opcode = EFA_ADMIN_SET_FEATURE;
+       if (control_buff_size) {
+               set_cmd->aq_common_descriptor.flags =
+                       EFA_ADMIN_AQ_COMMON_DESC_CTRL_DATA_INDIRECT_MASK;
+               efa_com_set_dma_addr(control_buf_dma_addr,
+                                    &set_cmd->control_buffer.address.mem_addr_high,
+                                    &set_cmd->control_buffer.address.mem_addr_low);
+       }
+
+       set_cmd->control_buffer.length = control_buff_size;
+       set_cmd->feature_common.feature_id = feature_id;
+       err = efa_com_cmd_exec(aq,
+                              (struct efa_admin_aq_entry *)set_cmd,
+                              sizeof(*set_cmd),
+                              (struct efa_admin_acq_entry *)set_resp,
+                              sizeof(*set_resp));
+       if (err) {
+               ibdev_err(edev->efa_dev,
+                         "Failed to submit set_feature command %d error: %d\n",
+                         feature_id, err);
+               return err;
+       }
+
+       return 0;
+}
+
+/*
+ * SET_FEATURE without a control buffer: forwards to
+ * efa_com_set_feature_ex() with a zero buffer address and zero length.
+ */
+static int efa_com_set_feature(struct efa_com_dev *edev,
+                              struct efa_admin_set_feature_resp *set_resp,
+                              struct efa_admin_set_feature_cmd *set_cmd,
+                              enum efa_admin_aq_feature_id feature_id)
+{
+       const dma_addr_t no_ctrl_buf_addr = 0;
+       const u32 no_ctrl_buf_size = 0;
+
+       return efa_com_set_feature_ex(edev, set_resp, set_cmd, feature_id,
+                                     no_ctrl_buf_addr, no_ctrl_buf_size);
+}
+
+/*
+ * Enable the AENQ event groups given in @groups.
+ *
+ * The current AENQ configuration is read first; if @groups contains a bit
+ * the device does not report as supported, nothing is changed and
+ * -EOPNOTSUPP is returned. Otherwise the value returned is that of the
+ * get/set feature helper that ran last (0 on success).
+ */
+int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups)
+{
+       struct efa_admin_set_feature_cmd set_cmd = {};
+       struct efa_admin_set_feature_resp set_resp;
+       struct efa_admin_get_feature_resp cur_cfg;
+       int rc;
+
+       ibdev_dbg(edev->efa_dev, "Configuring aenq with groups[%#x]\n", groups);
+
+       rc = efa_com_get_feature(edev, &cur_cfg, EFA_ADMIN_AENQ_CONFIG);
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to get aenq attributes: %d\n",
+                         rc);
+               return rc;
+       }
+
+       ibdev_dbg(edev->efa_dev,
+                 "Get aenq groups: supported[%#x] enabled[%#x]\n",
+                 cur_cfg.u.aenq.supported_groups,
+                 cur_cfg.u.aenq.enabled_groups);
+
+       /* Every requested group must be in the supported set. */
+       if ((cur_cfg.u.aenq.supported_groups & groups) != groups) {
+               ibdev_err(edev->efa_dev,
+                         "Trying to set unsupported aenq groups[%#x] supported[%#x]\n",
+                         groups, cur_cfg.u.aenq.supported_groups);
+               return -EOPNOTSUPP;
+       }
+
+       set_cmd.u.aenq.enabled_groups = groups;
+       rc = efa_com_set_feature(edev, &set_resp, &set_cmd,
+                                EFA_ADMIN_AENQ_CONFIG);
+       if (rc)
+               ibdev_err(edev->efa_dev, "Failed to set aenq attributes: %d\n",
+                         rc);
+
+       return rc;
+}
+
+/*
+ * Execute the EFA_ADMIN_ALLOC_PD admin command and report the PD number
+ * from the response in @result->pdn.
+ *
+ * Returns 0 on success or the error from efa_com_cmd_exec().
+ */
+int efa_com_alloc_pd(struct efa_com_dev *edev,
+                    struct efa_com_alloc_pd_result *result)
+{
+       struct efa_admin_alloc_pd_cmd alloc_cmd = {};
+       struct efa_admin_alloc_pd_resp alloc_resp;
+       int rc;
+
+       alloc_cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_PD;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&alloc_cmd,
+                             sizeof(alloc_cmd),
+                             (struct efa_admin_acq_entry *)&alloc_resp,
+                             sizeof(alloc_resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to allocate pd[%d]\n", rc);
+               return rc;
+       }
+
+       result->pdn = alloc_resp.pd;
+
+       return 0;
+}
+
+/*
+ * Execute the EFA_ADMIN_DEALLOC_PD admin command for @params->pdn.
+ *
+ * Returns 0 on success or the error from efa_com_cmd_exec().
+ */
+int efa_com_dealloc_pd(struct efa_com_dev *edev,
+                      struct efa_com_dealloc_pd_params *params)
+{
+       struct efa_admin_dealloc_pd_cmd dealloc_cmd = {};
+       struct efa_admin_dealloc_pd_resp dealloc_resp;
+       int rc;
+
+       dealloc_cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_PD;
+       dealloc_cmd.pd = params->pdn;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&dealloc_cmd,
+                             sizeof(dealloc_cmd),
+                             (struct efa_admin_acq_entry *)&dealloc_resp,
+                             sizeof(dealloc_resp));
+       if (rc)
+               ibdev_err(edev->efa_dev, "Failed to deallocate pd-%u [%d]\n",
+                         dealloc_cmd.pd, rc);
+
+       return rc;
+}
+
+/*
+ * Execute the EFA_ADMIN_ALLOC_UAR admin command and report the UAR number
+ * from the response in @result->uarn.
+ *
+ * Returns 0 on success or the error from efa_com_cmd_exec().
+ */
+int efa_com_alloc_uar(struct efa_com_dev *edev,
+                     struct efa_com_alloc_uar_result *result)
+{
+       struct efa_admin_alloc_uar_cmd alloc_cmd = {};
+       struct efa_admin_alloc_uar_resp alloc_resp;
+       int rc;
+
+       alloc_cmd.aq_common_descriptor.opcode = EFA_ADMIN_ALLOC_UAR;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&alloc_cmd,
+                             sizeof(alloc_cmd),
+                             (struct efa_admin_acq_entry *)&alloc_resp,
+                             sizeof(alloc_resp));
+       if (rc) {
+               ibdev_err(edev->efa_dev, "Failed to allocate uar[%d]\n", rc);
+               return rc;
+       }
+
+       result->uarn = alloc_resp.uar;
+
+       return 0;
+}
+
+/*
+ * Execute the EFA_ADMIN_DEALLOC_UAR admin command for @params->uarn.
+ *
+ * Returns 0 on success or the error from efa_com_cmd_exec().
+ */
+int efa_com_dealloc_uar(struct efa_com_dev *edev,
+                       struct efa_com_dealloc_uar_params *params)
+{
+       struct efa_admin_dealloc_uar_cmd dealloc_cmd = {};
+       struct efa_admin_dealloc_uar_resp dealloc_resp;
+       int rc;
+
+       dealloc_cmd.aq_common_descriptor.opcode = EFA_ADMIN_DEALLOC_UAR;
+       dealloc_cmd.uar = params->uarn;
+
+       rc = efa_com_cmd_exec(&edev->aq,
+                             (struct efa_admin_aq_entry *)&dealloc_cmd,
+                             sizeof(dealloc_cmd),
+                             (struct efa_admin_acq_entry *)&dealloc_resp,
+                             sizeof(dealloc_resp));
+       if (rc)
+               ibdev_err(edev->efa_dev, "Failed to deallocate uar-%u [%d]\n",
+                         dealloc_cmd.uar, rc);
+
+       return rc;
+}
diff --git a/drivers/infiniband/hw/efa/efa_com_cmd.h b/drivers/infiniband/hw/efa/efa_com_cmd.h
new file mode 100644 (file)
index 0000000..a117438
--- /dev/null
@@ -0,0 +1,270 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COM_CMD_H_
+#define _EFA_COM_CMD_H_
+
+#include "efa_com.h"
+
+#define EFA_GID_SIZE 16
+
+/* Input parameters for efa_com_create_qp(). */
+struct efa_com_create_qp_params {
+       u64 rq_base_addr;
+       u32 send_cq_idx;
+       u32 recv_cq_idx;
+       /*
+        * Send descriptor ring size in bytes,
+        * sufficient for user-provided number of WQEs and SGL size
+        */
+       u32 sq_ring_size_in_bytes;
+       /* Max number of WQEs that will be posted on send queue */
+       u32 sq_depth;
+       /* Recv descriptor ring size in bytes */
+       u32 rq_ring_size_in_bytes;
+       u32 rq_depth;
+       u16 pd;
+       u16 uarn;
+       u8 qp_type;
+};
+
+/* Output of efa_com_create_qp(). */
+struct efa_com_create_qp_result {
+       u32 qp_handle;
+       u32 qp_num;
+       u32 sq_db_offset;
+       u32 rq_db_offset;
+       u32 llq_descriptors_offset;
+       u16 send_sub_cq_idx;
+       u16 recv_sub_cq_idx;
+};
+
+/* Input parameters for efa_com_modify_qp(). */
+struct efa_com_modify_qp_params {
+       u32 modify_mask;
+       u32 qp_handle;
+       u32 qp_state;
+       u32 cur_qp_state;
+       u32 qkey;
+       u32 sq_psn;
+       u8 sq_drained_async_notify;
+};
+
+/* Input parameters for efa_com_query_qp(). */
+struct efa_com_query_qp_params {
+       u32 qp_handle;
+};
+
+/* Output of efa_com_query_qp(). */
+struct efa_com_query_qp_result {
+       u32 qp_state;
+       u32 qkey;
+       u32 sq_draining;
+       u32 sq_psn;
+};
+
+/* Input parameters for efa_com_destroy_qp(). */
+struct efa_com_destroy_qp_params {
+       u32 qp_handle;
+};
+
+/* Input parameters for efa_com_create_cq(). */
+struct efa_com_create_cq_params {
+       /* cq physical base address in OS memory */
+       dma_addr_t dma_addr;
+       /* completion queue depth in # of entries */
+       u16 cq_depth;
+       u16 num_sub_cqs;
+       u16 uarn;
+       u8 entry_size_in_bytes;
+};
+
+/* Output of efa_com_create_cq(). */
+struct efa_com_create_cq_result {
+       /* cq identifier */
+       u16 cq_idx;
+       /* actual cq depth in # of entries */
+       u16 actual_depth;
+};
+
+/* Input parameters for efa_com_destroy_cq(). */
+struct efa_com_destroy_cq_params {
+       u16 cq_idx;
+};
+
+/* Input parameters for efa_com_create_ah(). */
+struct efa_com_create_ah_params {
+       u16 pdn;
+       /* Destination address in network byte order */
+       u8 dest_addr[EFA_GID_SIZE];
+};
+
+/* Output of efa_com_create_ah(). */
+struct efa_com_create_ah_result {
+       u16 ah;
+};
+
+/* Input parameters for efa_com_destroy_ah(). */
+struct efa_com_destroy_ah_params {
+       u16 ah;
+       u16 pdn;
+};
+
+/* Output of efa_com_get_network_attr(): device address and MTU. */
+struct efa_com_get_network_attr_result {
+       u8 addr[EFA_GID_SIZE];
+       u32 mtu;
+};
+
+/* Output of efa_com_get_device_attr(): device capabilities and limits. */
+struct efa_com_get_device_attr_result {
+       u64 page_size_cap;
+       u64 max_mr_pages;
+       u32 fw_version;
+       u32 admin_api_version;
+       u32 device_version;
+       u32 supported_features;
+       u32 phys_addr_width;
+       u32 virt_addr_width;
+       u32 max_qp;
+       u32 max_sq_depth; /* wqes */
+       u32 max_rq_depth; /* wqes */
+       u32 max_cq;
+       u32 max_cq_depth; /* cqes */
+       u32 inline_buf_size;
+       u32 max_mr;
+       u32 max_pd;
+       u32 max_ah;
+       u32 max_llq_size;
+       u16 sub_cqs_per_cq;
+       u16 max_sq_sge;
+       u16 max_rq_sge;
+       /* Index of the PCI BAR used for doorbells */
+       u8 db_bar;
+};
+
+/*
+ * Output of efa_com_get_hw_hints(). A value of zero in a field is treated
+ * by efa_update_hw_hints() as "no hint" and the current setting is kept.
+ */
+struct efa_com_get_hw_hints_result {
+       u16 mmio_read_timeout;
+       u16 driver_watchdog_timeout;
+       u16 admin_completion_timeout;
+       u16 poll_interval;
+       u32 reserved[4];
+};
+
+/* An address split into its low and high 32-bit words. */
+struct efa_com_mem_addr {
+       u32 mem_addr_low;
+       u32 mem_addr_high;
+};
+
+/* Used in indirect mode for chaining page list chunks */
+struct efa_com_ctrl_buff_info {
+       /* length of the buffer pointed to by address */
+       u32 length;
+       /* points to control buffer (direct or indirect) */
+       struct efa_com_mem_addr address;
+};
+
+/* Input parameters for efa_com_register_mr(). */
+struct efa_com_reg_mr_params {
+       /* Memory region length, in bytes. */
+       u64 mr_length_in_bytes;
+       /* IO Virtual Address associated with this MR. */
+       u64 iova;
+       /* Physical Buffer List, each element is page-aligned. */
+       union {
+               /*
+                * Inline array of physical addresses of app pages
+                * (optimization for short region reservations)
+                */
+               u64 inline_pbl_array[4];
+               /*
+                * Describes the next physically contiguous chunk of indirect
+                * page list. A page list contains physical addresses of command
+                * data pages. Data pages are 4KB; page list chunks are
+                * variable-sized.
+                */
+               struct efa_com_ctrl_buff_info pbl;
+       } pbl;
+       /* number of pages in PBL (redundant, could be calculated) */
+       u32 page_num;
+       /* Protection Domain */
+       u16 pd;
+       /*
+        * phys_page_size_shift - page size is (1 << phys_page_size_shift)
+        * Page size is used for building the Virtual to Physical
+        * address mapping
+        */
+       u8 page_shift;
+       /*
+        * permissions
+        * 0: local_write_enable - Write permissions: value of 1 needed
+        *    for RQ buffers and for RDMA write
+        * 7:1: reserved1 - remote access flags, etc
+        */
+       u8 permissions;
+       u8 inline_pbl;
+       u8 indirect;
+};
+
+/* Output of efa_com_register_mr(). */
+struct efa_com_reg_mr_result {
+       /*
+        * To be used in conjunction with local buffer references in SQ and
+        * RQ WQE
+        */
+       u32 l_key;
+       /*
+        * To be used in incoming RDMA semantics messages to refer to remotely
+        * accessed memory region
+        */
+       u32 r_key;
+};
+
+/* Input parameters for efa_com_dereg_mr(). */
+struct efa_com_dereg_mr_params {
+       u32 l_key;
+};
+
+/* Output of efa_com_alloc_pd(): the PD number taken from the response. */
+struct efa_com_alloc_pd_result {
+       u16 pdn;
+};
+
+/* Input to efa_com_dealloc_pd(): the PD number to release. */
+struct efa_com_dealloc_pd_params {
+       u16 pdn;
+};
+
+/* Output of efa_com_alloc_uar(): the UAR number taken from the response. */
+struct efa_com_alloc_uar_result {
+       u16 uarn;
+};
+
+/* Input to efa_com_dealloc_uar(): the UAR number to release. */
+struct efa_com_dealloc_uar_params {
+       u16 uarn;
+};
+
+/*
+ * Command helpers. Each efa_com_* command below takes the com device plus
+ * the matching *_params and/or *_result structure declared above and
+ * returns an int status.
+ */
+void efa_com_set_dma_addr(dma_addr_t addr, u32 *addr_high, u32 *addr_low);
+int efa_com_create_qp(struct efa_com_dev *edev,
+                     struct efa_com_create_qp_params *params,
+                     struct efa_com_create_qp_result *res);
+int efa_com_modify_qp(struct efa_com_dev *edev,
+                     struct efa_com_modify_qp_params *params);
+int efa_com_query_qp(struct efa_com_dev *edev,
+                    struct efa_com_query_qp_params *params,
+                    struct efa_com_query_qp_result *result);
+int efa_com_destroy_qp(struct efa_com_dev *edev,
+                      struct efa_com_destroy_qp_params *params);
+int efa_com_create_cq(struct efa_com_dev *edev,
+                     struct efa_com_create_cq_params *params,
+                     struct efa_com_create_cq_result *result);
+int efa_com_destroy_cq(struct efa_com_dev *edev,
+                      struct efa_com_destroy_cq_params *params);
+int efa_com_register_mr(struct efa_com_dev *edev,
+                       struct efa_com_reg_mr_params *params,
+                       struct efa_com_reg_mr_result *result);
+int efa_com_dereg_mr(struct efa_com_dev *edev,
+                    struct efa_com_dereg_mr_params *params);
+int efa_com_create_ah(struct efa_com_dev *edev,
+                     struct efa_com_create_ah_params *params,
+                     struct efa_com_create_ah_result *result);
+int efa_com_destroy_ah(struct efa_com_dev *edev,
+                      struct efa_com_destroy_ah_params *params);
+int efa_com_get_network_attr(struct efa_com_dev *edev,
+                            struct efa_com_get_network_attr_result *result);
+int efa_com_get_device_attr(struct efa_com_dev *edev,
+                           struct efa_com_get_device_attr_result *result);
+int efa_com_get_hw_hints(struct efa_com_dev *edev,
+                        struct efa_com_get_hw_hints_result *result);
+int efa_com_set_aenq_config(struct efa_com_dev *edev, u32 groups);
+int efa_com_alloc_pd(struct efa_com_dev *edev,
+                    struct efa_com_alloc_pd_result *result);
+int efa_com_dealloc_pd(struct efa_com_dev *edev,
+                      struct efa_com_dealloc_pd_params *params);
+int efa_com_alloc_uar(struct efa_com_dev *edev,
+                     struct efa_com_alloc_uar_result *result);
+int efa_com_dealloc_uar(struct efa_com_dev *edev,
+                       struct efa_com_dealloc_uar_params *params);
+
+#endif /* _EFA_COM_CMD_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_common_defs.h b/drivers/infiniband/hw/efa/efa_common_defs.h
new file mode 100644 (file)
index 0000000..c559ec0
--- /dev/null
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_COMMON_H_
+#define _EFA_COMMON_H_
+
+#define EFA_COMMON_SPEC_VERSION_MAJOR        2
+#define EFA_COMMON_SPEC_VERSION_MINOR        0
+
+/* An address split into its low and high 32-bit words. */
+struct efa_common_mem_addr {
+       u32 mem_addr_low;
+
+       u32 mem_addr_high;
+};
+
+#endif /* _EFA_COMMON_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_main.c b/drivers/infiniband/hw/efa/efa_main.c
new file mode 100644 (file)
index 0000000..db974ca
--- /dev/null
@@ -0,0 +1,533 @@
+// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_user_verbs.h>
+
+#include "efa.h"
+
+#define PCI_DEV_ID_EFA_VF 0xefa0
+
+/* PCI IDs bound by this driver; the empty entry terminates the table. */
+static const struct pci_device_id efa_pci_tbl[] = {
+       { PCI_VDEVICE(AMAZON, PCI_DEV_ID_EFA_VF) },
+       { }
+};
+
+MODULE_AUTHOR("Amazon.com, Inc. or its affiliates");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION(DEVICE_NAME);
+MODULE_DEVICE_TABLE(pci, efa_pci_tbl);
+
+#define EFA_REG_BAR 0
+#define EFA_MEM_BAR 2
+#define EFA_BASE_BAR_MASK (BIT(EFA_REG_BAR) | BIT(EFA_MEM_BAR))
+
+#define EFA_AENQ_ENABLED_GROUPS \
+       (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
+        BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
+
+/* Cache the address and MTU from @network_attr in @dev. */
+static void efa_update_network_attr(struct efa_dev *dev,
+                                   struct efa_com_get_network_attr_result *network_attr)
+{
+       dev->mtu = network_attr->mtu;
+       memcpy(dev->addr, network_attr->addr, sizeof(network_attr->addr));
+
+       dev_dbg(&dev->pdev->dev, "Full address %pI6\n", dev->addr);
+}
+
+/*
+ * Fallback AENQ handler: called for an unknown event group or for a group
+ * whose handler is not implemented. Only logs an error.
+ */
+static void unimplemented_aenq_handler(void *data,
+                                      struct efa_admin_aenq_entry *aenq_e)
+{
+       struct efa_dev *efa = data;
+
+       ibdev_err(&efa->ibdev,
+                 "Unknown event was received or event with unimplemented handler\n");
+}
+
+/* AENQ keep-alive handler: bumps the keep_alive_rcvd statistic. */
+static void efa_keep_alive(void *data, struct efa_admin_aenq_entry *aenq_e)
+{
+       struct efa_dev *efa = data;
+
+       atomic64_inc(&efa->stats.keep_alive_rcvd);
+}
+
+/*
+ * AENQ dispatch table handed to efa_com_admin_init(). Only the keep-alive
+ * group has a dedicated handler; unimplemented_handler is the fallback.
+ */
+static struct efa_aenq_handlers aenq_handlers = {
+       .handlers = {
+               [EFA_ADMIN_KEEP_ALIVE] = efa_keep_alive,
+       },
+       .unimplemented_handler = unimplemented_aenq_handler
+};
+
+/* Release the memory BARs of @dev's PCI device selected by @bars_mask. */
+static void efa_release_bars(struct efa_dev *dev, int bars_mask)
+{
+       int mem_bars = pci_select_bars(dev->pdev, IORESOURCE_MEM);
+
+       pci_release_selected_regions(dev->pdev, mem_bars & bars_mask);
+}
+
+/*
+ * Management interrupt handler: runs the admin completion handler and then
+ * the AENQ handler. @data is the struct efa_dev registered with the IRQ.
+ */
+static irqreturn_t efa_intr_msix_mgmnt(int irq, void *data)
+{
+       struct efa_dev *efa = data;
+       struct efa_com_dev *edev = &efa->edev;
+
+       efa_com_admin_q_comp_intr_handler(edev);
+       efa_com_aenq_intr_handler(edev, data);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Request the admin IRQ described by dev->admin_irq and apply its affinity
+ * hint. Returns 0 on success or the error from request_irq().
+ */
+static int efa_request_mgmnt_irq(struct efa_dev *dev)
+{
+       struct efa_irq *admin = &dev->admin_irq;
+       int rc;
+
+       rc = request_irq(admin->vector, admin->handler, 0, admin->name,
+                        admin->data);
+       if (rc) {
+               dev_err(&dev->pdev->dev, "Failed to request admin irq (%d)\n",
+                       rc);
+               return rc;
+       }
+
+       dev_dbg(&dev->pdev->dev, "Set affinity hint of mgmnt irq to %*pbl (irq vector: %d)\n",
+               nr_cpumask_bits, &admin->affinity_hint_mask, admin->vector);
+       irq_set_affinity_hint(admin->vector, &admin->affinity_hint_mask);
+
+       return 0;
+}
+
+/*
+ * Fill dev->admin_irq (name, handler, cookie, vector, cpu) and set the first
+ * online CPU in its affinity hint mask. Does not request the IRQ.
+ */
+static void efa_setup_mgmnt_irq(struct efa_dev *dev)
+{
+       struct efa_irq *admin = &dev->admin_irq;
+       u32 first_cpu;
+
+       snprintf(admin->name, EFA_IRQNAME_SIZE,
+                "efa-mgmnt@pci:%s", pci_name(dev->pdev));
+       admin->handler = efa_intr_msix_mgmnt;
+       admin->data = dev;
+       admin->vector = pci_irq_vector(dev->pdev, dev->admin_msix_vector_idx);
+
+       first_cpu = cpumask_first(cpu_online_mask);
+       admin->cpu = first_cpu;
+       cpumask_set_cpu(first_cpu, &admin->affinity_hint_mask);
+
+       dev_info(&dev->pdev->dev, "Setup irq:0x%p vector:%d name:%s\n",
+                admin, admin->vector, admin->name);
+}
+
+/* Clear the admin IRQ's affinity hint and free the IRQ. */
+static void efa_free_mgmnt_irq(struct efa_dev *dev)
+{
+       struct efa_irq *admin = &dev->admin_irq;
+
+       irq_set_affinity_hint(admin->vector, NULL);
+       free_irq(admin->vector, admin->data);
+}
+
+/*
+ * Describe the management IRQ and then request it.
+ * Returns the result of efa_request_mgmnt_irq().
+ */
+static int efa_set_mgmnt_irq(struct efa_dev *dev)
+{
+       int rc;
+
+       efa_setup_mgmnt_irq(dev);
+       rc = efa_request_mgmnt_irq(dev);
+
+       return rc;
+}
+
+/*
+ * Claim the doorbell BAR reported in dev->dev_attr.db_bar and record its
+ * start and length in @dev.
+ *
+ * The region is requested only when the doorbell BAR is not one of the
+ * base BARs in EFA_BASE_BAR_MASK. Returns 0 on success or the error from
+ * pci_request_selected_regions().
+ */
+static int efa_request_doorbell_bar(struct efa_dev *dev)
+{
+       struct pci_dev *pdev = dev->pdev;
+       u8 db_bar = dev->dev_attr.db_bar;
+       bool is_base_bar = BIT(db_bar) & EFA_BASE_BAR_MASK;
+
+       if (!is_base_bar) {
+               int bars = pci_select_bars(pdev, IORESOURCE_MEM) & BIT(db_bar);
+               int rc = pci_request_selected_regions(pdev, bars,
+                                                     DRV_MODULE_NAME);
+
+               if (rc) {
+                       dev_err(&dev->pdev->dev,
+                               "pci_request_selected_regions for bar %d failed %d\n",
+                               db_bar, rc);
+                       return rc;
+               }
+       }
+
+       dev->db_bar_addr = pci_resource_start(dev->pdev, db_bar);
+       dev->db_bar_len = pci_resource_len(dev->pdev, db_bar);
+
+       return 0;
+}
+
+/*
+ * Release the doorbell BAR, unless it is one of the base BARs in
+ * EFA_BASE_BAR_MASK.
+ */
+static void efa_release_doorbell_bar(struct efa_dev *dev)
+{
+       if (BIT(dev->dev_attr.db_bar) & EFA_BASE_BAR_MASK)
+               return;
+
+       efa_release_bars(dev, BIT(dev->dev_attr.db_bar));
+}
+
+/*
+ * Apply the non-zero hints in @hw_hints to the com device: the admin queue
+ * poll interval and completion timeout are copied as is, and the MMIO read
+ * timeout is multiplied by 1000. Zero-valued hints leave the current
+ * setting unchanged.
+ */
+static void efa_update_hw_hints(struct efa_dev *dev,
+                               struct efa_com_get_hw_hints_result *hw_hints)
+{
+       struct efa_com_dev *com = &dev->edev;
+
+       if (hw_hints->admin_completion_timeout)
+               com->aq.completion_timeout =
+                       hw_hints->admin_completion_timeout;
+
+       if (hw_hints->poll_interval)
+               com->aq.poll_interval = hw_hints->poll_interval;
+
+       if (hw_hints->mmio_read_timeout)
+               com->mmio_read.mmio_read_timeout =
+                       hw_hints->mmio_read_timeout * 1000;
+}
+
+/* Zero every counter in dev->stats, treating it as an atomic64_t array. */
+static void efa_stats_init(struct efa_dev *dev)
+{
+       atomic64_t *counter = (atomic64_t *)&dev->stats;
+       atomic64_t *end = counter + sizeof(dev->stats) / sizeof(*counter);
+
+       for (; counter < end; counter++)
+               atomic64_set(counter, 0);
+}
+
+/*
+ * Verbs callbacks of the EFA device; installed on the ib_device with
+ * ib_set_device_ops() in efa_ib_device_add().
+ */
+static const struct ib_device_ops efa_dev_ops = {
+       .alloc_pd = efa_alloc_pd,
+       .alloc_ucontext = efa_alloc_ucontext,
+       .create_ah = efa_create_ah,
+       .create_cq = efa_create_cq,
+       .create_qp = efa_create_qp,
+       .dealloc_pd = efa_dealloc_pd,
+       .dealloc_ucontext = efa_dealloc_ucontext,
+       .dereg_mr = efa_dereg_mr,
+       .destroy_ah = efa_destroy_ah,
+       .destroy_cq = efa_destroy_cq,
+       .destroy_qp = efa_destroy_qp,
+       .get_link_layer = efa_port_link_layer,
+       .get_port_immutable = efa_get_port_immutable,
+       .mmap = efa_mmap,
+       .modify_qp = efa_modify_qp,
+       .query_device = efa_query_device,
+       .query_gid = efa_query_gid,
+       .query_pkey = efa_query_pkey,
+       .query_port = efa_query_port,
+       .query_qp = efa_query_qp,
+       .reg_user_mr = efa_reg_mr,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, efa_ah, ibah),
+       INIT_RDMA_OBJ_SIZE(ib_pd, efa_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_ucontext, efa_ucontext, ibucontext),
+};
+
+/*
+ * Query the device and register it with the IB core.
+ *
+ * Sequence: reset statistics, read device attributes, claim the doorbell
+ * BAR, read network attributes and hw hints, enable the AENQ groups in
+ * EFA_AENQ_ENABLED_GROUPS, fill the ib_device fields and call
+ * ib_register_device().
+ *
+ * Returns 0 on success. On any failure after the doorbell BAR was claimed
+ * the BAR is released before the error is returned.
+ */
+static int efa_ib_device_add(struct efa_dev *dev)
+{
+       struct efa_com_get_network_attr_result network_attr;
+       struct efa_com_get_hw_hints_result hw_hints;
+       struct pci_dev *pdev = dev->pdev;
+       int err;
+
+       efa_stats_init(dev);
+
+       err = efa_com_get_device_attr(&dev->edev, &dev->dev_attr);
+       if (err)
+               return err;
+
+       dev_dbg(&dev->pdev->dev, "Doorbells bar (%d)\n", dev->dev_attr.db_bar);
+       err = efa_request_doorbell_bar(dev);
+       if (err)
+               return err;
+
+       err = efa_com_get_network_attr(&dev->edev, &network_attr);
+       if (err)
+               goto err_release_doorbell_bar;
+
+       efa_update_network_attr(dev, &network_attr);
+
+       err = efa_com_get_hw_hints(&dev->edev, &hw_hints);
+       if (err)
+               goto err_release_doorbell_bar;
+
+       efa_update_hw_hints(dev, &hw_hints);
+
+       /* Try to enable all the available aenq groups */
+       err = efa_com_set_aenq_config(&dev->edev, EFA_AENQ_ENABLED_GROUPS);
+       if (err)
+               goto err_release_doorbell_bar;
+
+       dev->ibdev.owner = THIS_MODULE;
+       dev->ibdev.node_type = RDMA_NODE_UNSPECIFIED;
+       dev->ibdev.phys_port_cnt = 1;
+       dev->ibdev.num_comp_vectors = 1;
+       dev->ibdev.dev.parent = &pdev->dev;
+       dev->ibdev.uverbs_abi_ver = EFA_UVERBS_ABI_VERSION;
+
+       /* One bit per IB_USER_VERBS_CMD_* listed below */
+       dev->ibdev.uverbs_cmd_mask =
+               (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+               (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+               (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+               (1ull << IB_USER_VERBS_CMD_REG_MR) |
+               (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+               (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+               (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+               (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+               (1ull << IB_USER_VERBS_CMD_DESTROY_AH);
+
+       dev->ibdev.uverbs_ex_cmd_mask =
+               (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
+
+       dev->ibdev.driver_id = RDMA_DRIVER_EFA;
+       ib_set_device_ops(&dev->ibdev, &efa_dev_ops);
+
+       err = ib_register_device(&dev->ibdev, "efa_%d");
+       if (err)
+               goto err_release_doorbell_bar;
+
+       ibdev_info(&dev->ibdev, "IB device registered\n");
+
+       return 0;
+
+err_release_doorbell_bar:
+       efa_release_doorbell_bar(dev);
+       return err;
+}
+
+/*
+ * Counterpart of efa_ib_device_add(): resets the device, unregisters the
+ * IB device and releases the doorbell BAR, in that order.
+ */
+static void efa_ib_device_remove(struct efa_dev *dev)
+{
+       efa_com_dev_reset(&dev->edev, EFA_REGS_RESET_NORMAL);
+       ibdev_info(&dev->ibdev, "Unregister ib device\n");
+       ib_unregister_device(&dev->ibdev);
+       efa_release_doorbell_bar(dev);
+}
+
+/* Free the IRQ vectors allocated for @dev's PCI device. */
+static void efa_disable_msix(struct efa_dev *dev)
+{
+       struct pci_dev *pdev = dev->pdev;
+
+       pci_free_irq_vectors(pdev);
+}
+
+/*
+ * Allocate exactly EFA_NUM_MSIX_VEC MSI-X vectors for the device and record
+ * EFA_MGMNT_MSIX_VEC_IDX as the admin vector index.
+ *
+ * Returns 0 on success or -ENOSPC if the vectors could not be allocated.
+ * No vectors are left allocated on failure.
+ */
+static int efa_enable_msix(struct efa_dev *dev)
+{
+       int msix_vecs, irq_num;
+
+       /* Reserve the max msix vectors we might need */
+       msix_vecs = EFA_NUM_MSIX_VEC;
+       dev_dbg(&dev->pdev->dev, "Trying to enable MSI-X, vectors %d\n",
+               msix_vecs);
+
+       dev->admin_msix_vector_idx = EFA_MGMNT_MSIX_VEC_IDX;
+       irq_num = pci_alloc_irq_vectors(dev->pdev, msix_vecs,
+                                       msix_vecs, PCI_IRQ_MSIX);
+
+       if (irq_num < 0) {
+               dev_err(&dev->pdev->dev, "Failed to enable MSI-X. irq_num %d\n",
+                       irq_num);
+               return -ENOSPC;
+       }
+
+       if (irq_num != msix_vecs) {
+               /* Vectors were allocated; release them before failing. */
+               efa_disable_msix(dev);
+               dev_err(&dev->pdev->dev,
+                       "Allocated %d MSI-X (out of %d requested)\n",
+                       irq_num, msix_vecs);
+               return -ENOSPC;
+       }
+
+       return 0;
+}
+
+/*
+ * Bring the device to a known state: reset it, validate its version and
+ * set the streaming and consistent DMA masks to the width reported by
+ * efa_com_get_dma_width().
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int efa_device_init(struct efa_com_dev *edev, struct pci_dev *pdev)
+{
+       int width;
+       int rc;
+
+       rc = efa_com_dev_reset(edev, EFA_REGS_RESET_NORMAL);
+       if (rc)
+               return rc;
+
+       rc = efa_com_validate_version(edev);
+       if (rc)
+               return rc;
+
+       /* A negative width is an error code. */
+       width = efa_com_get_dma_width(edev);
+       if (width < 0)
+               return width;
+
+       rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(width));
+       if (rc) {
+               dev_err(&pdev->dev, "pci_set_dma_mask failed %d\n", rc);
+               return rc;
+       }
+
+       rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(width));
+       if (rc)
+               dev_err(&pdev->dev,
+                       "err_pci_set_consistent_dma_mask failed %d\n",
+                       rc);
+
+       return rc;
+}
+
+/*
+ * Bring up the PCI side of an EFA device.
+ *
+ * Steps, in order: enable the PCI device and bus mastering, allocate the
+ * ib_device-embedded struct efa_dev, claim the base BARs, map the register
+ * BAR, initialise readless MMIO, run efa_device_init(), enable MSI-X, set
+ * up the management IRQ and initialise the admin queue.
+ *
+ * Returns the new device on success or an ERR_PTR() on failure; every step
+ * completed before the failure is undone through the labels at the end of
+ * the function.
+ */
+static struct efa_dev *efa_probe_device(struct pci_dev *pdev)
+{
+       struct efa_com_dev *edev;
+       struct efa_dev *dev;
+       int bars;
+       int err;
+
+       err = pci_enable_device_mem(pdev);
+       if (err) {
+               dev_err(&pdev->dev, "pci_enable_device_mem() failed!\n");
+               return ERR_PTR(err);
+       }
+
+       pci_set_master(pdev);
+
+       dev = ib_alloc_device(efa_dev, ibdev);
+       if (!dev) {
+               dev_err(&pdev->dev, "Device alloc failed\n");
+               err = -ENOMEM;
+               goto err_disable_device;
+       }
+
+       pci_set_drvdata(pdev, dev);
+       edev = &dev->edev;
+       edev->efa_dev = dev;
+       edev->dmadev = &pdev->dev;
+       dev->pdev = pdev;
+
+       bars = pci_select_bars(pdev, IORESOURCE_MEM) & EFA_BASE_BAR_MASK;
+       err = pci_request_selected_regions(pdev, bars, DRV_MODULE_NAME);
+       if (err) {
+               dev_err(&pdev->dev, "pci_request_selected_regions failed %d\n",
+                       err);
+               goto err_ibdev_destroy;
+       }
+
+       dev->reg_bar_addr = pci_resource_start(pdev, EFA_REG_BAR);
+       dev->reg_bar_len = pci_resource_len(pdev, EFA_REG_BAR);
+       dev->mem_bar_addr = pci_resource_start(pdev, EFA_MEM_BAR);
+       dev->mem_bar_len = pci_resource_len(pdev, EFA_MEM_BAR);
+
+       edev->reg_bar = devm_ioremap(&pdev->dev,
+                                    dev->reg_bar_addr,
+                                    dev->reg_bar_len);
+       if (!edev->reg_bar) {
+               dev_err(&pdev->dev, "Failed to remap register bar\n");
+               err = -EFAULT;
+               goto err_release_bars;
+       }
+
+       err = efa_com_mmio_reg_read_init(edev);
+       if (err) {
+               dev_err(&pdev->dev, "Failed to init readless MMIO\n");
+               goto err_iounmap;
+       }
+
+       err = efa_device_init(edev, pdev);
+       if (err) {
+               dev_err(&pdev->dev, "EFA device init failed\n");
+               /* A timeout during init is reported as a deferred probe */
+               if (err == -ETIME)
+                       err = -EPROBE_DEFER;
+               goto err_reg_read_destroy;
+       }
+
+       err = efa_enable_msix(dev);
+       if (err)
+               goto err_reg_read_destroy;
+
+       /* The admin queue and the AENQ share the management vector */
+       edev->aq.msix_vector_idx = dev->admin_msix_vector_idx;
+       edev->aenq.msix_vector_idx = dev->admin_msix_vector_idx;
+
+       err = efa_set_mgmnt_irq(dev);
+       if (err)
+               goto err_disable_msix;
+
+       err = efa_com_admin_init(edev, &aenq_handlers);
+       if (err)
+               goto err_free_mgmnt_irq;
+
+       return dev;
+
+err_free_mgmnt_irq:
+       efa_free_mgmnt_irq(dev);
+err_disable_msix:
+       efa_disable_msix(dev);
+err_reg_read_destroy:
+       efa_com_mmio_reg_read_destroy(edev);
+err_iounmap:
+       devm_iounmap(&pdev->dev, edev->reg_bar);
+err_release_bars:
+       efa_release_bars(dev, EFA_BASE_BAR_MASK);
+err_ibdev_destroy:
+       ib_dealloc_device(&dev->ibdev);
+err_disable_device:
+       pci_disable_device(pdev);
+       return ERR_PTR(err);
+}
+
+/*
+ * Undo efa_probe_device(): destroy the admin queue, then release resources
+ * in the same order as the error-unwind labels of efa_probe_device().
+ */
+static void efa_remove_device(struct pci_dev *pdev)
+{
+       struct efa_dev *dev = pci_get_drvdata(pdev);
+       struct efa_com_dev *edev;
+
+       edev = &dev->edev;
+       efa_com_admin_destroy(edev);
+       efa_free_mgmnt_irq(dev);
+       efa_disable_msix(dev);
+       efa_com_mmio_reg_read_destroy(edev);
+       devm_iounmap(&pdev->dev, edev->reg_bar);
+       efa_release_bars(dev, EFA_BASE_BAR_MASK);
+       ib_dealloc_device(&dev->ibdev);
+       pci_disable_device(pdev);
+}
+
+/*
+ * PCI probe callback: bring up the PCI device, then register the IB
+ * device. If IB registration fails the PCI bring-up is undone.
+ */
+static int efa_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+       struct efa_dev *efa;
+       int rc;
+
+       efa = efa_probe_device(pdev);
+       if (IS_ERR(efa))
+               return PTR_ERR(efa);
+
+       rc = efa_ib_device_add(efa);
+       if (rc)
+               efa_remove_device(pdev);
+
+       return rc;
+}
+
+/* PCI remove callback: tear down the IB device, then the PCI device. */
+static void efa_remove(struct pci_dev *pdev)
+{
+       efa_ib_device_remove(pci_get_drvdata(pdev));
+       efa_remove_device(pdev);
+}
+
+/*
+ * PCI driver description: binds the IDs in efa_pci_tbl to efa_probe() and
+ * efa_remove(). Registered through module_pci_driver().
+ */
+static struct pci_driver efa_pci_driver = {
+       .name           = DRV_MODULE_NAME,
+       .id_table       = efa_pci_tbl,
+       .probe          = efa_probe,
+       .remove         = efa_remove,
+};
+
+module_pci_driver(efa_pci_driver);
diff --git a/drivers/infiniband/hw/efa/efa_regs_defs.h b/drivers/infiniband/hw/efa/efa_regs_defs.h
new file mode 100644 (file)
index 0000000..bb9cad3
--- /dev/null
@@ -0,0 +1,113 @@
+/* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef _EFA_REGS_H_
+#define _EFA_REGS_H_
+
+/*
+ * Device reset reason codes.
+ * NOTE(review): presumably the value carried in the RESET_REASON field of the
+ * dev_ctl register (EFA_REGS_DEV_CTL_RESET_REASON_*); the code that writes it
+ * is not in this file - confirm.
+ */
+enum efa_regs_reset_reason_types {
+       EFA_REGS_RESET_NORMAL                       = 0,
+       /* Keep alive timeout */
+       EFA_REGS_RESET_KEEP_ALIVE_TO                = 1,
+       EFA_REGS_RESET_ADMIN_TO                     = 2,
+       EFA_REGS_RESET_INIT_ERR                     = 3,
+       EFA_REGS_RESET_DRIVER_INVALID_STATE         = 4,
+       EFA_REGS_RESET_OS_TRIGGER                   = 5,
+       EFA_REGS_RESET_SHUTDOWN                     = 6,
+       EFA_REGS_RESET_USER_TRIGGER                 = 7,
+       EFA_REGS_RESET_GENERIC                      = 8,
+};
+
+/*
+ * efa_registers offsets
+ *
+ * Convention for the per-register field definitions further down: each *_MASK
+ * is already positioned at the field's bit location and the matching *_SHIFT
+ * gives that location. The lowest field of a register has only a mask, i.e.
+ * its shift is 0.
+ */
+
+/* 0 base */
+#define EFA_REGS_VERSION_OFF                                0x0
+#define EFA_REGS_CONTROLLER_VERSION_OFF                     0x4
+#define EFA_REGS_CAPS_OFF                                   0x8
+#define EFA_REGS_AQ_BASE_LO_OFF                             0x10
+#define EFA_REGS_AQ_BASE_HI_OFF                             0x14
+#define EFA_REGS_AQ_CAPS_OFF                                0x18
+#define EFA_REGS_ACQ_BASE_LO_OFF                            0x20
+#define EFA_REGS_ACQ_BASE_HI_OFF                            0x24
+#define EFA_REGS_ACQ_CAPS_OFF                               0x28
+#define EFA_REGS_AQ_PROD_DB_OFF                             0x2c
+#define EFA_REGS_AENQ_CAPS_OFF                              0x34
+#define EFA_REGS_AENQ_BASE_LO_OFF                           0x38
+#define EFA_REGS_AENQ_BASE_HI_OFF                           0x3c
+#define EFA_REGS_AENQ_CONS_DB_OFF                           0x40
+#define EFA_REGS_INTR_MASK_OFF                              0x4c
+#define EFA_REGS_DEV_CTL_OFF                                0x54
+#define EFA_REGS_DEV_STS_OFF                                0x58
+#define EFA_REGS_MMIO_REG_READ_OFF                          0x5c
+#define EFA_REGS_MMIO_RESP_LO_OFF                           0x60
+#define EFA_REGS_MMIO_RESP_HI_OFF                           0x64
+
+/* version register */
+#define EFA_REGS_VERSION_MINOR_VERSION_MASK                 0xff
+#define EFA_REGS_VERSION_MAJOR_VERSION_SHIFT                8
+#define EFA_REGS_VERSION_MAJOR_VERSION_MASK                 0xff00
+
+/* controller_version register */
+#define EFA_REGS_CONTROLLER_VERSION_SUBMINOR_VERSION_MASK   0xff
+#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_SHIFT     8
+#define EFA_REGS_CONTROLLER_VERSION_MINOR_VERSION_MASK      0xff00
+#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_SHIFT     16
+#define EFA_REGS_CONTROLLER_VERSION_MAJOR_VERSION_MASK      0xff0000
+#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_SHIFT           24
+#define EFA_REGS_CONTROLLER_VERSION_IMPL_ID_MASK            0xff000000
+
+/* caps register */
+#define EFA_REGS_CAPS_CONTIGUOUS_QUEUE_REQUIRED_MASK        0x1
+#define EFA_REGS_CAPS_RESET_TIMEOUT_SHIFT                   1
+#define EFA_REGS_CAPS_RESET_TIMEOUT_MASK                    0x3e
+#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_SHIFT                  8
+#define EFA_REGS_CAPS_DMA_ADDR_WIDTH_MASK                   0xff00
+#define EFA_REGS_CAPS_ADMIN_CMD_TO_SHIFT                    16
+#define EFA_REGS_CAPS_ADMIN_CMD_TO_MASK                     0xf0000
+
+/* aq_caps register */
+#define EFA_REGS_AQ_CAPS_AQ_DEPTH_MASK                      0xffff
+#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_SHIFT                16
+#define EFA_REGS_AQ_CAPS_AQ_ENTRY_SIZE_MASK                 0xffff0000
+
+/* acq_caps register */
+#define EFA_REGS_ACQ_CAPS_ACQ_DEPTH_MASK                    0xffff
+#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_SHIFT              16
+#define EFA_REGS_ACQ_CAPS_ACQ_ENTRY_SIZE_MASK               0xff0000
+#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_SHIFT             24
+#define EFA_REGS_ACQ_CAPS_ACQ_MSIX_VECTOR_MASK              0xff000000
+
+/* aenq_caps register */
+#define EFA_REGS_AENQ_CAPS_AENQ_DEPTH_MASK                  0xffff
+#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_SHIFT            16
+#define EFA_REGS_AENQ_CAPS_AENQ_ENTRY_SIZE_MASK             0xff0000
+#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_SHIFT           24
+#define EFA_REGS_AENQ_CAPS_AENQ_MSIX_VECTOR_MASK            0xff000000
+
+/* dev_ctl register */
+#define EFA_REGS_DEV_CTL_DEV_RESET_MASK                     0x1
+#define EFA_REGS_DEV_CTL_AQ_RESTART_SHIFT                   1
+#define EFA_REGS_DEV_CTL_AQ_RESTART_MASK                    0x2
+#define EFA_REGS_DEV_CTL_RESET_REASON_SHIFT                 28
+#define EFA_REGS_DEV_CTL_RESET_REASON_MASK                  0xf0000000
+
+/* dev_sts register */
+#define EFA_REGS_DEV_STS_READY_MASK                         0x1
+#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_SHIFT       1
+#define EFA_REGS_DEV_STS_AQ_RESTART_IN_PROGRESS_MASK        0x2
+#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_SHIFT          2
+#define EFA_REGS_DEV_STS_AQ_RESTART_FINISHED_MASK           0x4
+#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_SHIFT            3
+#define EFA_REGS_DEV_STS_RESET_IN_PROGRESS_MASK             0x8
+#define EFA_REGS_DEV_STS_RESET_FINISHED_SHIFT               4
+#define EFA_REGS_DEV_STS_RESET_FINISHED_MASK                0x10
+#define EFA_REGS_DEV_STS_FATAL_ERROR_SHIFT                  5
+#define EFA_REGS_DEV_STS_FATAL_ERROR_MASK                   0x20
+
+/* mmio_reg_read register */
+#define EFA_REGS_MMIO_REG_READ_REQ_ID_MASK                  0xffff
+#define EFA_REGS_MMIO_REG_READ_REG_OFF_SHIFT                16
+#define EFA_REGS_MMIO_REG_READ_REG_OFF_MASK                 0xffff0000
+
+#endif /* _EFA_REGS_H_ */
diff --git a/drivers/infiniband/hw/efa/efa_verbs.c b/drivers/infiniband/hw/efa/efa_verbs.c
new file mode 100644 (file)
index 0000000..6d6886c
--- /dev/null
@@ -0,0 +1,1825 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/uverbs_ioctl.h>
+
+#include "efa.h"
+
+/*
+ * mmap key layout (see get_mmap_key()): the mmap flag sits at bit
+ * EFA_MMAP_FLAG_SHIFT and above, the page index shifted by PAGE_SHIFT sits
+ * below it. EFA_MMAP_INVALID is what mmap_entry_insert() returns on failure.
+ */
+#define EFA_MMAP_FLAG_SHIFT 56
+#define EFA_MMAP_PAGE_MASK GENMASK(EFA_MMAP_FLAG_SHIFT - 1, 0)
+#define EFA_MMAP_INVALID U64_MAX
+
+/*
+ * Values stored in efa_mmap_entry.mmap_flag. Entries flagged
+ * EFA_MMAP_DMA_PAGE own kernel pages that are freed when the ucontext's mmap
+ * entries are torn down.
+ * NOTE(review): WC/NC presumably stand for write-combining / non-cached BAR
+ * mappings - confirm against the mmap handler.
+ */
+enum {
+       EFA_MMAP_DMA_PAGE = 0,
+       EFA_MMAP_IO_WC,
+       EFA_MMAP_IO_NC,
+};
+
+/* Bitmap of the admin event groups named below. */
+#define EFA_AENQ_ENABLED_GROUPS \
+       (BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
+        BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
+
+/*
+ * Describes one region a user context may mmap. Entries live in the
+ * ucontext's mmap_xa xarray, indexed by mmap_page.
+ */
+struct efa_mmap_entry {
+       /* Object the region was created for (printed in debug messages) */
+       void  *obj;
+       /* Start address of the region: a BAR address or virt_to_phys() of pages */
+       u64 address;
+       /* Length of the region in bytes */
+       u64 length;
+       /* First page index reserved for this entry in the key space */
+       u32 mmap_page;
+       /* One of the EFA_MMAP_* values */
+       u8 mmap_flag;
+};
+
+/*
+ * Build the 64-bit mmap key of an entry: the flag goes in the bits from
+ * EFA_MMAP_FLAG_SHIFT up, the page index (scaled by the page size) below.
+ */
+static inline u64 get_mmap_key(const struct efa_mmap_entry *efa)
+{
+       u64 flag_bits = (u64)efa->mmap_flag << EFA_MMAP_FLAG_SHIFT;
+       u64 page_bits = (u64)efa->mmap_page << PAGE_SHIFT;
+
+       return flag_bits | page_bits;
+}
+
+/* Each payload pointer is 8 bytes and refers to a BIT(12) = 4 KiB payload. */
+#define EFA_CHUNK_PAYLOAD_SHIFT       12
+#define EFA_CHUNK_PAYLOAD_SIZE        BIT(EFA_CHUNK_PAYLOAD_SHIFT)
+#define EFA_CHUNK_PAYLOAD_PTR_SIZE    8
+
+/*
+ * A chunk is 4 KiB and, besides payload pointers, has room reserved for one
+ * struct efa_com_ctrl_buff_info.
+ */
+#define EFA_CHUNK_SHIFT               12
+#define EFA_CHUNK_SIZE                BIT(EFA_CHUNK_SHIFT)
+#define EFA_CHUNK_PTR_SIZE            sizeof(struct efa_com_ctrl_buff_info)
+
+/* Number of payload pointers that fit next to the ctrl_buff_info */
+#define EFA_PTRS_PER_CHUNK \
+       ((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE)
+
+/* Bytes of a chunk actually occupied by the pointers plus the ctrl_buff_info */
+#define EFA_CHUNK_USED_SIZE \
+       ((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
+
+/* NOTE(review): presumably the only MR access flag accepted - confirm at the user. */
+#define EFA_SUPPORTED_ACCESS_FLAGS IB_ACCESS_LOCAL_WRITE
+
+/*
+ * One chunk of a page list.
+ * NOTE(review): the users of this struct are outside this view; presumably
+ * @buf is the CPU address of the chunk, @dma_addr its DMA mapping and @length
+ * its size in bytes - confirm.
+ */
+struct pbl_chunk {
+       dma_addr_t dma_addr;
+       u64 *buf;
+       u32 length;
+};
+
+/*
+ * An array of pbl_chunk.
+ * NOTE(review): @size is presumably the number of elements in @chunks -
+ * confirm against the allocation site.
+ */
+struct pbl_chunk_list {
+       struct pbl_chunk *chunks;
+       unsigned int size;
+};
+
+/*
+ * State of a page list buffer (pbl_buf). The @phys union holds either the
+ * "continuous" or the "indirect" description.
+ * NOTE(review): presumably @physically_continuous selects which union member
+ * is valid - confirm against the code that fills this struct.
+ */
+struct pbl_context {
+       union {
+               struct {
+                       dma_addr_t dma_addr;
+               } continuous;
+               struct {
+                       u32 pbl_buf_size_in_pages;
+                       struct scatterlist *sgl;
+                       int sg_dma_cnt;
+                       struct pbl_chunk_list chunk_list;
+               } indirect;
+       } phys;
+       u64 *pbl_buf;
+       u32 pbl_buf_size_in_bytes;
+       u8 physically_continuous;
+};
+
+/* Get the efa_dev that embeds @ibdev as its ibdev member. */
+static inline struct efa_dev *to_edev(struct ib_device *ibdev)
+{
+       return container_of(ibdev, struct efa_dev, ibdev);
+}
+
+/* Get the efa_ucontext that embeds @ibucontext as its ibucontext member. */
+static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext)
+{
+       return container_of(ibucontext, struct efa_ucontext, ibucontext);
+}
+
+/* Get the efa_pd that embeds @ibpd as its ibpd member. */
+static inline struct efa_pd *to_epd(struct ib_pd *ibpd)
+{
+       return container_of(ibpd, struct efa_pd, ibpd);
+}
+
+/* Get the efa_mr that embeds @ibmr as its ibmr member. */
+static inline struct efa_mr *to_emr(struct ib_mr *ibmr)
+{
+       return container_of(ibmr, struct efa_mr, ibmr);
+}
+
+/* Get the efa_qp that embeds @ibqp as its ibqp member. */
+static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct efa_qp, ibqp);
+}
+
+/* Get the efa_cq that embeds @ibcq as its ibcq member. */
+static inline struct efa_cq *to_ecq(struct ib_cq *ibcq)
+{
+       return container_of(ibcq, struct efa_cq, ibcq);
+}
+
+/* Get the efa_ah that embeds @ibah as its ibah member. */
+static inline struct efa_ah *to_eah(struct ib_ah *ibah)
+{
+       return container_of(ibah, struct efa_ah, ibah);
+}
+
+/*
+ * True when the first @sz bytes of an object of x's type fully contain member
+ * @fld, i.e. offsetof(fld) + sizeof(fld) <= @sz. Used to check that a user
+ * supplied buffer is long enough to carry a given command field.
+ */
+#define field_avail(x, fld, sz) (offsetof(typeof(x), fld) + \
+                                sizeof(((typeof(x) *)0)->fld) <= (sz))
+
+/*
+ * True when every byte of the array @reserved is zero. @reserved must be a
+ * real array (not a pointer) so that sizeof() covers all of it. The expansion
+ * and the argument are parenthesized so the macro composes safely inside
+ * larger expressions.
+ */
+#define is_reserved_cleared(reserved) \
+       (!memchr_inv((reserved), 0, sizeof(reserved)))
+
+/*
+ * Allocate @size bytes of zeroed pages and DMA-map them for @dir.
+ *
+ * On success returns the CPU address and stores the DMA address in
+ * *@dma_addr. Returns NULL when the allocation or the mapping fails; the
+ * pages are released again in the latter case.
+ */
+static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
+                              size_t size, enum dma_data_direction dir)
+{
+       struct device *dma_dev;
+       void *buf;
+
+       buf = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
+       if (!buf)
+               return NULL;
+
+       dma_dev = &dev->pdev->dev;
+       *dma_addr = dma_map_single(dma_dev, buf, size, dir);
+       if (!dma_mapping_error(dma_dev, *dma_addr))
+               return buf;
+
+       ibdev_err(&dev->ibdev, "Failed to map DMA address\n");
+       free_pages_exact(buf, size);
+       return NULL;
+}
+
+/*
+ * This is only called when the ucontext is destroyed and there can be no
+ * concurrent query via mmap or allocate on the xarray, thus we can be sure no
+ * other thread is using the entry pointer. We also know that all the BAR
+ * pages have either been zapped or munmapped at this point.  Normal pages are
+ * refcounted and will be freed at the proper time.
+ */
+static void mmap_entries_remove_free(struct efa_dev *dev,
+                                    struct efa_ucontext *ucontext)
+{
+       struct efa_mmap_entry *cur;
+       unsigned long idx;
+
+       xa_for_each(&ucontext->mmap_xa, idx, cur) {
+               xa_erase(&ucontext->mmap_xa, idx);
+
+               ibdev_dbg(
+                       &dev->ibdev,
+                       "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] removed\n",
+                       cur->obj, get_mmap_key(cur), cur->address,
+                       cur->length);
+
+               /*
+                * Only DMA page entries own memory: their DMA mapping is
+                * already gone, so the pages can be freed now.
+                */
+               if (cur->mmap_flag == EFA_MMAP_DMA_PAGE) {
+                       void *pages = phys_to_virt(cur->address);
+
+                       free_pages_exact(pages, cur->length);
+               }
+
+               kfree(cur);
+       }
+}
+
+/*
+ * Look up the mmap entry that @key refers to in @ucontext.
+ *
+ * The page index encoded in @key is used as the xarray index. The entry is
+ * accepted only when its complete key (flag and page) equals @key and its
+ * length equals @len. Returns the entry, or NULL when the index does not fit
+ * in 32 bits or nothing matches. The entry stays in the xarray.
+ */
+static struct efa_mmap_entry *mmap_entry_get(struct efa_dev *dev,
+                                            struct efa_ucontext *ucontext,
+                                            u64 key, u64 len)
+{
+       struct efa_mmap_entry *entry;
+       u64 mmap_page;
+
+       mmap_page = (key & EFA_MMAP_PAGE_MASK) >> PAGE_SHIFT;
+       if (mmap_page > U32_MAX)
+               return NULL;
+
+       entry = xa_load(&ucontext->mmap_xa, mmap_page);
+       if (!entry || get_mmap_key(entry) != key || entry->length != len)
+               return NULL;
+
+       /* This is a lookup: report "found", nothing is removed here */
+       ibdev_dbg(&dev->ibdev,
+                 "mmap: obj[0x%p] key[%#llx] addr[%#llx] len[%#llx] found\n",
+                 entry->obj, key, entry->address, entry->length);
+
+       return entry;
+}
+
+/*
+ * Note this locking scheme cannot support removal of entries, except during
+ * ucontext destruction when the core code guarantees no concurrency.
+ */
+/*
+ * Create an mmap entry for the region [@address, @address + @length) on
+ * behalf of @obj and store it in @ucontext's xarray.
+ *
+ * The entry reserves DIV_ROUND_UP(@length, PAGE_SIZE) consecutive page
+ * indices starting at the ucontext's running page counter. Returns the mmap
+ * key of the new entry, or EFA_MMAP_INVALID when the entry cannot be
+ * allocated, the 32-bit page index space would be exceeded, or the xarray
+ * insertion fails.
+ */
+static u64 mmap_entry_insert(struct efa_dev *dev, struct efa_ucontext *ucontext,
+                            void *obj, u64 address, u64 length, u8 mmap_flag)
+{
+       struct efa_mmap_entry *entry;
+       u64 next_page;
+       int err;
+
+       entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+       if (!entry)
+               return EFA_MMAP_INVALID;
+
+       entry->obj = obj;
+       entry->address = address;
+       entry->length = length;
+       entry->mmap_flag = mmap_flag;
+
+       xa_lock(&ucontext->mmap_xa);
+       /*
+        * mmap_page is only 32 bits wide. Refuse the insertion instead of
+        * letting the running counter wrap around, which would hand out page
+        * indices that overlap entries inserted earlier.
+        */
+       next_page = (u64)ucontext->mmap_xa_page +
+                   DIV_ROUND_UP(length, PAGE_SIZE);
+       if (next_page > U32_MAX) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+
+       entry->mmap_page = ucontext->mmap_xa_page;
+       ucontext->mmap_xa_page = next_page;
+       err = __xa_insert(&ucontext->mmap_xa, entry->mmap_page, entry,
+                         GFP_KERNEL);
+out_unlock:
+       xa_unlock(&ucontext->mmap_xa);
+       if (err) {
+               kfree(entry);
+               return EFA_MMAP_INVALID;
+       }
+
+       ibdev_dbg(
+               &dev->ibdev,
+               "mmap: obj[0x%p] addr[%#llx], len[%#llx], key[%#llx] inserted\n",
+               entry->obj, entry->address, entry->length, get_mmap_key(entry));
+
+       return get_mmap_key(entry);
+}
+
+/*
+ * Fill @props from the device attributes cached in the efa_dev and, when the
+ * caller supplied an output buffer in @udata, also copy the EFA specific
+ * response to it. Any input bytes in @udata must be zero.
+ *
+ * Returns 0 on success, -EINVAL for non-zero input, or the error of the copy
+ * to userspace.
+ */
+int efa_query_device(struct ib_device *ibdev,
+                    struct ib_device_attr *props,
+                    struct ib_udata *udata)
+{
+       struct efa_ibv_ex_query_device_resp resp = {};
+       struct efa_dev *dev = to_edev(ibdev);
+       struct efa_com_get_device_attr_result *attr = &dev->dev_attr;
+       struct pci_dev *pdev = dev->pdev;
+       int err;
+
+       if (udata && udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(ibdev,
+                         "Incompatible ABI params, udata not cleared\n");
+               return -EINVAL;
+       }
+
+       memset(props, 0, sizeof(*props));
+
+       /* Identification comes from the PCI function */
+       props->vendor_id = pdev->vendor;
+       props->vendor_part_id = pdev->device;
+       props->hw_ver = pdev->subsystem_device;
+
+       /* Limits come from the cached device attributes */
+       props->max_mr_size = attr->max_mr_pages * PAGE_SIZE;
+       props->page_size_cap = attr->page_size_cap;
+       props->max_qp = attr->max_qp;
+       props->max_cq = attr->max_cq;
+       props->max_pd = attr->max_pd;
+       props->max_mr = attr->max_mr;
+       props->max_ah = attr->max_ah;
+       props->max_cqe = attr->max_cq_depth;
+       props->max_qp_wr = min_t(u32, attr->max_sq_depth,
+                                attr->max_rq_depth);
+       props->max_send_sge = attr->max_sq_sge;
+       props->max_recv_sge = attr->max_rq_sge;
+
+       /* Nothing more to do unless the caller wants the driver response */
+       if (!udata || !udata->outlen)
+               return 0;
+
+       resp.max_sq_sge = attr->max_sq_sge;
+       resp.max_rq_sge = attr->max_rq_sge;
+       resp.max_sq_wr = attr->max_sq_depth;
+       resp.max_rq_wr = attr->max_rq_depth;
+
+       err = ib_copy_to_udata(udata, &resp,
+                              min(sizeof(resp), udata->outlen));
+       if (err)
+               ibdev_dbg(ibdev,
+                         "Failed to copy udata for query_device\n");
+
+       return err;
+}
+
+/*
+ * Report the port attributes. Everything is a fixed value except the MTU
+ * related fields, which are derived from dev->mtu. @port is not consulted.
+ * Always returns 0.
+ */
+int efa_query_port(struct ib_device *ibdev, u8 port,
+                  struct ib_port_attr *props)
+{
+       struct efa_dev *dev = to_edev(ibdev);
+       enum ib_mtu mtu = ib_mtu_int_to_enum(dev->mtu);
+
+       /* Fixed link description */
+       props->lmc = 1;
+       props->state = IB_PORT_ACTIVE;
+       props->phys_state = 5;
+       props->gid_tbl_len = 1;
+       props->pkey_tbl_len = 1;
+       props->active_speed = IB_SPEED_EDR;
+       props->active_width = IB_WIDTH_4X;
+
+       /* MTU dependent values */
+       props->max_mtu = mtu;
+       props->active_mtu = mtu;
+       props->max_msg_sz = dev->mtu;
+       props->max_vl_num = 1;
+
+       return 0;
+}
+
+/*
+ * Query the state of @ibqp from the device and report it in @qp_attr and
+ * @qp_init_attr. Capabilities are taken from the values saved in the efa_qp.
+ *
+ * Returns 0 on success, -EOPNOTSUPP when @qp_attr_mask has bits outside
+ * EFA_QUERY_QP_SUPP_MASK, or the error of the query command.
+ */
+int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+                int qp_attr_mask,
+                struct ib_qp_init_attr *qp_init_attr)
+{
+       struct efa_dev *dev = to_edev(ibqp->device);
+       struct efa_qp *qp = to_eqp(ibqp);
+       struct efa_com_query_qp_params query = { .qp_handle = qp->qp_handle };
+       struct efa_com_query_qp_result hw_state;
+       struct ib_qp_cap *cap = &qp_attr->cap;
+       int err;
+
+#define EFA_QUERY_QP_SUPP_MASK \
+       (IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \
+        IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP)
+
+       if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) {
+               ibdev_dbg(&dev->ibdev,
+                         "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
+                         qp_attr_mask, EFA_QUERY_QP_SUPP_MASK);
+               return -EOPNOTSUPP;
+       }
+
+       memset(qp_attr, 0, sizeof(*qp_attr));
+       memset(qp_init_attr, 0, sizeof(*qp_init_attr));
+
+       err = efa_com_query_qp(&dev->edev, &query, &hw_state);
+       if (err)
+               return err;
+
+       /* State reported by the device */
+       qp_attr->qp_state = hw_state.qp_state;
+       qp_attr->qkey = hw_state.qkey;
+       qp_attr->sq_psn = hw_state.sq_psn;
+       qp_attr->sq_draining = hw_state.sq_draining;
+       qp_attr->port_num = 1;
+
+       /* Capabilities remembered from QP creation */
+       cap->max_send_wr = qp->max_send_wr;
+       cap->max_recv_wr = qp->max_recv_wr;
+       cap->max_send_sge = qp->max_send_sge;
+       cap->max_recv_sge = qp->max_recv_sge;
+       cap->max_inline_data = qp->max_inline_data;
+
+       qp_init_attr->qp_type = ibqp->qp_type;
+       qp_init_attr->recv_cq = ibqp->recv_cq;
+       qp_init_attr->send_cq = ibqp->send_cq;
+       qp_init_attr->qp_context = ibqp->qp_context;
+       qp_init_attr->cap = *cap;
+
+       return 0;
+}
+
+/*
+ * Copy the device address (dev->addr) into @gid. @port and @index are not
+ * consulted. Always returns 0.
+ */
+int efa_query_gid(struct ib_device *ibdev, u8 port, int index,
+                 union ib_gid *gid)
+{
+       struct efa_dev *efa = to_edev(ibdev);
+
+       memcpy(gid->raw, efa->addr, sizeof(efa->addr));
+       return 0;
+}
+
+/*
+ * Only P_Key index 0 exists and it always holds 0xffff. Returns -EINVAL for
+ * any other @index.
+ */
+int efa_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+                  u16 *pkey)
+{
+       if (index != 0)
+               return -EINVAL;
+
+       *pkey = 0xffff;
+
+       return 0;
+}
+
+/* Issue the dealloc PD command for @pdn and return its result. */
+static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn)
+{
+       struct efa_com_dealloc_pd_params dealloc = {};
+
+       dealloc.pdn = pdn;
+
+       return efa_com_dealloc_pd(&dev->edev, &dealloc);
+}
+
+/*
+ * Allocate a protection domain on the device and report its number to
+ * userspace. Any input bytes in @udata must be zero.
+ *
+ * Returns 0 on success or a negative errno. Every failure is counted in the
+ * alloc_pd_err software statistic, and a PD that was already allocated is
+ * released again when the response cannot be copied out.
+ */
+int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibpd->device);
+       struct efa_ibv_alloc_pd_resp resp = {};
+       struct efa_com_alloc_pd_result hw_pd;
+       struct efa_pd *pd = to_epd(ibpd);
+       int err = -EINVAL;
+
+       if (udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, udata not cleared\n");
+               goto err_out;
+       }
+
+       err = efa_com_alloc_pd(&dev->edev, &hw_pd);
+       if (err)
+               goto err_out;
+
+       pd->pdn = hw_pd.pdn;
+       resp.pdn = hw_pd.pdn;
+
+       if (udata->outlen) {
+               err = ib_copy_to_udata(udata, &resp,
+                                      min(sizeof(resp), udata->outlen));
+               if (err) {
+                       ibdev_dbg(&dev->ibdev,
+                                 "Failed to copy udata for alloc_pd\n");
+                       goto err_dealloc_pd;
+               }
+       }
+
+       ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn);
+
+       return 0;
+
+err_dealloc_pd:
+       efa_pd_dealloc(dev, hw_pd.pdn);
+err_out:
+       atomic64_inc(&dev->stats.sw_stats.alloc_pd_err);
+       return err;
+}
+
+/*
+ * Release the device protection domain behind @ibpd.
+ *
+ * This function cannot report failure (it returns void), so it must always
+ * release the PD: bailing out on unexpected input would leak the device PD
+ * without the caller ever knowing. Non-zero input in @udata is therefore only
+ * logged. @udata is also checked for NULL before it is dereferenced.
+ */
+void efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibpd->device);
+       struct efa_pd *pd = to_epd(ibpd);
+
+       if (udata && udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen))
+               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+
+       ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
+       efa_pd_dealloc(dev, pd->pdn);
+}
+
+/* Issue the destroy QP command for @qp_handle and return its result. */
+static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
+{
+       struct efa_com_destroy_qp_params destroy = {};
+
+       destroy.qp_handle = qp_handle;
+
+       return efa_com_destroy_qp(&dev->edev, &destroy);
+}
+
+/*
+ * Destroy @ibqp on the device, drop the DMA mapping of its RQ ring and free
+ * the efa_qp. Any input bytes in @udata must be zero.
+ *
+ * The RQ pages themselves are not freed here: they are referenced by an
+ * EFA_MMAP_DMA_PAGE mmap entry and released together with the ucontext's
+ * mmap entries.
+ *
+ * Returns 0 on success, -EINVAL for non-zero input, or the error of the
+ * destroy command (in which case nothing is released).
+ */
+int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibqp->pd->device);
+       struct efa_qp *qp = to_eqp(ibqp);
+       int err;
+
+       if (udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+               return -EINVAL;
+       }
+
+       ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
+       err = efa_destroy_qp_handle(dev, qp->qp_handle);
+       if (err)
+               return err;
+
+       if (!qp->rq_cpu_addr)
+               goto free_qp;
+
+       ibdev_dbg(&dev->ibdev,
+                 "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
+                 qp->rq_cpu_addr, qp->rq_size,
+                 &qp->rq_dma_addr);
+       dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
+                        DMA_TO_DEVICE);
+
+free_qp:
+       kfree(qp);
+       return 0;
+}
+
+/*
+ * Insert the mmap entries userspace needs for a new QP and record their keys
+ * in @resp: the SQ doorbell, the LLQ descriptors and, when the QP has an RQ
+ * (qp->rq_size != 0), the RQ doorbell and the RQ ring.
+ *
+ * On entry the *_offset fields of @resp hold the offsets reported by the
+ * device; each one is first used to compute the entry address and is then
+ * reduced to the offset within its page.
+ *
+ * Returns 0 on success or -ENOMEM as soon as one insertion fails. Entries
+ * inserted before the failure are left in place.
+ */
+static int qp_mmap_entries_setup(struct efa_qp *qp,
+                                struct efa_dev *dev,
+                                struct efa_ucontext *ucontext,
+                                struct efa_com_create_qp_params *params,
+                                struct efa_ibv_create_qp_resp *resp)
+{
+       /*
+        * Once an entry is inserted it might be mmapped, hence cannot be
+        * cleaned up until dealloc_ucontext.
+        */
+       resp->sq_db_mmap_key =
+               mmap_entry_insert(dev, ucontext, qp,
+                                 dev->db_bar_addr + resp->sq_db_offset,
+                                 PAGE_SIZE, EFA_MMAP_IO_NC);
+       if (resp->sq_db_mmap_key == EFA_MMAP_INVALID)
+               return -ENOMEM;
+
+       /* Keep only the offset inside the page for userspace */
+       resp->sq_db_offset &= ~PAGE_MASK;
+
+       /* Length covers the ring plus its leading in-page offset, page aligned */
+       resp->llq_desc_mmap_key =
+               mmap_entry_insert(dev, ucontext, qp,
+                                 dev->mem_bar_addr + resp->llq_desc_offset,
+                                 PAGE_ALIGN(params->sq_ring_size_in_bytes +
+                                            (resp->llq_desc_offset & ~PAGE_MASK)),
+                                 EFA_MMAP_IO_WC);
+       if (resp->llq_desc_mmap_key == EFA_MMAP_INVALID)
+               return -ENOMEM;
+
+       resp->llq_desc_offset &= ~PAGE_MASK;
+
+       if (qp->rq_size) {
+               resp->rq_db_mmap_key =
+                       mmap_entry_insert(dev, ucontext, qp,
+                                         dev->db_bar_addr + resp->rq_db_offset,
+                                         PAGE_SIZE, EFA_MMAP_IO_NC);
+               if (resp->rq_db_mmap_key == EFA_MMAP_INVALID)
+                       return -ENOMEM;
+
+               resp->rq_db_offset &= ~PAGE_MASK;
+
+               /* The RQ ring is kernel memory, hence a DMA page entry */
+               resp->rq_mmap_key =
+                       mmap_entry_insert(dev, ucontext, qp,
+                                         virt_to_phys(qp->rq_cpu_addr),
+                                         qp->rq_size, EFA_MMAP_DMA_PAGE);
+               if (resp->rq_mmap_key == EFA_MMAP_INVALID)
+                       return -ENOMEM;
+
+               resp->rq_mmap_size = qp->rq_size;
+       }
+
+       return 0;
+}
+
+/*
+ * Check the capabilities requested in @init_attr against the device limits.
+ * Returns 0 when all of them fit, -EINVAL for the first one that does not.
+ */
+static int efa_qp_validate_cap(struct efa_dev *dev,
+                              struct ib_qp_init_attr *init_attr)
+{
+       const struct efa_com_get_device_attr_result *limits = &dev->dev_attr;
+       const struct ib_qp_cap *req = &init_attr->cap;
+       struct ib_device *ibdev = &dev->ibdev;
+
+       if (req->max_send_wr > limits->max_sq_depth) {
+               ibdev_dbg(ibdev,
+                         "qp: requested send wr[%u] exceeds the max[%u]\n",
+                         req->max_send_wr, limits->max_sq_depth);
+               return -EINVAL;
+       }
+
+       if (req->max_recv_wr > limits->max_rq_depth) {
+               ibdev_dbg(ibdev,
+                         "qp: requested receive wr[%u] exceeds the max[%u]\n",
+                         req->max_recv_wr, limits->max_rq_depth);
+               return -EINVAL;
+       }
+
+       if (req->max_send_sge > limits->max_sq_sge) {
+               ibdev_dbg(ibdev,
+                         "qp: requested sge send[%u] exceeds the max[%u]\n",
+                         req->max_send_sge, limits->max_sq_sge);
+               return -EINVAL;
+       }
+
+       if (req->max_recv_sge > limits->max_rq_sge) {
+               ibdev_dbg(ibdev,
+                         "qp: requested sge recv[%u] exceeds the max[%u]\n",
+                         req->max_recv_sge, limits->max_rq_sge);
+               return -EINVAL;
+       }
+
+       if (req->max_inline_data > limits->inline_buf_size) {
+               ibdev_dbg(ibdev,
+                         "qp: requested inline data[%u] exceeds the max[%u]\n",
+                         req->max_inline_data, limits->inline_buf_size);
+               return -EINVAL;
+       }
+
+       return 0;
+}
+
+/*
+ * Reject QP creation attributes the driver does not handle: only the DRIVER
+ * and UD QP types are accepted, and neither an SRQ nor any create flag may be
+ * given. Returns 0 or -EOPNOTSUPP.
+ */
+static int efa_qp_validate_attr(struct efa_dev *dev,
+                               struct ib_qp_init_attr *init_attr)
+{
+       switch (init_attr->qp_type) {
+       case IB_QPT_DRIVER:
+       case IB_QPT_UD:
+               break;
+       default:
+               ibdev_dbg(&dev->ibdev,
+                         "Unsupported qp type %d\n", init_attr->qp_type);
+               return -EOPNOTSUPP;
+       }
+
+       if (init_attr->srq) {
+               ibdev_dbg(&dev->ibdev, "SRQ is not supported\n");
+               return -EOPNOTSUPP;
+       }
+
+       if (init_attr->create_flags) {
+               ibdev_dbg(&dev->ibdev, "Unsupported create flags\n");
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
+/*
+ * Create a QP for a user context.
+ *
+ * Steps: validate the requested capabilities and attributes, validate and
+ * copy the user command, allocate the efa_qp, allocate and DMA-map the RQ
+ * ring when a ring size was requested, create the QP on the device, insert
+ * the mmap entries and finally copy the response to userspace.
+ *
+ * Returns the new ib_qp, or an ERR_PTR() carrying a negative errno. Every
+ * failure is counted in the create_qp_err software statistic.
+ */
+struct ib_qp *efa_create_qp(struct ib_pd *ibpd,
+                           struct ib_qp_init_attr *init_attr,
+                           struct ib_udata *udata)
+{
+       struct efa_com_create_qp_params create_qp_params = {};
+       struct efa_com_create_qp_result create_qp_resp;
+       struct efa_dev *dev = to_edev(ibpd->device);
+       struct efa_ibv_create_qp_resp resp = {};
+       struct efa_ibv_create_qp cmd = {};
+       bool rq_entry_inserted = false;
+       struct efa_ucontext *ucontext;
+       struct efa_qp *qp;
+       int err;
+
+       ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
+                                            ibucontext);
+
+       err = efa_qp_validate_cap(dev, init_attr);
+       if (err)
+               goto err_out;
+
+       err = efa_qp_validate_attr(dev, init_attr);
+       if (err)
+               goto err_out;
+
+       /* The command must be long enough to carry driver_qp_type */
+       if (!field_avail(cmd, driver_qp_type, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, no input udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       /* Bytes beyond the command known to this driver must be zero */
+       if (udata->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(udata, sizeof(cmd),
+                                udata->inlen - sizeof(cmd))) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, unknown fields in udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       err = ib_copy_from_udata(&cmd, udata,
+                                min(sizeof(cmd), udata->inlen));
+       if (err) {
+               ibdev_dbg(&dev->ibdev,
+                         "Cannot copy udata for create_qp\n");
+               goto err_out;
+       }
+
+       if (cmd.comp_mask) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, unknown fields in udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+       if (!qp) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+
+       create_qp_params.uarn = ucontext->uarn;
+       create_qp_params.pd = to_epd(ibpd)->pdn;
+
+       /* A non-UD QP is only accepted as the SRD driver QP type */
+       if (init_attr->qp_type == IB_QPT_UD) {
+               create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD;
+       } else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) {
+               create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD;
+       } else {
+               ibdev_dbg(&dev->ibdev,
+                         "Unsupported qp type %d driver qp type %d\n",
+                         init_attr->qp_type, cmd.driver_qp_type);
+               err = -EOPNOTSUPP;
+               goto err_free_qp;
+       }
+
+       ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n",
+                 init_attr->qp_type, cmd.driver_qp_type);
+       create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx;
+       create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx;
+       create_qp_params.sq_depth = init_attr->cap.max_send_wr;
+       create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size;
+
+       create_qp_params.rq_depth = init_attr->cap.max_recv_wr;
+       create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size;
+       /* The RQ ring lives in kernel pages that are DMA-mapped for the device */
+       qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes);
+       if (qp->rq_size) {
+               qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr,
+                                                   qp->rq_size, DMA_TO_DEVICE);
+               if (!qp->rq_cpu_addr) {
+                       err = -ENOMEM;
+                       goto err_free_qp;
+               }
+
+               ibdev_dbg(&dev->ibdev,
+                         "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n",
+                         qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr);
+               create_qp_params.rq_base_addr = qp->rq_dma_addr;
+       }
+
+       err = efa_com_create_qp(&dev->edev, &create_qp_params,
+                               &create_qp_resp);
+       if (err)
+               goto err_free_mapped;
+
+       resp.sq_db_offset = create_qp_resp.sq_db_offset;
+       resp.rq_db_offset = create_qp_resp.rq_db_offset;
+       resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset;
+       resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx;
+       resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx;
+
+       err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params,
+                                   &resp);
+       if (err)
+               goto err_destroy_qp;
+
+       /*
+        * From here on the RQ pages are referenced by an mmap entry, so the
+        * error path below must no longer free them itself.
+        */
+       rq_entry_inserted = true;
+       qp->qp_handle = create_qp_resp.qp_handle;
+       qp->ibqp.qp_num = create_qp_resp.qp_num;
+       qp->ibqp.qp_type = init_attr->qp_type;
+       qp->max_send_wr = init_attr->cap.max_send_wr;
+       qp->max_recv_wr = init_attr->cap.max_recv_wr;
+       qp->max_send_sge = init_attr->cap.max_send_sge;
+       qp->max_recv_sge = init_attr->cap.max_recv_sge;
+       qp->max_inline_data = init_attr->cap.max_inline_data;
+
+       if (udata->outlen) {
+               err = ib_copy_to_udata(udata, &resp,
+                                      min(sizeof(resp), udata->outlen));
+               if (err) {
+                       ibdev_dbg(&dev->ibdev,
+                                 "Failed to copy udata for qp[%u]\n",
+                                 create_qp_resp.qp_num);
+                       goto err_destroy_qp;
+               }
+       }
+
+       ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num);
+
+       return &qp->ibqp;
+
+err_destroy_qp:
+       efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
+err_free_mapped:
+       if (qp->rq_size) {
+               dma_unmap_single(&dev->pdev->dev, qp->rq_dma_addr, qp->rq_size,
+                                DMA_TO_DEVICE);
+               /* Pages owned by an mmap entry are freed with the ucontext */
+               if (!rq_entry_inserted)
+                       free_pages_exact(qp->rq_cpu_addr, qp->rq_size);
+       }
+err_free_qp:
+       kfree(qp);
+err_out:
+       atomic64_inc(&dev->stats.sw_stats.create_qp_err);
+       return ERR_PTR(err);
+}
+
+static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp,
+                                 struct ib_qp_attr *qp_attr, int qp_attr_mask,
+                                 enum ib_qp_state cur_state,
+                                 enum ib_qp_state new_state)
+{
+#define EFA_MODIFY_QP_SUPP_MASK \
+       (IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \
+        IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN)
+
+       if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) {
+               ibdev_dbg(&dev->ibdev,
+                         "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
+                         qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK);
+               return -EOPNOTSUPP;
+       }
+
+       if (!ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
+                               qp_attr_mask)) {
+               ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n");
+               return -EINVAL;
+       }
+
+       if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) {
+               ibdev_dbg(&dev->ibdev, "Can't change port num\n");
+               return -EOPNOTSUPP;
+       }
+
+       if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) {
+               ibdev_dbg(&dev->ibdev, "Can't change pkey index\n");
+               return -EOPNOTSUPP;
+       }
+
+       return 0;
+}
+
+/*
+ * Modify the attributes of a QP.
+ *
+ * @ibqp:         QP to modify
+ * @qp_attr:      new attribute values; only fields selected by @qp_attr_mask
+ *                are meaningful
+ * @qp_attr_mask: IB_QP_* bits selecting which attributes to apply
+ * @udata:        user data; any input bytes must be zero
+ *
+ * The request is validated by efa_modify_qp_validate(), translated to an
+ * efa_com_modify_qp_params and submitted with efa_com_modify_qp(). On
+ * success the cached qp->state is updated to the new state.
+ *
+ * Returns 0 on success, -EINVAL for non-cleared udata, or the error from
+ * validation / the device command.
+ */
+int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
+                 int qp_attr_mask, struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibqp->device);
+       struct efa_com_modify_qp_params params = {};
+       struct efa_qp *qp = to_eqp(ibqp);
+       enum ib_qp_state cur_state;
+       enum ib_qp_state new_state;
+       int err;
+
+       if (udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, udata not cleared\n");
+               return -EINVAL;
+       }
+
+       /*
+        * Fall back to the driver's cached state when the caller did not
+        * supply a current state, and to "no change" when no new state was
+        * supplied.
+        */
+       cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state :
+                                                    qp->state;
+       new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state;
+
+       err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state,
+                                    new_state);
+       if (err)
+               return err;
+
+       params.qp_handle = qp->qp_handle;
+
+       if (qp_attr_mask & IB_QP_STATE) {
+               params.modify_mask |= BIT(EFA_ADMIN_QP_STATE_BIT) |
+                                     BIT(EFA_ADMIN_CUR_QP_STATE_BIT);
+               /*
+                * Send the states that were just validated.
+                * qp_attr->cur_qp_state is only meaningful when
+                * IB_QP_CUR_STATE is set, so it must not be forwarded
+                * unconditionally.
+                */
+               params.cur_qp_state = cur_state;
+               params.qp_state = new_state;
+       }
+
+       if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
+               params.modify_mask |=
+                       BIT(EFA_ADMIN_SQ_DRAINED_ASYNC_NOTIFY_BIT);
+               params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
+       }
+
+       if (qp_attr_mask & IB_QP_QKEY) {
+               params.modify_mask |= BIT(EFA_ADMIN_QKEY_BIT);
+               params.qkey = qp_attr->qkey;
+       }
+
+       if (qp_attr_mask & IB_QP_SQ_PSN) {
+               params.modify_mask |= BIT(EFA_ADMIN_SQ_PSN_BIT);
+               params.sq_psn = qp_attr->sq_psn;
+       }
+
+       err = efa_com_modify_qp(&dev->edev, &params);
+       if (err)
+               return err;
+
+       qp->state = new_state;
+
+       return 0;
+}
+
+/*
+ * Issue the destroy-CQ command for the CQ identified by @cq_idx and return
+ * the result of efa_com_destroy_cq().
+ */
+static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
+{
+       struct efa_com_destroy_cq_params destroy_params = {};
+
+       destroy_params.cq_idx = cq_idx;
+
+       return efa_com_destroy_cq(&dev->edev, &destroy_params);
+}
+
+/*
+ * Destroy a completion queue.
+ *
+ * Rejects non-cleared input udata with -EINVAL. Otherwise destroys the CQ
+ * on the device and, only if that succeeded, removes the DMA mapping of
+ * the CQ buffer and frees the efa_cq. The buffer pages themselves are not
+ * freed in this function.
+ *
+ * Returns 0 on success or the error of the destroy command.
+ */
+int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
+{
+       struct efa_cq *cq = to_ecq(ibcq);
+       struct efa_dev *dev = to_edev(ibcq->device);
+       int err;
+
+       if (udata->inlen) {
+               if (!ib_is_udata_cleared(udata, 0, udata->inlen)) {
+                       ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+                       return -EINVAL;
+               }
+       }
+
+       ibdev_dbg(&dev->ibdev,
+                 "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
+                 cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
+
+       err = efa_destroy_cq_idx(dev, cq->cq_idx);
+       if (!err) {
+               /* The device no longer uses the buffer; drop the mapping. */
+               dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
+                                DMA_FROM_DEVICE);
+               kfree(cq);
+       }
+
+       return err;
+}
+
+/*
+ * Publish the CQ buffer to user space: record its size in @resp and insert
+ * an EFA_MMAP_DMA_PAGE mmap entry whose key is returned in @resp.
+ *
+ * Returns 0 on success, -ENOMEM when no mmap key could be obtained.
+ */
+static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
+                                struct efa_ibv_create_cq_resp *resp)
+{
+       resp->q_mmap_size = cq->size;
+       resp->q_mmap_key = mmap_entry_insert(dev, cq->ucontext, cq,
+                                            virt_to_phys(cq->cpu_addr),
+                                            cq->size, EFA_MMAP_DMA_PAGE);
+
+       return resp->q_mmap_key == EFA_MMAP_INVALID ? -ENOMEM : 0;
+}
+
+/*
+ * Create a completion queue on behalf of a user context.
+ *
+ * Validates @entries against the device maximum and the user command in
+ * @udata, allocates and DMA-maps the CQ buffer, creates the CQ on the
+ * device, inserts an mmap entry for the buffer and copies the response
+ * back to user space.
+ *
+ * Returns the new ib_cq on success or an ERR_PTR() on failure; every
+ * failure also increments the create_cq_err software counter.
+ * @vector is not used by this function.
+ */
+static struct ib_cq *do_create_cq(struct ib_device *ibdev, int entries,
+                                 int vector, struct ib_ucontext *ibucontext,
+                                 struct ib_udata *udata)
+{
+       struct efa_ibv_create_cq_resp resp = {};
+       struct efa_com_create_cq_params params;
+       struct efa_com_create_cq_result result;
+       struct efa_dev *dev = to_edev(ibdev);
+       struct efa_ibv_create_cq cmd = {};
+       bool cq_entry_inserted = false;
+       struct efa_cq *cq;
+       int err;
+
+       ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
+
+       if (entries < 1 || entries > dev->dev_attr.max_cq_depth) {
+               ibdev_dbg(ibdev,
+                         "cq: requested entries[%u] non-positive or greater than max[%u]\n",
+                         entries, dev->dev_attr.max_cq_depth);
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       /* The command must be long enough to contain num_sub_cqs. */
+       if (!field_avail(cmd, num_sub_cqs, udata->inlen)) {
+               ibdev_dbg(ibdev,
+                         "Incompatible ABI params, no input udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       /* Bytes beyond the command layout known here must be zero. */
+       if (udata->inlen > sizeof(cmd) &&
+           !ib_is_udata_cleared(udata, sizeof(cmd),
+                                udata->inlen - sizeof(cmd))) {
+               ibdev_dbg(ibdev,
+                         "Incompatible ABI params, unknown fields in udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       err = ib_copy_from_udata(&cmd, udata,
+                                min(sizeof(cmd), udata->inlen));
+       if (err) {
+               ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n");
+               goto err_out;
+       }
+
+       if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_50)) {
+               ibdev_dbg(ibdev,
+                         "Incompatible ABI params, unknown fields in udata\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       if (!cmd.cq_entry_size) {
+               ibdev_dbg(ibdev,
+                         "Invalid entry size [%u]\n", cmd.cq_entry_size);
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) {
+               ibdev_dbg(ibdev,
+                         "Invalid number of sub cqs[%u] expected[%u]\n",
+                         cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq);
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+       if (!cq) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+
+       cq->ucontext = to_eucontext(ibucontext);
+       /* Buffer holds entries for every sub CQ, rounded up to whole pages. */
+       cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
+       cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
+                                        DMA_FROM_DEVICE);
+       if (!cq->cpu_addr) {
+               err = -ENOMEM;
+               goto err_free_cq;
+       }
+
+       params.uarn = cq->ucontext->uarn;
+       params.cq_depth = entries;
+       params.dma_addr = cq->dma_addr;
+       params.entry_size_in_bytes = cmd.cq_entry_size;
+       params.num_sub_cqs = cmd.num_sub_cqs;
+       err = efa_com_create_cq(&dev->edev, &params, &result);
+       if (err)
+               goto err_free_mapped;
+
+       resp.cq_idx = result.cq_idx;
+       cq->cq_idx = result.cq_idx;
+       cq->ibcq.cqe = result.actual_depth;
+       WARN_ON_ONCE(entries != result.actual_depth);
+
+       err = cq_mmap_entries_setup(dev, cq, &resp);
+       if (err) {
+               ibdev_dbg(ibdev,
+                         "Could not setup cq[%u] mmap entries\n", cq->cq_idx);
+               goto err_destroy_cq;
+       }
+
+       /*
+        * From here on the error path no longer frees the buffer pages
+        * (presumably the inserted mmap entry now owns them; confirm
+        * against mmap_entries_remove_free()).
+        */
+       cq_entry_inserted = true;
+
+       if (udata->outlen) {
+               err = ib_copy_to_udata(udata, &resp,
+                                      min(sizeof(resp), udata->outlen));
+               if (err) {
+                       ibdev_dbg(ibdev,
+                                 "Failed to copy udata for create_cq\n");
+                       goto err_destroy_cq;
+               }
+       }
+
+       ibdev_dbg(ibdev,
+                 "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
+                 cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
+
+       return &cq->ibcq;
+
+       /* Unwind in reverse order of acquisition. */
+err_destroy_cq:
+       efa_destroy_cq_idx(dev, cq->cq_idx);
+err_free_mapped:
+       dma_unmap_single(&dev->pdev->dev, cq->dma_addr, cq->size,
+                        DMA_FROM_DEVICE);
+       if (!cq_entry_inserted)
+               free_pages_exact(cq->cpu_addr, cq->size);
+err_free_cq:
+       kfree(cq);
+err_out:
+       atomic64_inc(&dev->stats.sw_stats.create_cq_err);
+       return ERR_PTR(err);
+}
+
+/*
+ * Verbs entry point for CQ creation: resolves the driver ucontext from
+ * @udata and hands the request to do_create_cq() with the requested depth
+ * and completion vector taken from @attr.
+ */
+struct ib_cq *efa_create_cq(struct ib_device *ibdev,
+                           const struct ib_cq_init_attr *attr,
+                           struct ib_udata *udata)
+{
+       struct efa_ucontext *ucontext;
+
+       ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
+                                            ibucontext);
+
+       return do_create_cq(ibdev, attr->cqe, attr->comp_vector,
+                           &ucontext->ibucontext, udata);
+}
+
+/*
+ * Fill @page_list with one DMA address per huge page of @umem.
+ *
+ * The umem is walked one PAGE_SIZE DMA page at a time; the address of every
+ * page whose running index is a multiple of the number of pages per huge
+ * page (2^(hp_shift - PAGE_SHIFT)) is stored in the next @page_list slot.
+ * @hp_cnt is only used for the debug print. Always returns 0.
+ */
+static int umem_to_page_list(struct efa_dev *dev,
+                            struct ib_umem *umem,
+                            u64 *page_list,
+                            u32 hp_cnt,
+                            u8 hp_shift)
+{
+       u32 hp_pages = BIT(hp_shift - PAGE_SHIFT);
+       struct sg_dma_page_iter iter;
+       unsigned int pages_seen = 0;
+       unsigned int out_idx = 0;
+
+       ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
+                 hp_cnt, hp_pages);
+
+       for_each_sg_dma_page(umem->sg_head.sgl, &iter, umem->nmap, 0) {
+               /* Record only the first page of each huge page. */
+               if (!(pages_seen++ % hp_pages))
+                       page_list[out_idx++] = sg_page_iter_dma_address(&iter);
+       }
+
+       return 0;
+}
+
+/*
+ * Build a scatterlist with one PAGE_SIZE entry per page of a vmalloc'ed
+ * buffer. @buf is the start of the buffer, @page_cnt the number of pages.
+ *
+ * Returns the allocated scatterlist (to be released with kfree()), or NULL
+ * if the allocation fails or a page cannot be resolved.
+ */
+static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt)
+{
+       struct scatterlist *sgl;
+       struct page *page;
+       int idx;
+
+       sgl = kcalloc(page_cnt, sizeof(*sgl), GFP_KERNEL);
+       if (!sgl)
+               return NULL;
+
+       sg_init_table(sgl, page_cnt);
+
+       /* Advance @buf by one page worth of u64 elements per iteration. */
+       for (idx = 0; idx < page_cnt; idx++, buf += PAGE_SIZE / sizeof(*buf)) {
+               page = vmalloc_to_page(buf);
+               if (!page) {
+                       kfree(sgl);
+                       return NULL;
+               }
+
+               sg_set_page(&sgl[idx], page, PAGE_SIZE, 0);
+       }
+
+       return sgl;
+}
+
+/*
+ * Create a chunk list of physical pages dma addresses from the supplied
+ * scatter gather list.
+ *
+ * Each chunk is an EFA_CHUNK_SIZE buffer holding up to EFA_PTRS_PER_CHUNK
+ * payload addresses followed by a control entry that points at the next
+ * chunk. The chunks are DMA mapped from last to first so that the address
+ * of chunk i is known when the "next" pointer of chunk i - 1 is written.
+ *
+ * Returns 0 on success, -ENOMEM on allocation or DMA mapping failure, in
+ * which case everything allocated or mapped here has been released.
+ */
+static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
+{
+       unsigned int entry, payloads_in_sg, chunk_list_size, chunk_idx, payload_idx;
+       struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
+       int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages;
+       struct scatterlist *pages_sgl = pbl->phys.indirect.sgl;
+       int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt;
+       struct efa_com_ctrl_buff_info *ctrl_buf;
+       u64 *cur_chunk_buf, *prev_chunk_buf;
+       struct scatterlist *sg;
+       dma_addr_t dma_addr;
+       int i;
+
+       /* allocate a chunk list that consists of 4KB chunks */
+       chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK);
+
+       chunk_list->size = chunk_list_size;
+       chunk_list->chunks = kcalloc(chunk_list_size,
+                                    sizeof(*chunk_list->chunks),
+                                    GFP_KERNEL);
+       if (!chunk_list->chunks)
+               return -ENOMEM;
+
+       ibdev_dbg(&dev->ibdev,
+                 "chunk_list_size[%u] - pages[%u]\n", chunk_list_size,
+                 page_cnt);
+
+       /* allocate chunk buffers: */
+       for (i = 0; i < chunk_list_size; i++) {
+               chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL);
+               if (!chunk_list->chunks[i].buf)
+                       goto chunk_list_dealloc;
+
+               chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE;
+       }
+       /*
+        * The last chunk is only partially used.
+        * NOTE(review): when page_cnt is an exact multiple of
+        * EFA_PTRS_PER_CHUNK the modulo is 0 and the length below covers
+        * only the trailing control entry - confirm this is intended.
+        */
+       chunk_list->chunks[chunk_list_size - 1].length =
+               ((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) +
+                       EFA_CHUNK_PTR_SIZE;
+
+       /* fill the dma addresses of sg list pages to chunks: */
+       chunk_idx = 0;
+       payload_idx = 0;
+       cur_chunk_buf = chunk_list->chunks[0].buf;
+       for_each_sg(pages_sgl, sg, sg_dma_cnt, entry) {
+               payloads_in_sg = sg_dma_len(sg) >> EFA_CHUNK_PAYLOAD_SHIFT;
+               for (i = 0; i < payloads_in_sg; i++) {
+                       cur_chunk_buf[payload_idx++] =
+                               (sg_dma_address(sg) & ~(EFA_CHUNK_PAYLOAD_SIZE - 1)) +
+                               (EFA_CHUNK_PAYLOAD_SIZE * i);
+
+                       if (payload_idx == EFA_PTRS_PER_CHUNK) {
+                               chunk_idx++;
+                               cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
+                               payload_idx = 0;
+                       }
+               }
+       }
+
+       /* map chunks to dma and fill chunks next ptrs */
+       for (i = chunk_list_size - 1; i >= 0; i--) {
+               dma_addr = dma_map_single(&dev->pdev->dev,
+                                         chunk_list->chunks[i].buf,
+                                         chunk_list->chunks[i].length,
+                                         DMA_TO_DEVICE);
+               if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
+                       ibdev_err(&dev->ibdev,
+                                 "chunk[%u] dma_map_failed\n", i);
+                       goto chunk_list_unmap;
+               }
+
+               chunk_list->chunks[i].dma_addr = dma_addr;
+               ibdev_dbg(&dev->ibdev,
+                         "chunk[%u] mapped at [%pad]\n", i, &dma_addr);
+
+               if (!i)
+                       break;
+
+               /* The control entry sits right after the payload pointers. */
+               prev_chunk_buf = chunk_list->chunks[i - 1].buf;
+
+               ctrl_buf = (struct efa_com_ctrl_buff_info *)
+                               &prev_chunk_buf[EFA_PTRS_PER_CHUNK];
+               ctrl_buf->length = chunk_list->chunks[i].length;
+
+               efa_com_set_dma_addr(dma_addr,
+                                    &ctrl_buf->address.mem_addr_high,
+                                    &ctrl_buf->address.mem_addr_low);
+       }
+
+       return 0;
+
+chunk_list_unmap:
+       /*
+        * Chunk i failed to map and has no valid dma_addr; only the chunks
+        * after it (mapped in earlier iterations) need to be unmapped.
+        */
+       for (i++; i < chunk_list_size; i++) {
+               dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
+                                chunk_list->chunks[i].length, DMA_TO_DEVICE);
+       }
+chunk_list_dealloc:
+       /* Slots never allocated are NULL thanks to kcalloc(). */
+       for (i = 0; i < chunk_list_size; i++)
+               kfree(chunk_list->chunks[i].buf);
+
+       kfree(chunk_list->chunks);
+       return -ENOMEM;
+}
+
+/*
+ * Tear down a chunk list built by pbl_chunk_list_create(): unmap and free
+ * every chunk buffer, then free the chunk array itself.
+ */
+static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl)
+{
+       struct pbl_chunk_list *list = &pbl->phys.indirect.chunk_list;
+       int idx = 0;
+
+       while (idx < list->size) {
+               dma_unmap_single(&dev->pdev->dev, list->chunks[idx].dma_addr,
+                                list->chunks[idx].length, DMA_TO_DEVICE);
+               kfree(list->chunks[idx].buf);
+               idx++;
+       }
+
+       kfree(list->chunks);
+}
+
+/*
+ * Set up pbl continuous mode: DMA map the whole pbl buffer in one piece and
+ * remember the resulting address in pbl->phys.continuous.dma_addr.
+ *
+ * Returns 0 on success, -ENOMEM if the mapping fails.
+ */
+static int pbl_continuous_initialize(struct efa_dev *dev,
+                                    struct pbl_context *pbl)
+{
+       dma_addr_t mapped;
+
+       mapped = dma_map_single(&dev->pdev->dev, pbl->pbl_buf,
+                               pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
+       if (!dma_mapping_error(&dev->pdev->dev, mapped)) {
+               pbl->phys.continuous.dma_addr = mapped;
+               ibdev_dbg(&dev->ibdev,
+                         "pbl continuous - dma_addr = %pad, size[%u]\n",
+                         &mapped, pbl->pbl_buf_size_in_bytes);
+               return 0;
+       }
+
+       ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n");
+       return -ENOMEM;
+}
+
+/*
+ * Set up pbl indirect mode.
+ *
+ * The (vmalloc'ed) pbl buffer is described by a scatterlist, DMA mapped,
+ * and a chunk list is built from the DMA addresses of its pages.
+ *
+ * Returns 0 on success, -ENOMEM if the scatterlist cannot be built,
+ * -EINVAL if DMA mapping yields no entries, or the error returned by
+ * pbl_chunk_list_create(). On failure the scatterlist is unmapped (if it
+ * was mapped) and freed.
+ */
+static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl)
+{
+       u32 nr_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE);
+       struct scatterlist *sgl;
+       int mapped_cnt;
+       int err;
+
+       BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE);
+
+       sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, nr_pages);
+       if (!sgl)
+               return -ENOMEM;
+
+       mapped_cnt = dma_map_sg(&dev->pdev->dev, sgl, nr_pages, DMA_TO_DEVICE);
+       if (!mapped_cnt) {
+               kfree(sgl);
+               return -EINVAL;
+       }
+
+       /* The chunk list builder reads these three fields. */
+       pbl->phys.indirect.pbl_buf_size_in_pages = nr_pages;
+       pbl->phys.indirect.sgl = sgl;
+       pbl->phys.indirect.sg_dma_cnt = mapped_cnt;
+
+       err = pbl_chunk_list_create(dev, pbl);
+       if (err) {
+               ibdev_dbg(&dev->ibdev,
+                         "chunk_list creation failed[%d]\n", err);
+               dma_unmap_sg(&dev->pdev->dev, sgl, nr_pages, DMA_TO_DEVICE);
+               kfree(sgl);
+               return err;
+       }
+
+       ibdev_dbg(&dev->ibdev,
+                 "pbl indirect - size[%u], chunks[%u]\n",
+                 pbl->pbl_buf_size_in_bytes,
+                 pbl->phys.indirect.chunk_list.size);
+
+       return 0;
+}
+
+/*
+ * Undo pbl_indirect_initialize(): destroy the chunk list, then unmap and
+ * free the scatterlist that describes the pbl buffer.
+ */
+static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl)
+{
+       struct scatterlist *sgl = pbl->phys.indirect.sgl;
+
+       pbl_chunk_list_destroy(dev, pbl);
+
+       dma_unmap_sg(&dev->pdev->dev, sgl,
+                    pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE);
+       kfree(sgl);
+}
+
+/*
+ * Create a page buffer list from a mapped user memory region.
+ *
+ * A kzalloc() buffer is tried first (continuous mode); if that allocation
+ * fails the buffer is taken from vzalloc() instead (indirect mode). The
+ * buffer is filled with the umem page addresses and then prepared for the
+ * device according to the chosen mode.
+ *
+ * Returns 0 on success, -ENOMEM if neither allocation succeeds, or the
+ * error of the failing step, in which case the buffer has been freed.
+ */
+static int pbl_create(struct efa_dev *dev,
+                     struct pbl_context *pbl,
+                     struct ib_umem *umem,
+                     int hp_cnt,
+                     u8 hp_shift)
+{
+       int err;
+
+       pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE;
+       pbl->pbl_buf = kzalloc(pbl->pbl_buf_size_in_bytes,
+                              GFP_KERNEL | __GFP_NOWARN);
+       pbl->physically_continuous = pbl->pbl_buf ? 1 : 0;
+
+       if (!pbl->physically_continuous) {
+               /* Too large for kzalloc(); fall back to vmalloc space. */
+               pbl->pbl_buf = vzalloc(pbl->pbl_buf_size_in_bytes);
+               if (!pbl->pbl_buf)
+                       return -ENOMEM;
+       }
+
+       err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt, hp_shift);
+       if (!err) {
+               if (pbl->physically_continuous)
+                       err = pbl_continuous_initialize(dev, pbl);
+               else
+                       err = pbl_indirect_initialize(dev, pbl);
+       }
+
+       if (err) {
+               /* Release with the allocator that produced the buffer. */
+               if (pbl->physically_continuous)
+                       kfree(pbl->pbl_buf);
+               else
+                       vfree(pbl->pbl_buf);
+               return err;
+       }
+
+       ibdev_dbg(&dev->ibdev,
+                 "user_pbl_created: user_pages[%u], continuous[%u]\n",
+                 hp_cnt, pbl->physically_continuous);
+
+       return 0;
+}
+
+/*
+ * Release a pbl created by pbl_create(), using the teardown that matches
+ * the mode (continuous or indirect) it was created in.
+ */
+static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl)
+{
+       if (!pbl->physically_continuous) {
+               pbl_indirect_terminate(dev, pbl);
+               vfree(pbl->pbl_buf);
+               return;
+       }
+
+       dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr,
+                        pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
+       kfree(pbl->pbl_buf);
+}
+
+/*
+ * Describe the MR pages inline in the register-MR command: mark @params as
+ * inline and fill its inline_pbl_array from the MR's umem.
+ *
+ * Returns the result of umem_to_page_list().
+ */
+static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr,
+                                struct efa_com_reg_mr_params *params)
+{
+       int err;
+
+       params->inline_pbl = 1;
+
+       err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array,
+                               params->page_num, params->page_shift);
+       if (!err)
+               ibdev_dbg(&dev->ibdev,
+                         "inline_pbl_array - pages[%u]\n", params->page_num);
+
+       return err;
+}
+
+/*
+ * Build an out-of-line pbl for the MR and point the register-MR command at
+ * it: at the single mapped buffer in continuous mode, or at the first chunk
+ * of the chunk list in indirect mode.
+ *
+ * Returns 0 on success or the error from pbl_create().
+ */
+static int efa_create_pbl(struct efa_dev *dev,
+                         struct pbl_context *pbl,
+                         struct efa_mr *mr,
+                         struct efa_com_reg_mr_params *params)
+{
+       int err = pbl_create(dev, pbl, mr->umem, params->page_num,
+                            params->page_shift);
+
+       if (err) {
+               ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err);
+               return err;
+       }
+
+       params->inline_pbl = 0;
+
+       if (!pbl->physically_continuous) {
+               params->indirect = 1;
+               params->pbl.pbl.length =
+                       pbl->phys.indirect.chunk_list.chunks[0].length;
+
+               efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr,
+                                    &params->pbl.pbl.address.mem_addr_high,
+                                    &params->pbl.pbl.address.mem_addr_low);
+               return 0;
+       }
+
+       params->indirect = 0;
+       params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes;
+
+       efa_com_set_dma_addr(pbl->phys.continuous.dma_addr,
+                            &params->pbl.pbl.address.mem_addr_high,
+                            &params->pbl.pbl.address.mem_addr_low);
+
+       return 0;
+}
+
+/*
+ * Compute the largest page shift that can describe @umem and the resulting
+ * page counts.
+ *
+ * @umem:           DMA mapped user memory
+ * @addr:           start address of the region
+ * @max_page_shift: upper bound for the returned shift; 0 means no bound
+ * @count:          out: number of PAGE_SIZE pages covered by the SG list
+ * @shift:          out: PAGE_SHIFT + m, where m is the chosen order
+ * @ncont:          out: number of 2^m-page blocks needed to cover @count
+ *
+ * m starts as the index of the lowest set bit of the start page number
+ * (optionally capped by @max_page_shift) and is lowered whenever a
+ * discontiguous SG segment begins at a page frame or running offset that
+ * is not 2^m aligned.
+ */
+static void efa_cont_pages(struct ib_umem *umem, u64 addr,
+                          unsigned long max_page_shift,
+                          int *count, u8 *shift, u32 *ncont)
+{
+       struct scatterlist *sg;
+       u64 base = ~0, p = 0;
+       unsigned long tmp;
+       unsigned long m;
+       u64 len, pfn;
+       int i = 0;
+       int entry;
+
+       addr = addr >> PAGE_SHIFT;
+       tmp = (unsigned long)addr;
+       m = find_first_bit(&tmp, BITS_PER_LONG);
+       if (max_page_shift)
+               m = min_t(unsigned long, max_page_shift - PAGE_SHIFT, m);
+
+       for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+               len = DIV_ROUND_UP(sg_dma_len(sg), PAGE_SIZE);
+               pfn = sg_dma_address(sg) >> PAGE_SHIFT;
+               /* base + p is the pfn that would continue the current run. */
+               if (base + p != pfn) {
+                       /*
+                        * If either the offset or the new
+                        * base are unaligned update m
+                        */
+                       tmp = (unsigned long)(pfn | p);
+                       if (!IS_ALIGNED(tmp, 1 << m))
+                               m = find_first_bit(&tmp, BITS_PER_LONG);
+
+                       base = pfn;
+                       p = 0;
+               }
+
+               p += len;
+               i += len;
+       }
+
+       if (i) {
+               /* Never use a block larger than the region itself. */
+               m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
+               *ncont = DIV_ROUND_UP(i, (1 << m));
+       } else {
+               m = 0;
+               *ncont = 0;
+       }
+
+       *shift = PAGE_SHIFT + m;
+       *count = i;
+}
+
+/*
+ * Register a user memory region.
+ *
+ * @ibpd:         protection domain the MR belongs to
+ * @start:        user virtual address of the region
+ * @length:       length of the region in bytes
+ * @virt_addr:    I/O virtual address the device uses for the region
+ * @access_flags: requested access; must be within EFA_SUPPORTED_ACCESS_FLAGS
+ * @udata:        user data; any input bytes must be zero
+ *
+ * The user memory is pinned with ib_umem_get(), its page layout is computed
+ * by efa_cont_pages() and passed to the device either inline in the command
+ * (when it fits in inline_pbl_array) or through a separately built pbl,
+ * which is destroyed again as soon as the register command has returned.
+ *
+ * Returns the new ib_mr on success or an ERR_PTR() on failure; every
+ * failure also increments the reg_mr_err software counter.
+ */
+struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
+                        u64 virt_addr, int access_flags,
+                        struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibpd->device);
+       struct efa_com_reg_mr_params params = {};
+       struct efa_com_reg_mr_result result = {};
+       unsigned long max_page_shift;
+       struct pbl_context pbl;
+       struct efa_mr *mr;
+       int inline_size;
+       int npages;
+       int err;
+
+       /*
+        * The whole input buffer must be cleared: check udata->inlen bytes,
+        * not sizeof(udata->inlen), which is merely the size of the length
+        * field.
+        */
+       if (udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Incompatible ABI params, udata not cleared\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       if (access_flags & ~EFA_SUPPORTED_ACCESS_FLAGS) {
+               ibdev_dbg(&dev->ibdev,
+                         "Unsupported access flags[%#x], supported[%#x]\n",
+                         access_flags, EFA_SUPPORTED_ACCESS_FLAGS);
+               err = -EOPNOTSUPP;
+               goto err_out;
+       }
+
+       mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+       if (!mr) {
+               err = -ENOMEM;
+               goto err_out;
+       }
+
+       mr->umem = ib_umem_get(udata, start, length, access_flags, 0);
+       if (IS_ERR(mr->umem)) {
+               err = PTR_ERR(mr->umem);
+               ibdev_dbg(&dev->ibdev,
+                         "Failed to pin and map user space memory[%d]\n", err);
+               goto err_free;
+       }
+
+       params.pd = to_epd(ibpd)->pdn;
+       params.iova = virt_addr;
+       params.mr_length_in_bytes = length;
+       /* Only the lowest access flag bit is forwarded to the device. */
+       params.permissions = access_flags & 0x1;
+       max_page_shift = fls64(dev->dev_attr.page_size_cap);
+
+       efa_cont_pages(mr->umem, start, max_page_shift, &npages,
+                      &params.page_shift, &params.page_num);
+       ibdev_dbg(&dev->ibdev,
+                 "start %#llx length %#llx npages %d params.page_shift %u params.page_num %u\n",
+                 start, length, npages, params.page_shift, params.page_num);
+
+       inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
+       if (params.page_num <= inline_size) {
+               err = efa_create_inline_pbl(dev, mr, &params);
+               if (err)
+                       goto err_unmap;
+
+               err = efa_com_register_mr(&dev->edev, &params, &result);
+               if (err)
+                       goto err_unmap;
+       } else {
+               err = efa_create_pbl(dev, &pbl, mr, &params);
+               if (err)
+                       goto err_unmap;
+
+               /* The pbl is released whether or not registration worked. */
+               err = efa_com_register_mr(&dev->edev, &params, &result);
+               pbl_destroy(dev, &pbl);
+
+               if (err)
+                       goto err_unmap;
+       }
+
+       mr->ibmr.lkey = result.l_key;
+       mr->ibmr.rkey = result.r_key;
+       mr->ibmr.length = length;
+       ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
+
+       return &mr->ibmr;
+
+err_unmap:
+       ib_umem_release(mr->umem);
+err_free:
+       kfree(mr);
+err_out:
+       atomic64_inc(&dev->stats.sw_stats.reg_mr_err);
+       return ERR_PTR(err);
+}
+
+/*
+ * Deregister a memory region.
+ *
+ * Rejects non-cleared input udata with -EINVAL. If the MR has a umem, the
+ * MR is deregistered on the device first and the umem is released only when
+ * that succeeded; a device error is returned with the MR left intact.
+ * Finally the efa_mr is freed.
+ */
+int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
+{
+       struct efa_mr *mr = to_emr(ibmr);
+       struct efa_dev *dev = to_edev(ibmr->device);
+       struct efa_com_dereg_mr_params params;
+       int err;
+
+       if (udata->inlen) {
+               if (!ib_is_udata_cleared(udata, 0, udata->inlen)) {
+                       ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+                       return -EINVAL;
+               }
+       }
+
+       ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
+
+       if (mr->umem) {
+               params.l_key = mr->ibmr.lkey;
+
+               err = efa_com_dereg_mr(&dev->edev, &params);
+               if (err)
+                       return err;
+
+               ib_umem_release(mr->umem);
+       }
+
+       kfree(mr);
+
+       return 0;
+}
+
+/*
+ * Report the immutable port attributes: the pkey and gid table lengths as
+ * returned by ib_query_port().
+ *
+ * Returns 0 on success or the error from ib_query_port().
+ */
+int efa_get_port_immutable(struct ib_device *ibdev, u8 port_num,
+                          struct ib_port_immutable *immutable)
+{
+       struct ib_port_attr port_attr;
+       int err;
+
+       err = ib_query_port(ibdev, port_num, &port_attr);
+       if (!err) {
+               immutable->pkey_tbl_len = port_attr.pkey_tbl_len;
+               immutable->gid_tbl_len = port_attr.gid_tbl_len;
+               return 0;
+       }
+
+       ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err);
+       return err;
+}
+
+/*
+ * Issue the dealloc-UAR command for @uarn and return the result of
+ * efa_com_dealloc_uar().
+ */
+static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn)
+{
+       struct efa_com_dealloc_uar_params dealloc_params = {};
+
+       dealloc_params.uarn = uarn;
+
+       return efa_com_dealloc_uar(&dev->edev, &dealloc_params);
+}
+
+/*
+ * Initialise a user context: allocate a UAR on the device, initialise the
+ * context's mmap xarray and report device limits and supported udata
+ * commands to user space.
+ *
+ * Returns 0 on success. On failure the UAR (if it was allocated) is
+ * released, the alloc_ucontext_err software counter is incremented and the
+ * error is returned.
+ */
+int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibucontext->device);
+       struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+       struct efa_ibv_alloc_ucontext_resp resp = {};
+       struct efa_com_alloc_uar_result result;
+       int err;
+
+       /*
+        * Request fields unknown to the driver are tolerated; the response
+        * acknowledges the input fields that were understood.
+        */
+
+       err = efa_com_alloc_uar(&dev->edev, &result);
+       if (err)
+               goto fail;
+
+       ucontext->uarn = result.uarn;
+       xa_init(&ucontext->mmap_xa);
+
+       resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE |
+                                    EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
+       resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
+       resp.inline_buf_size = dev->dev_attr.inline_buf_size;
+       resp.max_llq_size = dev->dev_attr.max_llq_size;
+
+       /* Nothing to copy back when no output buffer was supplied. */
+       if (!udata || !udata->outlen)
+               return 0;
+
+       err = ib_copy_to_udata(udata, &resp,
+                              min(sizeof(resp), udata->outlen));
+       if (!err)
+               return 0;
+
+       efa_dealloc_uar(dev, result.uarn);
+fail:
+       atomic64_inc(&dev->stats.sw_stats.alloc_ucontext_err);
+       return err;
+}
+
+/*
+ * Tear down a user context: remove and free its mmap entries, then release
+ * the UAR that was allocated for it.
+ */
+void efa_dealloc_ucontext(struct ib_ucontext *ibucontext)
+{
+       struct efa_dev *dev = to_edev(ibucontext->device);
+       struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+
+       mmap_entries_remove_free(dev, ucontext);
+
+       efa_dealloc_uar(dev, ucontext->uarn);
+}
+
+/*
+ * Map the object registered under @key into @vma.
+ *
+ * The mmap entry is looked up with mmap_entry_get() using @key and @length.
+ * Depending on the entry's mmap_flag the range is mapped as non-cached I/O,
+ * as write-combined I/O, or by inserting its pages one by one.
+ *
+ * Returns 0 on success, -EINVAL when no entry matches or the flag is
+ * unknown, or the error from the mapping helper.
+ */
+static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
+                     struct vm_area_struct *vma, u64 key, u64 length)
+{
+       struct efa_mmap_entry *entry;
+       unsigned long va;
+       u64 pfn;
+       int err;
+
+       entry = mmap_entry_get(dev, ucontext, key, length);
+       if (!entry) {
+               ibdev_dbg(&dev->ibdev, "key[%#llx] does not have valid entry\n",
+                         key);
+               return -EINVAL;
+       }
+
+       ibdev_dbg(&dev->ibdev,
+                 "Mapping address[%#llx], length[%#llx], mmap_flag[%d]\n",
+                 entry->address, length, entry->mmap_flag);
+
+       pfn = entry->address >> PAGE_SHIFT;
+       switch (entry->mmap_flag) {
+       case EFA_MMAP_IO_NC:
+               err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
+                                       pgprot_noncached(vma->vm_page_prot));
+               break;
+       case EFA_MMAP_IO_WC:
+               err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn, length,
+                                       pgprot_writecombine(vma->vm_page_prot));
+               break;
+       case EFA_MMAP_DMA_PAGE:
+               /* Insert consecutive pfns page by page; stop at first error. */
+               for (va = vma->vm_start; va < vma->vm_end;
+                    va += PAGE_SIZE, pfn++) {
+                       err = vm_insert_page(vma, va, pfn_to_page(pfn));
+                       if (err)
+                               break;
+               }
+               break;
+       default:
+               err = -EINVAL;
+       }
+
+       if (err)
+               ibdev_dbg(
+                       &dev->ibdev,
+                       "Couldn't mmap address[%#llx] length[%#llx] mmap_flag[%d] err[%d]\n",
+                       entry->address, length, entry->mmap_flag, err);
+
+       return err;
+}
+
+/*
+ * mmap entry point for an EFA user context.
+ *
+ * Validates the VMA and then delegates to __efa_mmap(). The lookup key is
+ * the VMA page offset converted to a byte offset (vm_pgoff << PAGE_SHIFT)
+ * and the length is the size of the VMA.
+ *
+ * Returns -EINVAL if the length is not a multiple of PAGE_SIZE or the
+ * mapping is not VM_SHARED, -EPERM if VM_EXEC is requested, otherwise the
+ * result of __efa_mmap().
+ */
+int efa_mmap(struct ib_ucontext *ibucontext,
+            struct vm_area_struct *vma)
+{
+       struct efa_ucontext *ucontext = to_eucontext(ibucontext);
+       struct efa_dev *dev = to_edev(ibucontext->device);
+       u64 length = vma->vm_end - vma->vm_start;
+       u64 key = vma->vm_pgoff << PAGE_SHIFT;
+
+       ibdev_dbg(&dev->ibdev,
+                 "start %#lx, end %#lx, length = %#llx, key = %#llx\n",
+                 vma->vm_start, vma->vm_end, length, key);
+
+       if (length % PAGE_SIZE != 0 || !(vma->vm_flags & VM_SHARED)) {
+               ibdev_dbg(&dev->ibdev,
+                         "length[%#llx] is not page size aligned[%#lx] or VM_SHARED is not set [%#lx]\n",
+                         length, PAGE_SIZE, vma->vm_flags);
+               return -EINVAL;
+       }
+
+       if (vma->vm_flags & VM_EXEC) {
+               ibdev_dbg(&dev->ibdev, "Mapping executable pages is not permitted\n");
+               return -EPERM;
+       }
+       /*
+        * Also drop VM_MAYEXEC; presumably so the mapping cannot be made
+        * executable later on - confirm.
+        */
+       vma->vm_flags &= ~VM_MAYEXEC;
+
+       return __efa_mmap(dev, ucontext, vma, key, length);
+}
+
+/*
+ * Issue the destroy-AH command for @ah.
+ *
+ * The command parameters are the handle stored in ah->ah and the pdn of
+ * the PD the AH belongs to. Returns the result of efa_com_destroy_ah().
+ */
+static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
+{
+       struct efa_com_destroy_ah_params params = {
+               .ah = ah->ah,
+               .pdn = to_epd(ah->ibah.pd)->pdn,
+       };
+
+       return efa_com_destroy_ah(&dev->edev, &params);
+}
+
+/*
+ * Create an address handle.
+ *
+ * @ibah:    AH object to fill in; its PD supplies the pdn
+ * @ah_attr: attributes; only grh.dgid.raw is read here, as the destination
+ *           address
+ * @flags:   must contain RDMA_CREATE_AH_SLEEPABLE
+ * @udata:   user data; any input bytes must be zero, and the response is
+ *           copied out when udata->outlen is non-zero
+ *
+ * Returns 0 on success, -EOPNOTSUPP when called without
+ * RDMA_CREATE_AH_SLEEPABLE, -EINVAL when the input udata is not cleared,
+ * or the error from efa_com_create_ah() / ib_copy_to_udata(). Every
+ * failure path increments the create_ah_err software counter.
+ */
+int efa_create_ah(struct ib_ah *ibah,
+                 struct rdma_ah_attr *ah_attr,
+                 u32 flags,
+                 struct ib_udata *udata)
+{
+       struct efa_dev *dev = to_edev(ibah->device);
+       struct efa_com_create_ah_params params = {};
+       struct efa_ibv_create_ah_resp resp = {};
+       struct efa_com_create_ah_result result;
+       struct efa_ah *ah = to_eah(ibah);
+       int err;
+
+       if (!(flags & RDMA_CREATE_AH_SLEEPABLE)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Create address handle is not supported in atomic context\n");
+               err = -EOPNOTSUPP;
+               goto err_out;
+       }
+
+       /* No input fields are consumed here; non-zero input is rejected */
+       if (udata->inlen &&
+           !ib_is_udata_cleared(udata, 0, udata->inlen)) {
+               ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
+               err = -EINVAL;
+               goto err_out;
+       }
+
+       memcpy(params.dest_addr, ah_attr->grh.dgid.raw,
+              sizeof(params.dest_addr));
+       params.pdn = to_epd(ibah->pd)->pdn;
+       err = efa_com_create_ah(&dev->edev, &params, &result);
+       if (err)
+               goto err_out;
+
+       /* Record the destination id and the returned handle in the AH */
+       memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id));
+       ah->ah = result.ah;
+
+       resp.efa_address_handle = result.ah;
+
+       if (udata->outlen) {
+               /* Copy no more than the caller's output buffer can hold */
+               err = ib_copy_to_udata(udata, &resp,
+                                      min(sizeof(resp), udata->outlen));
+               if (err) {
+                       ibdev_dbg(&dev->ibdev,
+                                 "Failed to copy udata for create_ah response\n");
+                       goto err_destroy_ah;
+               }
+       }
+       ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah);
+
+       return 0;
+
+err_destroy_ah:
+       /* Undo the create command; the result of the destroy is ignored */
+       efa_ah_destroy(dev, ah);
+err_out:
+       atomic64_inc(&dev->stats.sw_stats.create_ah_err);
+       return err;
+}
+
+/*
+ * Destroy an address handle.
+ *
+ * The destroy command is only issued when @flags contains
+ * RDMA_DESTROY_AH_SLEEPABLE. Otherwise the function logs a debug message
+ * and returns without calling efa_ah_destroy(). The return value of
+ * efa_ah_destroy() is not propagated.
+ */
+void efa_destroy_ah(struct ib_ah *ibah, u32 flags)
+{
+       struct efa_dev *dev = to_edev(ibah->pd->device);
+       struct efa_ah *ah = to_eah(ibah);
+
+       ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah);
+
+       if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
+               ibdev_dbg(&dev->ibdev,
+                         "Destroy address handle is not supported in atomic context\n");
+               return;
+       }
+
+       efa_ah_destroy(dev, ah);
+}
+
+/*
+ * Report the link layer of a port. Both arguments are unused: the
+ * function returns IB_LINK_LAYER_UNSPECIFIED for every device and port.
+ */
+enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
+                                        u8 port_num)
+{
+       return IB_LINK_LAYER_UNSPECIFIED;
+}
+
index addefae..310105d 100644 (file)
@@ -4104,6 +4104,9 @@ def_access_ibp_counter(seq_naks);
 
 static struct cntr_entry dev_cntrs[DEV_CNTR_LAST] = {
 [C_RCV_OVF] = RXE32_DEV_CNTR_ELEM(RcvOverflow, RCV_BUF_OVFL_CNT, CNTR_SYNTH),
+[C_RX_LEN_ERR] = RXE32_DEV_CNTR_ELEM(RxLenErr, RCV_LENGTH_ERR_CNT, CNTR_SYNTH),
+[C_RX_ICRC_ERR] = RXE32_DEV_CNTR_ELEM(RxICrcErr, RCV_ICRC_ERR_CNT, CNTR_SYNTH),
+[C_RX_EBP] = RXE32_DEV_CNTR_ELEM(RxEbpCnt, RCV_EBP_CNT, CNTR_SYNTH),
 [C_RX_TID_FULL] = RXE32_DEV_CNTR_ELEM(RxTIDFullEr, RCV_TID_FULL_ERR_CNT,
                        CNTR_NORMAL),
 [C_RX_TID_INVALID] = RXE32_DEV_CNTR_ELEM(RxTIDInvalid, RCV_TID_VALID_ERR_CNT,
@@ -13294,15 +13297,18 @@ static int set_up_context_variables(struct hfi1_devdata *dd)
        /*
         * The RMT entries are currently allocated as shown below:
         * 1. QOS (0 to 128 entries);
-        * 2. FECN for PSM (num_user_contexts + num_vnic_contexts);
+        * 2. FECN (num_kernel_context - 1 + num_user_contexts +
+        *    num_vnic_contexts);
         * 3. VNIC (num_vnic_contexts).
-        * It should be noted that PSM FECN oversubscribe num_vnic_contexts
+        * It should be noted that FECN oversubscribe num_vnic_contexts
         * entries of RMT because both VNIC and PSM could allocate any receive
         * context between dd->first_dyn_alloc_text and dd->num_rcv_contexts,
         * and PSM FECN must reserve an RMT entry for each possible PSM receive
         * context.
         */
        rmt_count = qos_rmt_entries(dd, NULL, NULL) + (num_vnic_contexts * 2);
+       if (HFI1_CAP_IS_KSET(TID_RDMA))
+               rmt_count += num_kernel_contexts - 1;
        if (rmt_count + n_usr_ctxts > NUM_MAP_ENTRIES) {
                user_rmt_reduced = NUM_MAP_ENTRIES - rmt_count;
                dd_dev_err(dd,
@@ -14285,37 +14291,43 @@ bail:
        init_qpmap_table(dd, FIRST_KERNEL_KCTXT, dd->n_krcv_queues - 1);
 }
 
-static void init_user_fecn_handling(struct hfi1_devdata *dd,
-                                   struct rsm_map_table *rmt)
+static void init_fecn_handling(struct hfi1_devdata *dd,
+                              struct rsm_map_table *rmt)
 {
        struct rsm_rule_data rrd;
        u64 reg;
-       int i, idx, regoff, regidx;
+       int i, idx, regoff, regidx, start;
        u8 offset;
        u32 total_cnt;
 
+       if (HFI1_CAP_IS_KSET(TID_RDMA))
+               /* Exclude context 0 */
+               start = 1;
+       else
+               start = dd->first_dyn_alloc_ctxt;
+
+       total_cnt = dd->num_rcv_contexts - start;
+
        /* there needs to be enough room in the map table */
-       total_cnt = dd->num_rcv_contexts - dd->first_dyn_alloc_ctxt;
        if (rmt->used + total_cnt >= NUM_MAP_ENTRIES) {
-               dd_dev_err(dd, "User FECN handling disabled - too many user contexts allocated\n");
+               dd_dev_err(dd, "FECN handling disabled - too many contexts allocated\n");
                return;
        }
 
        /*
         * RSM will extract the destination context as an index into the
         * map table.  The destination contexts are a sequential block
-        * in the range first_dyn_alloc_ctxt...num_rcv_contexts-1 (inclusive).
+        * in the range start...num_rcv_contexts-1 (inclusive).
         * Map entries are accessed as offset + extracted value.  Adjust
         * the added offset so this sequence can be placed anywhere in
         * the table - as long as the entries themselves do not wrap.
         * There are only enough bits in offset for the table size, so
         * start with that to allow for a "negative" offset.
         */
-       offset = (u8)(NUM_MAP_ENTRIES + (int)rmt->used -
-                                               (int)dd->first_dyn_alloc_ctxt);
+       offset = (u8)(NUM_MAP_ENTRIES + rmt->used - start);
 
-       for (i = dd->first_dyn_alloc_ctxt, idx = rmt->used;
-                               i < dd->num_rcv_contexts; i++, idx++) {
+       for (i = start, idx = rmt->used; i < dd->num_rcv_contexts;
+            i++, idx++) {
                /* replace with identity mapping */
                regoff = (idx % 8) * 8;
                regidx = idx / 8;
@@ -14437,7 +14449,7 @@ static void init_rxe(struct hfi1_devdata *dd)
        rmt = alloc_rsm_map_table(dd);
        /* set up QOS, including the QPN map table */
        init_qos(dd, rmt);
-       init_user_fecn_handling(dd, rmt);
+       init_fecn_handling(dd, rmt);
        complete_rsm_map_table(dd, rmt);
        /* record number of used rsm map entries for vnic */
        dd->vnic.rmt_start = rmt->used;
@@ -14663,8 +14675,8 @@ void hfi1_start_cleanup(struct hfi1_devdata *dd)
  */
 static int init_asic_data(struct hfi1_devdata *dd)
 {
-       unsigned long flags;
-       struct hfi1_devdata *tmp, *peer = NULL;
+       unsigned long index;
+       struct hfi1_devdata *peer;
        struct hfi1_asic_data *asic_data;
        int ret = 0;
 
@@ -14673,14 +14685,12 @@ static int init_asic_data(struct hfi1_devdata *dd)
        if (!asic_data)
                return -ENOMEM;
 
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
+       xa_lock_irq(&hfi1_dev_table);
        /* Find our peer device */
-       list_for_each_entry(tmp, &hfi1_dev_list, list) {
-               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(tmp)) &&
-                   dd->unit != tmp->unit) {
-                       peer = tmp;
+       xa_for_each(&hfi1_dev_table, index, peer) {
+               if ((HFI_BASE_GUID(dd) == HFI_BASE_GUID(peer)) &&
+                   dd->unit != peer->unit)
                        break;
-               }
        }
 
        if (peer) {
@@ -14692,7 +14702,7 @@ static int init_asic_data(struct hfi1_devdata *dd)
                mutex_init(&dd->asic_data->asic_resource_mutex);
        }
        dd->asic_data->dds[dd->hfi1_id] = dd; /* self back-pointer */
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       xa_unlock_irq(&hfi1_dev_table);
 
        /* first one through - set up i2c devices */
        if (!peer)
index 6c27c1c..4e6c355 100644 (file)
@@ -858,6 +858,9 @@ static inline int idx_from_vl(int vl)
 /* Per device counter indexes */
 enum {
        C_RCV_OVF = 0,
+       C_RX_LEN_ERR,
+       C_RX_ICRC_ERR,
+       C_RX_EBP,
        C_RX_TID_FULL,
        C_RX_TID_INVALID,
        C_RX_TID_FLGMS,
index c0800ea..ab3589d 100644 (file)
 #define DC_LCB_PRF_TX_FLIT_CNT (DC_LCB_CSRS + 0x000000000418)
 #define DC_LCB_STS_LINK_TRANSFER_ACTIVE (DC_LCB_CSRS + 0x000000000468)
 #define DC_LCB_STS_ROUND_TRIP_LTP_CNT (DC_LCB_CSRS + 0x0000000004B0)
+#define RCV_LENGTH_ERR_CNT 0
+#define RCV_ICRC_ERR_CNT 6
+#define RCV_EBP_CNT 9
 #define RCV_BUF_OVFL_CNT 10
 #define RCV_CONTEXT_EGR_STALL 22
 #define RCV_DATA_PKT_CNT 0
index 7310a5d..d47da7b 100644 (file)
@@ -286,7 +286,7 @@ struct diag_pkt {
 #define RHF_TID_ERR            (0x1ull << 59)
 #define RHF_LEN_ERR            (0x1ull << 60)
 #define RHF_ECC_ERR            (0x1ull << 61)
-#define RHF_VCRC_ERR           (0x1ull << 62)
+#define RHF_RESERVED           (0x1ull << 62)
 #define RHF_ICRC_ERR           (0x1ull << 63)
 
 #define RHF_ERROR_SMASK 0xffe0000000000000ull          /* bits 63:53 */
index 427ba0c..15efb4a 100644 (file)
@@ -1080,6 +1080,77 @@ static int qsfp2_debugfs_release(struct inode *in, struct file *fp)
        return __qsfp_debugfs_release(in, fp, 1);
 }
 
+/* Bit 14 of the value written to ASIC_GPIO_OUT/ASIC_GPIO_OE below */
+#define EXPROM_WRITE_ENABLE BIT_ULL(14)
+
+/*
+ * True while write protection has been disabled through exprom_wp_set().
+ * This is a single file-scope flag, not a per-device one.
+ */
+static bool exprom_wp_disabled;
+
+/*
+ * Enable or disable Expansion ROM write protection on @dd.
+ *
+ * With @disable set, EXPROM_WRITE_ENABLE is written to both ASIC_GPIO_OUT
+ * and ASIC_GPIO_OE; otherwise zero is written to both. exprom_wp_disabled
+ * is updated to match. Always returns 0.
+ */
+static int exprom_wp_set(struct hfi1_devdata *dd, bool disable)
+{
+       u64 gpio_val = 0;
+
+       if (disable) {
+               gpio_val = EXPROM_WRITE_ENABLE;
+               exprom_wp_disabled = true;
+               dd_dev_info(dd, "Disable Expansion ROM Write Protection\n");
+       } else {
+               exprom_wp_disabled = false;
+               dd_dev_info(dd, "Enable Expansion ROM Write Protection\n");
+       }
+
+       /* The same value goes to the output and the output-enable CSR */
+       write_csr(dd, ASIC_GPIO_OUT, gpio_val);
+       write_csr(dd, ASIC_GPIO_OE, gpio_val);
+
+       return 0;
+}
+
+/*
+ * Read handler for the "exprom_wp" debugfs file. No data is produced:
+ * the function ignores its arguments and always returns 0.
+ */
+static ssize_t exprom_wp_debugfs_read(struct file *file, char __user *buf,
+                                     size_t count, loff_t *ppos)
+{
+       return 0;
+}
+
+/*
+ * Write handler for the "exprom_wp" debugfs file.
+ *
+ * Accepts exactly one byte: '0' calls exprom_wp_set(dd, false) (write
+ * protection enabled), '1' calls exprom_wp_set(dd, true) (write protection
+ * disabled). @ppos is not used.
+ *
+ * Returns 1 (the byte consumed) on success, -EINVAL for any other count or
+ * character, -EFAULT if the byte cannot be read from user space.
+ */
+static ssize_t exprom_wp_debugfs_write(struct file *file,
+                                      const char __user *buf, size_t count,
+                                      loff_t *ppos)
+{
+       struct hfi1_pportdata *ppd = private2ppd(file);
+       char cdata;
+
+       if (count != 1)
+               return -EINVAL;
+       if (get_user(cdata, buf))
+               return -EFAULT;
+       if (cdata == '0')
+               exprom_wp_set(ppd->dd, false);
+       else if (cdata == '1')
+               exprom_wp_set(ppd->dd, true);
+       else
+               return -EINVAL;
+
+       return 1;
+}
+
+/* Bit 0 is set while the "exprom_wp" debugfs file is held open */
+static unsigned long exprom_in_use;
+
+/*
+ * Open handler for the "exprom_wp" debugfs file. Only one opener is
+ * allowed at a time: bit 0 of exprom_in_use is atomically tested and set,
+ * and -EBUSY is returned if it was already set. Returns 0 otherwise.
+ */
+static int exprom_wp_debugfs_open(struct inode *in, struct file *fp)
+{
+       if (test_and_set_bit(0, &exprom_in_use))
+               return -EBUSY;
+
+       return 0;
+}
+
+/*
+ * Release handler for the "exprom_wp" debugfs file. If write protection
+ * is still disabled it is re-enabled via exprom_wp_set(dd, false), then
+ * bit 0 of exprom_in_use is cleared so the file can be opened again.
+ * Always returns 0.
+ */
+static int exprom_wp_debugfs_release(struct inode *in, struct file *fp)
+{
+       struct hfi1_pportdata *ppd = private2ppd(fp);
+
+       if (exprom_wp_disabled)
+               exprom_wp_set(ppd->dd, false);
+       clear_bit(0, &exprom_in_use);
+
+       return 0;
+}
+
 #define DEBUGFS_OPS(nm, readroutine, writeroutine)     \
 { \
        .name = nm, \
@@ -1119,6 +1190,9 @@ static const struct counter_info port_cntr_ops[] = {
                     qsfp1_debugfs_open, qsfp1_debugfs_release),
        DEBUGFS_XOPS("qsfp2", qsfp2_debugfs_read, qsfp2_debugfs_write,
                     qsfp2_debugfs_open, qsfp2_debugfs_release),
+       DEBUGFS_XOPS("exprom_wp", exprom_wp_debugfs_read,
+                    exprom_wp_debugfs_write, exprom_wp_debugfs_open,
+                    exprom_wp_debugfs_release),
        DEBUGFS_OPS("asic_flags", asic_flags_read, asic_flags_write),
        DEBUGFS_OPS("dc8051_memory", dc8051_memory_read, NULL),
        DEBUGFS_OPS("lcb", debugfs_lcb_read, debugfs_lcb_write),
@@ -1302,15 +1376,15 @@ static void _driver_stats_seq_stop(struct seq_file *s, void *v)
 
 static u64 hfi1_sps_ints(void)
 {
-       unsigned long flags;
+       unsigned long index, flags;
        struct hfi1_devdata *dd;
        u64 sps_ints = 0;
 
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
+       xa_lock_irqsave(&hfi1_dev_table, flags);
+       xa_for_each(&hfi1_dev_table, index, dd) {
                sps_ints += get_all_cpu_total(dd->int_counter);
        }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       xa_unlock_irqrestore(&hfi1_dev_table, flags);
        return sps_ints;
 }
 
index 2a9d291..01aa1f1 100644 (file)
@@ -72,8 +72,6 @@
  */
 const char ib_hfi1_version[] = HFI1_DRIVER_VERSION "\n";
 
-DEFINE_SPINLOCK(hfi1_devs_lock);
-LIST_HEAD(hfi1_dev_list);
 DEFINE_MUTEX(hfi1_mutex);      /* general driver use */
 
 unsigned int hfi1_max_mtu = HFI1_DEFAULT_MAX_MTU;
@@ -175,11 +173,11 @@ int hfi1_count_active_units(void)
 {
        struct hfi1_devdata *dd;
        struct hfi1_pportdata *ppd;
-       unsigned long flags;
+       unsigned long index, flags;
        int pidx, nunits_active = 0;
 
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
+       xa_lock_irqsave(&hfi1_dev_table, flags);
+       xa_for_each(&hfi1_dev_table, index, dd) {
                if (!(dd->flags & HFI1_PRESENT) || !dd->kregbase1)
                        continue;
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -190,7 +188,7 @@ int hfi1_count_active_units(void)
                        }
                }
        }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       xa_unlock_irqrestore(&hfi1_dev_table, flags);
        return nunits_active;
 }
 
@@ -264,7 +262,7 @@ static void rcv_hdrerr(struct hfi1_ctxtdata *rcd, struct hfi1_pportdata *ppd,
            hfi1_dbg_fault_suppress_err(verbs_dev))
                return;
 
-       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+       if (packet->rhf & RHF_ICRC_ERR)
                return;
 
        if (packet->etype == RHF_RCV_TYPE_BYPASS) {
@@ -516,7 +514,9 @@ bool hfi1_process_ecn_slowpath(struct rvt_qp *qp, struct hfi1_packet *pkt,
         */
        do_cnp = prescan ||
                (opcode >= IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST &&
-                opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE);
+                opcode <= IB_OPCODE_RC_ATOMIC_ACKNOWLEDGE) ||
+               opcode == TID_OP(READ_RESP) ||
+               opcode == TID_OP(ACK);
 
        /* Call appropriate CNP handler */
        if (!ignore_fecn && do_cnp && fecn)
@@ -1581,7 +1581,7 @@ static void show_eflags_errs(struct hfi1_packet *packet)
        u32 rte = rhf_rcv_type_err(packet->rhf);
 
        dd_dev_err(rcd->dd,
-                  "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s%s] rte 0x%x\n",
+                  "receive context %d: rhf 0x%016llx, errs [ %s%s%s%s%s%s%s] rte 0x%x\n",
                   rcd->ctxt, packet->rhf,
                   packet->rhf & RHF_K_HDR_LEN_ERR ? "k_hdr_len " : "",
                   packet->rhf & RHF_DC_UNC_ERR ? "dc_unc " : "",
@@ -1589,7 +1589,6 @@ static void show_eflags_errs(struct hfi1_packet *packet)
                   packet->rhf & RHF_TID_ERR ? "tid " : "",
                   packet->rhf & RHF_LEN_ERR ? "len " : "",
                   packet->rhf & RHF_ECC_ERR ? "ecc " : "",
-                  packet->rhf & RHF_VCRC_ERR ? "vcrc " : "",
                   packet->rhf & RHF_ICRC_ERR ? "icrc " : "",
                   rte);
 }
index 1be49a0..e9d5cc8 100644 (file)
@@ -112,9 +112,6 @@ int hfi1_alloc_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
  */
 void hfi1_free_ctxt_rcv_groups(struct hfi1_ctxtdata *rcd)
 {
-       WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_full_list));
-       WARN_ON(!EXP_TID_SET_EMPTY(rcd->tid_used_list));
-
        kfree(rcd->groups);
        rcd->groups = NULL;
        hfi1_exp_tid_group_init(rcd);
index 048b5d7..b458c21 100644 (file)
@@ -54,7 +54,6 @@
 #include <linux/list.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
-#include <linux/idr.h>
 #include <linux/io.h>
 #include <linux/fs.h>
 #include <linux/completion.h>
@@ -65,6 +64,7 @@
 #include <linux/kthread.h>
 #include <linux/i2c.h>
 #include <linux/i2c-algo-bit.h>
+#include <linux/xarray.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/opa_addr.h>
 #include <linux/rhashtable.h>
@@ -1021,8 +1021,8 @@ struct hfi1_asic_data {
 struct hfi1_vnic_data {
        struct hfi1_ctxtdata *ctxt[HFI1_NUM_VNIC_CTXT];
        struct kmem_cache *txreq_cache;
+       struct xarray vesws;
        u8 num_vports;
-       struct idr vesw_idr;
        u8 rmt_start;
        u8 num_ctxt;
 };
@@ -1041,7 +1041,6 @@ struct sdma_vl_map;
 typedef int (*send_routine)(struct rvt_qp *, struct hfi1_pkt_state *, u64);
 struct hfi1_devdata {
        struct hfi1_ibdev verbs_dev;     /* must be first */
-       struct list_head list;
        /* pointers to related structs for this device */
        /* pci access data structure */
        struct pci_dev *pcidev;
@@ -1426,8 +1425,7 @@ struct hfi1_filedata {
        struct mm_struct *mm;
 };
 
-extern struct list_head hfi1_dev_list;
-extern spinlock_t hfi1_devs_lock;
+extern struct xarray hfi1_dev_table;
 struct hfi1_devdata *hfi1_lookup(int unit);
 
 static inline unsigned long uctxt_offset(struct hfi1_ctxtdata *uctxt)
index faaaac8..71cb952 100644 (file)
@@ -49,7 +49,7 @@
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <linux/module.h>
 #include <linux/printk.h>
 #include <linux/hrtimer.h>
@@ -124,7 +124,7 @@ MODULE_PARM_DESC(user_credit_return_threshold, "Credit return threshold for user
 
 static inline u64 encode_rcv_header_entry_size(u16 size);
 
-static struct idr hfi1_unit_table;
+DEFINE_XARRAY_FLAGS(hfi1_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 
 static int hfi1_create_kctxt(struct hfi1_devdata *dd,
                             struct hfi1_pportdata *ppd)
@@ -469,7 +469,7 @@ int hfi1_create_ctxtdata(struct hfi1_pportdata *ppd, int numa,
                if (rcd->egrbufs.size < hfi1_max_mtu) {
                        rcd->egrbufs.size = __roundup_pow_of_two(hfi1_max_mtu);
                        hfi1_cdbg(PROC,
-                                 "ctxt%u: eager bufs size too small. Adjusting to %zu\n",
+                                 "ctxt%u: eager bufs size too small. Adjusting to %u\n",
                                    rcd->ctxt, rcd->egrbufs.size);
                }
                rcd->egrbufs.rcvtid_size = HFI1_MAX_EAGER_BUFFER_SIZE;
@@ -805,7 +805,8 @@ static int create_workqueues(struct hfi1_devdata *dd)
                        ppd->hfi1_wq =
                                alloc_workqueue(
                                    "hfi%d_%d",
-                                   WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE,
+                                   WQ_SYSFS | WQ_HIGHPRI | WQ_CPU_INTENSIVE |
+                                   WQ_MEM_RECLAIM,
                                    HFI1_MAX_ACTIVE_WORKQUEUE_ENTRIES,
                                    dd->unit, pidx);
                        if (!ppd->hfi1_wq)
@@ -1018,21 +1019,9 @@ done:
        return ret;
 }
 
-static inline struct hfi1_devdata *__hfi1_lookup(int unit)
-{
-       return idr_find(&hfi1_unit_table, unit);
-}
-
 struct hfi1_devdata *hfi1_lookup(int unit)
 {
-       struct hfi1_devdata *dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       dd = __hfi1_lookup(unit);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-
-       return dd;
+       return xa_load(&hfi1_dev_table, unit);
 }
 
 /*
@@ -1200,7 +1189,7 @@ void hfi1_free_ctxtdata(struct hfi1_devdata *dd, struct hfi1_ctxtdata *rcd)
 /*
  * Release our hold on the shared asic data.  If we are the last one,
  * return the structure to be finalized outside the lock.  Must be
- * holding hfi1_devs_lock.
+ * holding hfi1_dev_table lock.
  */
 static struct hfi1_asic_data *release_asic_data(struct hfi1_devdata *dd)
 {
@@ -1236,13 +1225,10 @@ static void hfi1_clean_devdata(struct hfi1_devdata *dd)
        struct hfi1_asic_data *ad;
        unsigned long flags;
 
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       if (!list_empty(&dd->list)) {
-               idr_remove(&hfi1_unit_table, dd->unit);
-               list_del_init(&dd->list);
-       }
+       xa_lock_irqsave(&hfi1_dev_table, flags);
+       __xa_erase(&hfi1_dev_table, dd->unit);
        ad = release_asic_data(dd);
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       xa_unlock_irqrestore(&hfi1_dev_table, flags);
 
        finalize_asic_data(dd, ad);
        free_platform_config(dd);
@@ -1286,13 +1272,10 @@ void hfi1_free_devdata(struct hfi1_devdata *dd)
  * Must be done via verbs allocator, because the verbs cleanup process
  * both does cleanup and free of the data structure.
  * "extra" is for chip-specific data.
- *
- * Use the idr mechanism to get a unit number for this unit.
  */
 static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
                                               size_t extra)
 {
-       unsigned long flags;
        struct hfi1_devdata *dd;
        int ret, nports;
 
@@ -1307,21 +1290,10 @@ static struct hfi1_devdata *hfi1_alloc_devdata(struct pci_dev *pdev,
        dd->pport = (struct hfi1_pportdata *)(dd + 1);
        dd->pcidev = pdev;
        pci_set_drvdata(pdev, dd);
-
-       INIT_LIST_HEAD(&dd->list);
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-
-       ret = idr_alloc(&hfi1_unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret >= 0) {
-               dd->unit = ret;
-               list_add(&dd->list, &hfi1_dev_list);
-       }
        dd->node = NUMA_NO_NODE;
 
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
-       idr_preload_end();
-
+       ret = xa_alloc_irq(&hfi1_dev_table, &dd->unit, dd, xa_limit_32b,
+                       GFP_KERNEL);
        if (ret < 0) {
                dev_err(&pdev->dev,
                        "Could not allocate unit ID: error %d\n", -ret);
@@ -1522,8 +1494,6 @@ static int __init hfi1_mod_init(void)
         * These must be called before the driver is registered with
         * the PCI subsystem.
         */
-       idr_init(&hfi1_unit_table);
-
        hfi1_dbg_init();
        ret = pci_register_driver(&hfi1_pci_driver);
        if (ret < 0) {
@@ -1534,7 +1504,6 @@ static int __init hfi1_mod_init(void)
 
 bail_dev:
        hfi1_dbg_exit();
-       idr_destroy(&hfi1_unit_table);
        dev_cleanup();
 bail:
        return ret;
@@ -1552,7 +1521,7 @@ static void __exit hfi1_mod_cleanup(void)
        node_affinity_destroy_all();
        hfi1_dbg_exit();
 
-       idr_destroy(&hfi1_unit_table);
+       WARN_ON(!xa_empty(&hfi1_dev_table));
        dispose_firmware();     /* asymmetric with obtain_firmware() */
        dev_cleanup();
 }
@@ -2071,7 +2040,7 @@ int hfi1_setup_eagerbufs(struct hfi1_ctxtdata *rcd)
        rcd->egrbufs.size = alloced_bytes;
 
        hfi1_cdbg(PROC,
-                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %zuKB\n",
+                 "ctxt%u: Alloced %u rcv tid entries @ %uKB, total %uKB\n",
                  rcd->ctxt, rcd->egrbufs.alloced,
                  rcd->egrbufs.rcvtid_size / 1024, rcd->egrbufs.size / 1024);
 
index 5f2011c..62f93c1 100644 (file)
  * for future transactions
  */
 
+#include <linux/workqueue.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdmavt_qp.h>
+
 /* STL Verbs Extended */
 #define IB_BTHE_E_SHIFT           24
 #define HFI1_VERBS_E_ATOMIC_VADDR U64_MAX
 
-struct ib_atomic_eth;
-
 enum hfi1_opfn_codes {
        STL_VERBS_EXTD_NONE = 0,
        STL_VERBS_EXTD_TID_RDMA,
index eba3003..4e0e9fc 100644 (file)
@@ -742,6 +742,8 @@ void *qp_priv_alloc(struct rvt_dev_info *rdi, struct rvt_qp *qp)
                iowait_wakeup,
                iowait_sdma_drained,
                hfi1_init_priority);
+       /* Init to a value to start the running average correctly */
+       priv->s_running_pkt_size = piothreshold / 2;
        return priv;
 }
 
index 5991211..a922edc 100644 (file)
@@ -140,10 +140,7 @@ static int make_rc_ack(struct hfi1_ibdev *dev, struct rvt_qp *qp,
        case OP(RDMA_READ_RESPONSE_LAST):
        case OP(RDMA_READ_RESPONSE_ONLY):
                e = &qp->s_ack_queue[qp->s_tail_ack_queue];
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
+               release_rdma_sge_mr(e);
                /* FALLTHROUGH */
        case OP(ATOMIC_ACKNOWLEDGE):
                /*
@@ -343,7 +340,8 @@ write_resp:
                        break;
 
                e->sent = 1;
-               qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+               /* Do not free e->rdma_sge until all data are received */
+               qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
                break;
 
        case TID_OP(READ_RESP):
@@ -1836,7 +1834,7 @@ void hfi1_rc_send_complete(struct rvt_qp *qp, struct hfi1_opa_header *opah)
                qp->s_last = s_last;
                /* see post_send() */
                barrier();
-               rvt_put_swqe(wqe);
+               rvt_put_qp_swqe(qp, wqe);
                rvt_qp_swqe_complete(qp,
                                     wqe,
                                     ib_hfi1_wc_opcode[wqe->wr.opcode],
@@ -1884,7 +1882,7 @@ struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
                u32 s_last;
 
                trdma_clean_swqe(qp, wqe);
-               rvt_put_swqe(wqe);
+               rvt_put_qp_swqe(qp, wqe);
                rvt_qp_wqe_unreserve(qp, wqe);
                s_last = qp->s_last;
                trace_hfi1_qp_send_completion(qp, wqe, s_last);
@@ -2643,10 +2641,7 @@ static noinline int rc_rcv_error(struct ib_other_headers *ohdr, void *data,
                len = be32_to_cpu(reth->length);
                if (unlikely(offset + len != e->rdma_sge.sge_length))
                        goto unlock_done;
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
+               release_rdma_sge_mr(e);
                if (len != 0) {
                        u32 rkey = be32_to_cpu(reth->rkey);
                        u64 vaddr = get_ib_reth_vaddr(reth);
@@ -3088,10 +3083,7 @@ send_last:
                        update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
+               release_rdma_sge_mr(e);
                reth = &ohdr->u.rc.reth;
                len = be32_to_cpu(reth->length);
                if (len) {
@@ -3166,10 +3158,7 @@ send_last:
                        update_ack_queue(qp, next);
                }
                e = &qp->s_ack_queue[qp->r_head_ack_queue];
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
+               release_rdma_sge_mr(e);
                /* Process OPFN special virtual address */
                if (opfn) {
                        opfn_conn_response(qp, e, ateth);
index 8e0935b..5ed5e85 100644 (file)
@@ -41,6 +41,14 @@ static inline u32 restart_sge(struct rvt_sge_state *ss, struct rvt_swqe *wqe,
        return rvt_restart_sge(ss, wqe, len);
 }
 
+/*
+ * Drop the MR reference held by an ack queue entry, if any.
+ *
+ * When e->rdma_sge.mr is set, rvt_put_mr() is called on it and the pointer
+ * is cleared, so calling this again on the same entry does nothing.
+ */
+static inline void release_rdma_sge_mr(struct rvt_ack_entry *e)
+{
+       if (e->rdma_sge.mr) {
+               rvt_put_mr(e->rdma_sge.mr);
+               e->rdma_sge.mr = NULL;
+       }
+}
+
 struct rvt_ack_entry *find_prev_entry(struct rvt_qp *qp, u32 psn, u8 *prev,
                                      u8 *prev_ack, bool *scheduled);
 int do_rc_ack(struct rvt_qp *qp, u32 aeth, u32 psn, int opcode, u64 val,
index 124a3ec..23ac605 100644 (file)
@@ -524,7 +524,7 @@ void _hfi1_do_send(struct work_struct *work)
 
 /**
  * hfi1_do_send - perform a send on a QP
- * @work: contains a pointer to the QP
+ * @qp: a pointer to the QP
  * @in_thread: true if in a workqueue thread
  *
  * Process entries in the send work queue until credit or queue is
index 43cbce7..6fb9303 100644 (file)
@@ -67,8 +67,6 @@ static u32 mask_generation(u32 a)
 #define TID_RDMA_DESTQP_FLOW_SHIFT      11
 #define TID_RDMA_DESTQP_FLOW_MASK       0x1f
 
-#define TID_FLOW_SW_PSN BIT(0)
-
 #define TID_OPFN_QP_CTXT_MASK 0xff
 #define TID_OPFN_QP_CTXT_SHIFT 56
 #define TID_OPFN_QP_KDETH_MASK 0xff
@@ -128,6 +126,15 @@ static int make_tid_rdma_ack(struct rvt_qp *qp,
                             struct ib_other_headers *ohdr,
                             struct hfi1_pkt_state *ps);
 static void hfi1_do_tid_send(struct rvt_qp *qp);
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx);
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+                            struct ib_other_headers *ohdr,
+                            struct rvt_qp *qp, u32 psn, int diff, bool fecn);
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+                                  struct hfi1_qp_priv *priv,
+                                  struct hfi1_ctxtdata *rcd,
+                                  struct tid_rdma_flow *flow,
+                                  bool fecn);
 
 static u64 tid_rdma_opfn_encode(struct tid_rdma_params *p)
 {
@@ -776,7 +783,6 @@ int hfi1_kern_setup_hw_flow(struct hfi1_ctxtdata *rcd, struct rvt_qp *qp)
                rcd->flows[fs->index].generation = fs->generation;
        fs->generation = kern_setup_hw_flow(rcd, fs->index);
        fs->psn = 0;
-       fs->flags = 0;
        dequeue_tid_waiter(rcd, &rcd->flow_queue, qp);
        /* get head before dropping lock */
        fqp = first_qp(rcd, &rcd->flow_queue);
@@ -1807,6 +1813,7 @@ sync_check:
                        goto done;
 
                hfi1_kern_clear_hw_flow(req->rcd, qp);
+               qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
                req->state = TID_REQUEST_ACTIVE;
        }
 
@@ -2036,10 +2043,7 @@ static int tid_rdma_rcv_error(struct hfi1_packet *packet,
                if (psn != e->psn || len != req->total_len)
                        goto unlock;
 
-               if (e->rdma_sge.mr) {
-                       rvt_put_mr(e->rdma_sge.mr);
-                       e->rdma_sge.mr = NULL;
-               }
+               release_rdma_sge_mr(e);
 
                rkey = be32_to_cpu(reth->rkey);
                vaddr = get_ib_reth_vaddr(reth);
@@ -2238,7 +2242,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
        struct ib_reth *reth;
        struct hfi1_qp_priv *qpriv = qp->priv;
        u32 bth0, psn, len, rkey;
-       bool is_fecn;
+       bool fecn;
        u8 next;
        u64 vaddr;
        int diff;
@@ -2248,7 +2252,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
        if (hfi1_ruc_check_hdr(ibp, packet))
                return;
 
-       is_fecn = process_ecn(qp, packet);
+       fecn = process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        trace_hfi1_rsp_rcv_tid_read_req(qp, psn);
 
@@ -2267,9 +2271,8 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 
        diff = delta_psn(psn, qp->r_psn);
        if (unlikely(diff)) {
-               if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-                       return;
-               goto send_ack;
+               tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+               return;
        }
 
        /* We've verified the request, insert it into the ack queue. */
@@ -2285,10 +2288,7 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
                update_ack_queue(qp, next);
        }
        e = &qp->s_ack_queue[qp->r_head_ack_queue];
-       if (e->rdma_sge.mr) {
-               rvt_put_mr(e->rdma_sge.mr);
-               e->rdma_sge.mr = NULL;
-       }
+       release_rdma_sge_mr(e);
 
        rkey = be32_to_cpu(reth->rkey);
        qp->r_len = len;
@@ -2324,11 +2324,11 @@ void hfi1_rc_rcv_tid_rdma_read_req(struct hfi1_packet *packet)
 
        /* Schedule the send tasklet. */
        qp->s_flags |= RVT_S_RESP_PENDING;
+       if (fecn)
+               qp->s_flags |= RVT_S_ECN;
        hfi1_schedule_send(qp);
 
        spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (is_fecn)
-               goto send_ack;
        return;
 
 nack_inv_unlock:
@@ -2345,8 +2345,6 @@ nack_acc:
        rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
        qp->r_ack_psn = qp->r_psn;
-send_ack:
-       hfi1_send_rc_ack(packet, is_fecn);
 }
 
 u32 hfi1_build_tid_rdma_read_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@@ -2463,12 +2461,12 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
        struct tid_rdma_request *req;
        struct tid_rdma_flow *flow;
        u32 opcode, aeth;
-       bool is_fecn;
+       bool fecn;
        unsigned long flags;
        u32 kpsn, ipsn;
 
        trace_hfi1_sender_rcv_tid_read_resp(qp);
-       is_fecn = process_ecn(qp, packet);
+       fecn = process_ecn(qp, packet);
        kpsn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        aeth = be32_to_cpu(ohdr->u.tid_rdma.r_rsp.aeth);
        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@@ -2481,8 +2479,43 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
 
        flow = &req->flows[req->clear_tail];
        /* When header suppression is disabled */
-       if (cmp_psn(ipsn, flow->flow_state.ib_lpsn))
+       if (cmp_psn(ipsn, flow->flow_state.ib_lpsn)) {
+               update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
+               if (cmp_psn(kpsn, flow->flow_state.r_next_psn))
+                       goto ack_done;
+               flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
+               /*
+                * Copy the payload to destination buffer if this packet is
+                * delivered as an eager packet due to RSM rule and FECN.
+                * The RSM rule selects FECN bit in BTH and SH bit in
+                * KDETH header and therefore will not match the last
+                * packet of each segment that has SH bit cleared.
+                */
+               if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+                       struct rvt_sge_state ss;
+                       u32 len;
+                       u32 tlen = packet->tlen;
+                       u16 hdrsize = packet->hlen;
+                       u8 pad = packet->pad;
+                       u8 extra_bytes = pad + packet->extra_byte +
+                               (SIZE_OF_CRC << 2);
+                       u32 pmtu = qp->pmtu;
+
+                       if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+                               goto ack_op_err;
+                       len = restart_sge(&ss, req->e.swqe, ipsn, pmtu);
+                       if (unlikely(len < pmtu))
+                               goto ack_op_err;
+                       rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+                                    false);
+                       /* Raise the sw sequence check flag for next packet */
+                       priv->s_flags |= HFI1_R_TID_SW_PSN;
+               }
+
                goto ack_done;
+       }
+       flow->flow_state.r_next_psn = mask_psn(kpsn + 1);
        req->ack_pending--;
        priv->pending_tid_r_segs--;
        qp->s_num_rd_atomic--;
@@ -2524,6 +2557,7 @@ void hfi1_rc_rcv_tid_rdma_read_resp(struct hfi1_packet *packet)
             req->comp_seg == req->cur_seg) ||
            priv->tid_r_comp == priv->tid_r_reqs) {
                hfi1_kern_clear_hw_flow(priv->rcd, qp);
+               priv->s_flags &= ~HFI1_R_TID_SW_PSN;
                if (req->state == TID_REQUEST_SYNC)
                        req->state = TID_REQUEST_ACTIVE;
        }
@@ -2545,8 +2579,6 @@ ack_op_err:
 
 ack_done:
        spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (is_fecn)
-               hfi1_send_rc_ack(packet, is_fecn);
 }
 
 void hfi1_kern_read_tid_flow_free(struct rvt_qp *qp)
@@ -2773,9 +2805,9 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                                rvt_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
                                return ret;
                        }
-                       if (priv->flow_state.flags & TID_FLOW_SW_PSN) {
+                       if (priv->s_flags & HFI1_R_TID_SW_PSN) {
                                diff = cmp_psn(psn,
-                                              priv->flow_state.r_next_psn);
+                                              flow->flow_state.r_next_psn);
                                if (diff > 0) {
                                        if (!(qp->r_flags & RVT_R_RDMAR_SEQ))
                                                restart_tid_rdma_read_req(rcd,
@@ -2811,22 +2843,15 @@ static bool handle_read_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                                                qp->r_flags &=
                                                        ~RVT_R_RDMAR_SEQ;
                                }
-                               priv->flow_state.r_next_psn++;
+                               flow->flow_state.r_next_psn =
+                                       mask_psn(psn + 1);
                        } else {
-                               u64 reg;
                                u32 last_psn;
 
-                               /*
-                                * The only sane way to get the amount of
-                                * progress is to read the HW flow state.
-                                */
-                               reg = read_uctxt_csr(dd, rcd->ctxt,
-                                                    RCV_TID_FLOW_TABLE +
-                                                    (8 * flow->idx));
-                               last_psn = mask_psn(reg);
-
-                               priv->flow_state.r_next_psn = last_psn;
-                               priv->flow_state.flags |= TID_FLOW_SW_PSN;
+                               last_psn = read_r_next_psn(dd, rcd->ctxt,
+                                                          flow->idx);
+                               flow->flow_state.r_next_psn = last_psn;
+                               priv->s_flags |= HFI1_R_TID_SW_PSN;
                                /*
                                 * If no request has been restarted yet,
                                 * restart the current one.
@@ -2891,10 +2916,11 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
        struct rvt_ack_entry *e;
        struct tid_rdma_request *req;
        struct tid_rdma_flow *flow;
+       int diff = 0;
 
        trace_hfi1_msg_handle_kdeth_eflags(NULL, "Kdeth error: rhf ",
                                           packet->rhf);
-       if (packet->rhf & (RHF_VCRC_ERR | RHF_ICRC_ERR))
+       if (packet->rhf & RHF_ICRC_ERR)
                return ret;
 
        packet->ohdr = &hdr->u.oth;
@@ -2974,17 +3000,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                switch (rte) {
                case RHF_RTE_EXPECTED_FLOW_SEQ_ERR:
                        if (!(qpriv->s_flags & HFI1_R_TID_SW_PSN)) {
-                               u64 reg;
-
                                qpriv->s_flags |= HFI1_R_TID_SW_PSN;
-                               /*
-                                * The only sane way to get the amount of
-                                * progress is to read the HW flow state.
-                                */
-                               reg = read_uctxt_csr(dd, rcd->ctxt,
-                                                    RCV_TID_FLOW_TABLE +
-                                                    (8 * flow->idx));
-                               flow->flow_state.r_next_psn = mask_psn(reg);
+                               flow->flow_state.r_next_psn =
+                                       read_r_next_psn(dd, rcd->ctxt,
+                                                       flow->idx);
                                qpriv->r_next_psn_kdeth =
                                        flow->flow_state.r_next_psn;
                                goto nak_psn;
@@ -2997,10 +3016,12 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                                 * mismatch could be due to packets that were
                                 * already in flight.
                                 */
-                               if (psn != flow->flow_state.r_next_psn) {
-                                       psn = flow->flow_state.r_next_psn;
+                               diff = cmp_psn(psn,
+                                              flow->flow_state.r_next_psn);
+                               if (diff > 0)
                                        goto nak_psn;
-                               }
+                               else if (diff < 0)
+                                       break;
 
                                qpriv->s_nak_state = 0;
                                /*
@@ -3011,8 +3032,10 @@ bool hfi1_handle_kdeth_eflags(struct hfi1_ctxtdata *rcd,
                                if (psn == full_flow_psn(flow,
                                                         flow->flow_state.lpsn))
                                        ret = false;
+                               flow->flow_state.r_next_psn =
+                                       mask_psn(psn + 1);
                                qpriv->r_next_psn_kdeth =
-                                       ++flow->flow_state.r_next_psn;
+                                       flow->flow_state.r_next_psn;
                        }
                        break;
 
@@ -3517,8 +3540,10 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
                if (qpriv->r_tid_alloc == qpriv->r_tid_head) {
                        /* If all data has been received, clear the flow */
                        if (qpriv->flow_state.index < RXE_NUM_TID_FLOWS &&
-                           !qpriv->alloc_w_segs)
+                           !qpriv->alloc_w_segs) {
                                hfi1_kern_clear_hw_flow(rcd, qp);
+                               qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+                       }
                        break;
                }
 
@@ -3544,8 +3569,7 @@ static void hfi1_tid_write_alloc_resources(struct rvt_qp *qp, bool intr_ctx)
                if (qpriv->sync_pt && !qpriv->alloc_w_segs) {
                        hfi1_kern_clear_hw_flow(rcd, qp);
                        qpriv->sync_pt = false;
-                       if (qpriv->s_flags & HFI1_R_TID_SW_PSN)
-                               qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
+                       qpriv->s_flags &= ~HFI1_R_TID_SW_PSN;
                }
 
                /* Allocate flow if we don't have one */
@@ -3687,7 +3711,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
        struct hfi1_qp_priv *qpriv = qp->priv;
        struct tid_rdma_request *req;
        u32 bth0, psn, len, rkey, num_segs;
-       bool is_fecn;
+       bool fecn;
        u8 next;
        u64 vaddr;
        int diff;
@@ -3696,7 +3720,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
        if (hfi1_ruc_check_hdr(ibp, packet))
                return;
 
-       is_fecn = process_ecn(qp, packet);
+       fecn = process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        trace_hfi1_rsp_rcv_tid_write_req(qp, psn);
 
@@ -3713,9 +3737,8 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
        num_segs = DIV_ROUND_UP(len, qpriv->tid_rdma.local.max_len);
        diff = delta_psn(psn, qp->r_psn);
        if (unlikely(diff)) {
-               if (tid_rdma_rcv_error(packet, ohdr, qp, psn, diff))
-                       return;
-               goto send_ack;
+               tid_rdma_rcv_err(packet, ohdr, qp, psn, diff, fecn);
+               return;
        }
 
        /*
@@ -3751,10 +3774,7 @@ void hfi1_rc_rcv_tid_rdma_write_req(struct hfi1_packet *packet)
                goto update_head;
        }
 
-       if (e->rdma_sge.mr) {
-               rvt_put_mr(e->rdma_sge.mr);
-               e->rdma_sge.mr = NULL;
-       }
+       release_rdma_sge_mr(e);
 
        /* The length needs to be in multiples of PAGE_SIZE */
        if (!len || len & ~PAGE_MASK)
@@ -3834,11 +3854,11 @@ update_head:
 
        /* Schedule the send tasklet. */
        qp->s_flags |= RVT_S_RESP_PENDING;
+       if (fecn)
+               qp->s_flags |= RVT_S_ECN;
        hfi1_schedule_send(qp);
 
        spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (is_fecn)
-               goto send_ack;
        return;
 
 nack_inv_unlock:
@@ -3855,8 +3875,6 @@ nack_acc:
        rvt_rc_error(qp, IB_WC_LOC_PROT_ERR);
        qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
        qp->r_ack_psn = qp->r_psn;
-send_ack:
-       hfi1_send_rc_ack(packet, is_fecn);
 }
 
 u32 hfi1_build_tid_rdma_write_resp(struct rvt_qp *qp, struct rvt_ack_entry *e,
@@ -4073,10 +4091,10 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
        struct tid_rdma_flow *flow;
        enum ib_wc_status status;
        u32 opcode, aeth, psn, flow_psn, i, tidlen = 0, pktlen;
-       bool is_fecn;
+       bool fecn;
        unsigned long flags;
 
-       is_fecn = process_ecn(qp, packet);
+       fecn = process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        aeth = be32_to_cpu(ohdr->u.tid_rdma.w_rsp.aeth);
        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
@@ -4216,7 +4234,6 @@ void hfi1_rc_rcv_tid_rdma_write_resp(struct hfi1_packet *packet)
                qpriv->s_tid_cur = i;
        }
        qp->s_flags &= ~HFI1_S_WAIT_TID_RESP;
-
        hfi1_schedule_tid_send(qp);
        goto ack_done;
 
@@ -4225,9 +4242,9 @@ ack_op_err:
 ack_err:
        rvt_error_qp(qp, status);
 ack_done:
+       if (fecn)
+               qp->s_flags |= RVT_S_ECN;
        spin_unlock_irqrestore(&qp->s_lock, flags);
-       if (is_fecn)
-               hfi1_send_rc_ack(packet, is_fecn);
 }
 
 bool hfi1_build_tid_rdma_packet(struct rvt_swqe *wqe,
@@ -4307,7 +4324,9 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
        unsigned long flags;
        u32 psn, next;
        u8 opcode;
+       bool fecn;
 
+       fecn = process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0xff;
 
@@ -4320,9 +4339,53 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
        req = ack_to_tid_req(e);
        flow = &req->flows[req->clear_tail];
        if (cmp_psn(psn, full_flow_psn(flow, flow->flow_state.lpsn))) {
+               update_r_next_psn_fecn(packet, priv, rcd, flow, fecn);
+
                if (cmp_psn(psn, flow->flow_state.r_next_psn))
                        goto send_nak;
-               flow->flow_state.r_next_psn++;
+
+               flow->flow_state.r_next_psn = mask_psn(psn + 1);
+               /*
+                * Copy the payload to destination buffer if this packet is
+                * delivered as an eager packet due to RSM rule and FECN.
+                * The RSM rule selects FECN bit in BTH and SH bit in
+                * KDETH header and therefore will not match the last
+                * packet of each segment that has SH bit cleared.
+                */
+               if (fecn && packet->etype == RHF_RCV_TYPE_EAGER) {
+                       struct rvt_sge_state ss;
+                       u32 len;
+                       u32 tlen = packet->tlen;
+                       u16 hdrsize = packet->hlen;
+                       u8 pad = packet->pad;
+                       u8 extra_bytes = pad + packet->extra_byte +
+                               (SIZE_OF_CRC << 2);
+                       u32 pmtu = qp->pmtu;
+
+                       if (unlikely(tlen != (hdrsize + pmtu + extra_bytes)))
+                               goto send_nak;
+                       len = req->comp_seg * req->seg_len;
+                       len += delta_psn(psn,
+                               full_flow_psn(flow, flow->flow_state.spsn)) *
+                               pmtu;
+                       if (unlikely(req->total_len - len < pmtu))
+                               goto send_nak;
+
+                       /*
+                        * The e->rdma_sge field is set when TID RDMA WRITE REQ
+                        * is first received and is never modified thereafter.
+                        */
+                       ss.sge = e->rdma_sge;
+                       ss.sg_list = NULL;
+                       ss.num_sge = 1;
+                       ss.total_len = req->total_len;
+                       rvt_skip_sge(&ss, len, false);
+                       rvt_copy_sge(qp, &ss, packet->payload, pmtu, false,
+                                    false);
+                       /* Raise the sw sequence check flag for next packet */
+                       priv->r_next_psn_kdeth = mask_psn(psn + 1);
+                       priv->s_flags |= HFI1_R_TID_SW_PSN;
+               }
                goto exit;
        }
        flow->flow_state.r_next_psn = mask_psn(psn + 1);
@@ -4347,6 +4410,7 @@ void hfi1_rc_rcv_tid_rdma_write_data(struct hfi1_packet *packet)
                priv->r_tid_ack = priv->r_tid_tail;
 
        if (opcode == TID_OP(WRITE_DATA_LAST)) {
+               release_rdma_sge_mr(e);
                for (next = priv->r_tid_tail + 1; ; next++) {
                        if (next > rvt_size_atomic(&dev->rdi))
                                next = 0;
@@ -4386,6 +4450,8 @@ done:
        hfi1_schedule_tid_send(qp);
 exit:
        priv->r_next_psn_kdeth = flow->flow_state.r_next_psn;
+       if (fecn)
+               qp->s_flags |= RVT_S_ECN;
        spin_unlock_irqrestore(&qp->s_lock, flags);
        return;
 
@@ -4487,12 +4553,11 @@ void hfi1_rc_rcv_tid_rdma_ack(struct hfi1_packet *packet)
        struct tid_rdma_request *req;
        struct tid_rdma_flow *flow;
        u32 aeth, psn, req_psn, ack_psn, fspsn, resync_psn, ack_kpsn;
-       bool is_fecn;
        unsigned long flags;
        u16 fidx;
 
        trace_hfi1_tid_write_sender_rcv_tid_ack(qp, 0);
-       is_fecn = process_ecn(qp, packet);
+       process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
        aeth = be32_to_cpu(ohdr->u.tid_rdma.ack.aeth);
        req_psn = mask_psn(be32_to_cpu(ohdr->u.tid_rdma.ack.verbs_psn));
@@ -4846,10 +4911,10 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
        struct tid_rdma_flow *flow;
        struct tid_flow_state *fs = &qpriv->flow_state;
        u32 psn, generation, idx, gen_next;
-       bool is_fecn;
+       bool fecn;
        unsigned long flags;
 
-       is_fecn = process_ecn(qp, packet);
+       fecn = process_ecn(qp, packet);
        psn = mask_psn(be32_to_cpu(ohdr->bth[2]));
 
        generation = mask_psn(psn + 1) >> HFI1_KDETH_BTH_SEQ_SHIFT;
@@ -4940,6 +5005,8 @@ void hfi1_rc_rcv_tid_rdma_resync(struct hfi1_packet *packet)
        qpriv->s_flags |= RVT_S_ACK_PENDING;
        hfi1_schedule_tid_send(qp);
 bail:
+       if (fecn)
+               qp->s_flags |= RVT_S_ECN;
        spin_unlock_irqrestore(&qp->s_lock, flags);
 }
 
@@ -5449,3 +5516,48 @@ bool hfi1_tid_rdma_ack_interlock(struct rvt_qp *qp, struct rvt_ack_entry *e)
        }
        return false;
 }
+
+static u32 read_r_next_psn(struct hfi1_devdata *dd, u8 ctxt, u8 fidx)
+{
+       u64 reg; /* raw value read from the flow table CSR */
+
+       /*
+        * The only sane way to get the amount of progress is to read the
+        * HW flow state: RCV_TID_FLOW_TABLE at offset 8 * @fidx for @ctxt.
+        */
+       reg = read_uctxt_csr(dd, ctxt, RCV_TID_FLOW_TABLE + (8 * fidx));
+       return mask_psn(reg); /* caller gets the mask_psn()-reduced value */
+}
+
+static void tid_rdma_rcv_err(struct hfi1_packet *packet,
+                            struct ib_other_headers *ohdr,
+                            struct rvt_qp *qp, u32 psn, int diff, bool fecn)
+{
+       unsigned long flags; /* IRQ state saved across qp->s_lock */
+
+       tid_rdma_rcv_error(packet, ohdr, qp, psn, diff); /* result discarded */
+       if (fecn) { /* set RVT_S_ECN under qp->s_lock when @fecn is true */
+               spin_lock_irqsave(&qp->s_lock, flags);
+               qp->s_flags |= RVT_S_ECN;
+               spin_unlock_irqrestore(&qp->s_lock, flags);
+       }
+}
+
+static void update_r_next_psn_fecn(struct hfi1_packet *packet,
+                                  struct hfi1_qp_priv *priv,
+                                  struct hfi1_ctxtdata *rcd,
+                                  struct tid_rdma_flow *flow,
+                                  bool fecn)
+{
+       /*
+        * If a start/middle packet is delivered here (as an eager packet)
+        * due to RSM rule and FECN, r_next_psn is reloaded for this flow.
+        */
+       if (fecn && packet->etype == RHF_RCV_TYPE_EAGER &&
+           !(priv->s_flags & HFI1_R_TID_SW_PSN)) { /* only while flag clear */
+               struct hfi1_devdata *dd = rcd->dd;
+
+               flow->flow_state.r_next_psn =
+                       read_r_next_psn(dd, rcd->ctxt, flow->idx);
+       }
+}
index 53ab24e..1c53618 100644 (file)
@@ -76,10 +76,8 @@ struct tid_rdma_qp_params {
 struct tid_flow_state {
        u32 generation;
        u32 psn;
-       u32 r_next_psn;      /* next PSN to be received (in TID space) */
        u8 index;
        u8 last_index;
-       u8 flags;
 };
 
 enum tid_rdma_req_state {
index e62171f..de7a873 100644 (file)
@@ -86,14 +86,14 @@ DECLARE_EVENT_CLASS(hfi1_trace_template,
  * actual function to work and can not be in a macro.
  */
 #define __hfi1_trace_def(lvl) \
-void __hfi1_trace_##lvl(const char *funct, char *fmt, ...);            \
+void __printf(2, 3) __hfi1_trace_##lvl(const char *funct, char *fmt, ...); \
                                                                        \
 DEFINE_EVENT(hfi1_trace_template, hfi1_ ##lvl,                         \
        TP_PROTO(const char *function, struct va_format *vaf),          \
        TP_ARGS(function, vaf))
 
 #define __hfi1_trace_fn(lvl) \
-void __hfi1_trace_##lvl(const char *func, char *fmt, ...)              \
+void __printf(2, 3) __hfi1_trace_##lvl(const char *func, char *fmt, ...)\
 {                                                                      \
        struct va_format vaf = {                                        \
                .fmt = fmt,                                             \
index 548dfc4..4388b59 100644 (file)
@@ -53,7 +53,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
                            "tid_r_comp %u pending_tid_r_segs %u " \
                            "s_flags 0x%x ps_flags 0x%x iow_flags 0x%lx " \
                            "s_state 0x%x hw_flow_index %u generation 0x%x " \
-                           "fpsn 0x%x flow_flags 0x%x"
+                           "fpsn 0x%x"
 
 #define TID_REQ_PRN "[%s] qpn 0x%x newreq %u opcode 0x%x psn 0x%x lpsn 0x%x " \
                    "cur_seg %u comp_seg %u ack_seg %u alloc_seg %u " \
@@ -71,7 +71,7 @@ u16 hfi1_trace_get_tid_idx(u32 ent);
                            "pending_tid_w_segs %u sync_pt %s " \
                            "ps_nak_psn 0x%x ps_nak_state 0x%x " \
                            "prnr_nak_state 0x%x hw_flow_index %u generation "\
-                           "0x%x fpsn 0x%x flow_flags 0x%x resync %s" \
+                           "0x%x fpsn 0x%x resync %s " \
                            "r_next_psn_kdeth 0x%x"
 
 #define TID_WRITE_SENDER_PRN "[%s] qpn 0x%x newreq %u s_tid_cur %u " \
@@ -973,7 +973,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
                __field(u32, hw_flow_index)
                __field(u32, generation)
                __field(u32, fpsn)
-               __field(u32, flow_flags)
        ),
        TP_fast_assign(/* assign */
                struct hfi1_qp_priv *priv = qp->priv;
@@ -991,7 +990,6 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
                __entry->hw_flow_index = priv->flow_state.index;
                __entry->generation = priv->flow_state.generation;
                __entry->fpsn = priv->flow_state.psn;
-               __entry->flow_flags = priv->flow_state.flags;
        ),
        TP_printk(/* print */
                TID_READ_SENDER_PRN,
@@ -1007,8 +1005,7 @@ DECLARE_EVENT_CLASS(/* tid_read_sender */
                __entry->s_state,
                __entry->hw_flow_index,
                __entry->generation,
-               __entry->fpsn,
-               __entry->flow_flags
+               __entry->fpsn
        )
 );
 
@@ -1338,7 +1335,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
                __field(u32, hw_flow_index)
                __field(u32, generation)
                __field(u32, fpsn)
-               __field(u32, flow_flags)
                __field(bool, resync)
                __field(u32, r_next_psn_kdeth)
        ),
@@ -1360,7 +1356,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
                __entry->hw_flow_index = priv->flow_state.index;
                __entry->generation = priv->flow_state.generation;
                __entry->fpsn = priv->flow_state.psn;
-               __entry->flow_flags = priv->flow_state.flags;
                __entry->resync = priv->resync;
                __entry->r_next_psn_kdeth = priv->r_next_psn_kdeth;
        ),
@@ -1381,7 +1376,6 @@ DECLARE_EVENT_CLASS(/* tid_write_sp */
                __entry->hw_flow_index,
                __entry->generation,
                __entry->fpsn,
-               __entry->flow_flags,
                __entry->resync ? "yes" : "no",
                __entry->r_next_psn_kdeth
        )
index 55a56b3..1eb4105 100644 (file)
@@ -1223,15 +1223,16 @@ static inline send_routine get_send_routine(struct rvt_qp *qp,
        case IB_QPT_UD:
                break;
        case IB_QPT_UC:
-       case IB_QPT_RC: {
+       case IB_QPT_RC:
+               priv->s_running_pkt_size =
+                       (tx->s_cur_size + priv->s_running_pkt_size) / 2;
                if (piothreshold &&
-                   tx->s_cur_size <= min(piothreshold, qp->pmtu) &&
+                   priv->s_running_pkt_size <= min(piothreshold, qp->pmtu) &&
                    (BIT(ps->opcode & OPMASK) & pio_opmask[ps->opcode >> 5]) &&
                    iowait_sdma_pending(&priv->s_iowait) == 0 &&
                    !sdma_txreq_built(&tx->txreq))
                        return dd->process_pio_send;
                break;
-       }
        default:
                break;
        }
@@ -1739,15 +1740,15 @@ static struct rdma_hw_stats *alloc_hw_stats(struct ib_device *ibdev,
 
 static u64 hfi1_sps_ints(void)
 {
-       unsigned long flags;
+       unsigned long index, flags;
        struct hfi1_devdata *dd;
        u64 sps_ints = 0;
 
-       spin_lock_irqsave(&hfi1_devs_lock, flags);
-       list_for_each_entry(dd, &hfi1_dev_list, list) {
+       xa_lock_irqsave(&hfi1_dev_table, flags);
+       xa_for_each(&hfi1_dev_table, index, dd) {
                sps_ints += get_all_cpu_total(dd->int_counter);
        }
-       spin_unlock_irqrestore(&hfi1_devs_lock, flags);
+       xa_unlock_irqrestore(&hfi1_dev_table, flags);
        return sps_ints;
 }
 
index 62ace0b..7ecb8ed 100644 (file)
@@ -170,6 +170,7 @@ struct hfi1_qp_priv {
        struct tid_flow_state flow_state;
        struct tid_rdma_qp_params tid_rdma;
        struct rvt_qp *owner;
+       u16 s_running_pkt_size;
        u8 hdr_type; /* 9B or 16B */
        struct rvt_sge_state tid_ss;       /* SGE state pointer for 2nd leg */
        atomic_t n_requests;               /* # of TID RDMA requests in the */
index 2b07032..b49e60e 100644 (file)
@@ -162,12 +162,12 @@ static void deallocate_vnic_ctxt(struct hfi1_devdata *dd,
 
 void hfi1_vnic_setup(struct hfi1_devdata *dd)
 {
-       idr_init(&dd->vnic.vesw_idr);
+       xa_init(&dd->vnic.vesws);
 }
 
 void hfi1_vnic_cleanup(struct hfi1_devdata *dd)
 {
-       idr_destroy(&dd->vnic.vesw_idr);
+       WARN_ON(!xa_empty(&dd->vnic.vesws));
 }
 
 #define SUM_GRP_COUNTERS(stats, qstats, x_grp) do {            \
@@ -533,7 +533,7 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
        l4_type = hfi1_16B_get_l4(packet->ebuf);
        if (likely(l4_type == OPA_16B_L4_ETHR)) {
                vesw_id = HFI1_VNIC_GET_VESWID(packet->ebuf);
-               vinfo = idr_find(&dd->vnic.vesw_idr, vesw_id);
+               vinfo = xa_load(&dd->vnic.vesws, vesw_id);
 
                /*
                 * In case of invalid vesw id, count the error on
@@ -541,9 +541,10 @@ void hfi1_vnic_bypass_rcv(struct hfi1_packet *packet)
                 */
                if (unlikely(!vinfo)) {
                        struct hfi1_vnic_vport_info *vinfo_tmp;
-                       int id_tmp = 0;
+                       unsigned long index = 0;
 
-                       vinfo_tmp =  idr_get_next(&dd->vnic.vesw_idr, &id_tmp);
+                       vinfo_tmp = xa_find(&dd->vnic.vesws, &index, ULONG_MAX,
+                                       XA_PRESENT);
                        if (vinfo_tmp) {
                                spin_lock(&vport_cntr_lock);
                                vinfo_tmp->stats[0].netstats.rx_nohandler++;
@@ -597,8 +598,7 @@ static int hfi1_vnic_up(struct hfi1_vnic_vport_info *vinfo)
        if (!vinfo->vesw_id)
                return -EINVAL;
 
-       rc = idr_alloc(&dd->vnic.vesw_idr, vinfo, vinfo->vesw_id,
-                      vinfo->vesw_id + 1, GFP_NOWAIT);
+       rc = xa_insert(&dd->vnic.vesws, vinfo->vesw_id, vinfo, GFP_KERNEL);
        if (rc < 0)
                return rc;
 
@@ -624,7 +624,7 @@ static void hfi1_vnic_down(struct hfi1_vnic_vport_info *vinfo)
        clear_bit(HFI1_VNIC_UP, &vinfo->flags);
        netif_carrier_off(vinfo->netdev);
        netif_tx_disable(vinfo->netdev);
-       idr_remove(&dd->vnic.vesw_idr, vinfo->vesw_id);
+       xa_erase(&dd->vnic.vesws, vinfo->vesw_id);
 
        /* ensure irqs see the change */
        msix_vnic_synchronize_irq(dd);
index e2a7f14..eee5205 100644 (file)
@@ -7,8 +7,8 @@ ccflags-y :=  -I $(srctree)/drivers/net/ethernet/hisilicon/hns3
 obj-$(CONFIG_INFINIBAND_HNS) += hns-roce.o
 hns-roce-objs := hns_roce_main.o hns_roce_cmd.o hns_roce_pd.o \
        hns_roce_ah.o hns_roce_hem.o hns_roce_mr.o hns_roce_qp.o \
-       hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o
+       hns_roce_cq.o hns_roce_alloc.o hns_roce_db.o hns_roce_srq.o hns_roce_restrack.o
 obj-$(CONFIG_INFINIBAND_HNS_HIP06) += hns-roce-hw-v1.o
 hns-roce-hw-v1-objs := hns_roce_hw_v1.o
 obj-$(CONFIG_INFINIBAND_HNS_HIP08) += hns-roce-hw-v2.o
-hns-roce-hw-v2-objs := hns_roce_hw_v2.o
+hns-roce-hw-v2-objs := hns_roce_hw_v2.o hns_roce_hw_v2_dfx.o
index b3c8c45..cdd2ac2 100644 (file)
 #define HNS_ROCE_VLAN_SL_BIT_MASK      7
 #define HNS_ROCE_VLAN_SL_SHIFT         13
 
-struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
-                                struct rdma_ah_attr *ah_attr,
-                                u32 flags,
-                                struct ib_udata *udata)
+int hns_roce_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+                      u32 flags, struct ib_udata *udata)
 {
-       struct hns_roce_dev *hr_dev = to_hr_dev(ibpd->device);
+       struct hns_roce_dev *hr_dev = to_hr_dev(ibah->device);
        const struct ib_gid_attr *gid_attr;
        struct device *dev = hr_dev->dev;
-       struct hns_roce_ah *ah;
+       struct hns_roce_ah *ah = to_hr_ah(ibah);
        u16 vlan_tag = 0xffff;
        const struct ib_global_route *grh = rdma_ah_read_grh(ah_attr);
        bool vlan_en = false;
+       int ret;
 
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
+       gid_attr = ah_attr->grh.sgid_attr;
+       ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag, NULL);
+       if (ret)
+               return ret;
 
        /* Get mac address */
        memcpy(ah->av.mac, ah_attr->roce.dmac, ETH_ALEN);
 
-       gid_attr = ah_attr->grh.sgid_attr;
-       if (is_vlan_dev(gid_attr->ndev)) {
-               vlan_tag = vlan_dev_vlan_id(gid_attr->ndev);
+       if (vlan_tag < VLAN_CFI_MASK) {
                vlan_en = true;
-       }
-
-       if (vlan_tag < 0x1000)
                vlan_tag |= (rdma_ah_get_sl(ah_attr) &
                             HNS_ROCE_VLAN_SL_BIT_MASK) <<
                             HNS_ROCE_VLAN_SL_SHIFT;
+       }
 
-       ah->av.port_pd = cpu_to_be32(to_hr_pd(ibpd)->pdn |
+       ah->av.port_pd = cpu_to_le32(to_hr_pd(ibah->pd)->pdn |
                                     (rdma_ah_get_port_num(ah_attr) <<
                                     HNS_ROCE_PORT_NUM_SHIFT));
        ah->av.gid_index = grh->sgid_index;
@@ -86,7 +82,7 @@ struct ib_ah *hns_roce_create_ah(struct ib_pd *ibpd,
        ah->av.sl_tclass_flowlabel = cpu_to_le32(rdma_ah_get_sl(ah_attr) <<
                                                 HNS_ROCE_SL_SHIFT);
 
-       return &ah->ibah;
+       return 0;
 }
 
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -111,9 +107,7 @@ int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
        return 0;
 }
 
-int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
+void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-       kfree(to_hr_ah(ah));
-
-       return 0;
+       return;
 }
index 059fd1d..2b6ac64 100644 (file)
@@ -53,6 +53,7 @@ enum {
        HNS_ROCE_CMD_QUERY_QPC          = 0x42,
 
        HNS_ROCE_CMD_MODIFY_CQC         = 0x52,
+       HNS_ROCE_CMD_QUERY_CQC          = 0x53,
        /* CQC BT commands */
        HNS_ROCE_CMD_WRITE_CQC_BT0      = 0x10,
        HNS_ROCE_CMD_WRITE_CQC_BT1      = 0x11,
index f4c92a7..8e95a1a 100644 (file)
 #define roce_set_bit(origin, shift, val) \
        roce_set_field((origin), (1ul << (shift)), (shift), (val))
 
-/*
- * roce_hw_index_cmp_lt - Compare two hardware index values in hisilicon
- *                        SOC, check if a is less than b.
- * @a: hardware index value
- * @b: hardware index value
- * @bits: the number of bits of a and b, range: 0~31.
- *
- * Hardware index increases continuously till max value, and then restart
- * from zero, again and again. Because the bits of reg field is often
- * limited, the reg field can only hold the low bits of the hardware index
- * in hisilicon SOC.
- * In some scenes we need to compare two values(a,b) getted from two reg
- * fields in this driver, for example:
- * If a equals 0xfffe, b equals 0x1 and bits equals 16, we think b has
- * incresed from 0xffff to 0x1 and a is less than b.
- * If a equals 0xfffe, b equals 0x0xf001 and bits equals 16, we think a
- * is bigger than b.
- *
- * Return true on a less than b, otherwise false.
- */
-#define roce_hw_index_mask(bits)       ((1ul << (bits)) - 1)
-#define roce_hw_index_shift(bits)      (32 - (bits))
-#define roce_hw_index_cmp_lt(a, b, bits) \
-       ((int)((((a) - (b)) & roce_hw_index_mask(bits)) << \
-               roce_hw_index_shift(bits)) < 0)
-
 #define ROCEE_GLB_CFG_ROCEE_DB_SQ_MODE_S 3
 #define ROCEE_GLB_CFG_ROCEE_DB_OTH_MODE_S 4
 
 #define ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M   \
        (((1UL << 28) - 1) << ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)
 
-#define ROCEE_SDB_PTR_CMP_BITS 28
-
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_S 0
 #define ROCEE_SDB_INV_CNT_SDB_INV_CNT_M   \
        (((1UL << 16) - 1) << ROCEE_SDB_INV_CNT_SDB_INV_CNT_S)
 #define ROCEE_CAEP_AE_MASK_REG                 0x6C8
 #define ROCEE_CAEP_AE_ST_REG                   0x6CC
 
-#define ROCEE_SDB_ISSUE_PTR_REG                        0x758
-#define ROCEE_SDB_SEND_PTR_REG                 0x75C
 #define ROCEE_CAEP_CQE_WCMD_EMPTY              0x850
 #define ROCEE_SCAEP_WR_CQE_CNT                 0x8D0
-#define ROCEE_SDB_INV_CNT_REG                  0x9A4
-#define ROCEE_SDB_RETRY_CNT_REG                        0x9AC
-#define ROCEE_TSP_BP_ST_REG                    0x9EC
 #define ROCEE_ECC_UCERR_ALM0_REG               0xB34
 #define ROCEE_ECC_CERR_ALM0_REG                        0xB40
 
index 1dfe562..9caf350 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <linux/platform_device.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 #include "hns_roce_device.h"
 #include "hns_roce_cmd.h"
 #include "hns_roce_hem.h"
@@ -127,13 +128,9 @@ static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
                goto err_out;
        }
 
-       /* The cq insert radix tree */
-       spin_lock_irq(&cq_table->lock);
-       /* Radix_tree: The associated pointer and long integer key value like */
-       ret = radix_tree_insert(&cq_table->tree, hr_cq->cqn, hr_cq);
-       spin_unlock_irq(&cq_table->lock);
+       ret = xa_err(xa_store(&cq_table->array, hr_cq->cqn, hr_cq, GFP_KERNEL));
        if (ret) {
-               dev_err(dev, "CQ alloc.Failed to radix_tree_insert.\n");
+               dev_err(dev, "CQ alloc failed xa_store.\n");
                goto err_put;
        }
 
@@ -141,7 +138,7 @@ static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
        mailbox = hns_roce_alloc_cmd_mailbox(hr_dev);
        if (IS_ERR(mailbox)) {
                ret = PTR_ERR(mailbox);
-               goto err_radix;
+               goto err_xa;
        }
 
        hr_dev->hw->write_cqc(hr_dev, hr_cq, mailbox->buf, mtts, dma_handle,
@@ -152,7 +149,7 @@ static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
        hns_roce_free_cmd_mailbox(hr_dev, mailbox);
        if (ret) {
                dev_err(dev, "CQ alloc.Failed to cmd mailbox.\n");
-               goto err_radix;
+               goto err_xa;
        }
 
        hr_cq->cons_index = 0;
@@ -164,10 +161,8 @@ static int hns_roce_cq_alloc(struct hns_roce_dev *hr_dev, int nent,
 
        return 0;
 
-err_radix:
-       spin_lock_irq(&cq_table->lock);
-       radix_tree_delete(&cq_table->tree, hr_cq->cqn);
-       spin_unlock_irq(&cq_table->lock);
+err_xa:
+       xa_erase(&cq_table->array, hr_cq->cqn);
 
 err_put:
        hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
@@ -197,6 +192,8 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
                dev_err(dev, "HW2SW_CQ failed (%d) for CQN %06lx\n", ret,
                        hr_cq->cqn);
 
+       xa_erase(&cq_table->array, hr_cq->cqn);
+
        /* Waiting interrupt process procedure carried out */
        synchronize_irq(hr_dev->eq_table.eq[hr_cq->vector].irq);
 
@@ -205,10 +202,6 @@ void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
                complete(&hr_cq->free);
        wait_for_completion(&hr_cq->free);
 
-       spin_lock_irq(&cq_table->lock);
-       radix_tree_delete(&cq_table->tree, hr_cq->cqn);
-       spin_unlock_irq(&cq_table->lock);
-
        hns_roce_table_put(hr_dev, &cq_table->table, hr_cq->cqn);
        hns_roce_bitmap_free(&cq_table->bitmap, hr_cq->cqn, BITMAP_NO_RR);
 }
@@ -309,7 +302,6 @@ static void hns_roce_ib_free_cq_buf(struct hns_roce_dev *hr_dev,
 
 struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
                                    const struct ib_cq_init_attr *attr,
-                                   struct ib_ucontext *context,
                                    struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
@@ -321,6 +313,8 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
        int vector = attr->comp_vector;
        int cq_entries = attr->cqe;
        int ret;
+       struct hns_roce_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct hns_roce_ucontext, ibucontext);
 
        if (cq_entries < 1 || cq_entries > hr_dev->caps.max_cqes) {
                dev_err(dev, "Creat CQ failed. entries=%d, max=%d\n",
@@ -339,7 +333,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
        hr_cq->ib_cq.cqe = cq_entries - 1;
        spin_lock_init(&hr_cq->lock);
 
-       if (context) {
+       if (udata) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
                        dev_err(dev, "Failed to copy_from_udata.\n");
                        ret = -EFAULT;
@@ -357,8 +351,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
 
                if ((hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
                    (udata->outlen >= sizeof(resp))) {
-                       ret = hns_roce_db_map_user(to_hr_ucontext(context),
-                                                  udata, ucmd.db_addr,
+                       ret = hns_roce_db_map_user(context, udata, ucmd.db_addr,
                                                   &hr_cq->db);
                        if (ret) {
                                dev_err(dev, "cq record doorbell map failed!\n");
@@ -369,7 +362,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
                }
 
                /* Get user space parameters */
-               uar = &to_hr_ucontext(context)->uar;
+               uar = &context->uar;
        } else {
                if (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) {
                        ret = hns_roce_alloc_db(hr_dev, &hr_cq->db, 1);
@@ -408,7 +401,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
         * problems if tptr is set to zero here, so we initialze it in user
         * space.
         */
-       if (!context && hr_cq->tptr_addr)
+       if (!udata && hr_cq->tptr_addr)
                *hr_cq->tptr_addr = 0;
 
        /* Get created cq handler and carry out event */
@@ -416,7 +409,7 @@ struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
        hr_cq->event = hns_roce_ib_cq_event;
        hr_cq->cq_depth = cq_entries;
 
-       if (context) {
+       if (udata) {
                resp.cqn = hr_cq->cqn;
                ret = ib_copy_to_udata(udata, &resp, sizeof(resp));
                if (ret)
@@ -429,21 +422,20 @@ err_cqc:
        hns_roce_free_cq(hr_dev, hr_cq);
 
 err_dbmap:
-       if (context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
+       if (udata && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB) &&
            (udata->outlen >= sizeof(resp)))
-               hns_roce_db_unmap_user(to_hr_ucontext(context),
-                                      &hr_cq->db);
+               hns_roce_db_unmap_user(context, &hr_cq->db);
 
 err_mtt:
        hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
-       if (context)
+       if (udata)
                ib_umem_release(hr_cq->umem);
        else
                hns_roce_ib_free_cq_buf(hr_dev, &hr_cq->hr_buf,
                                        hr_cq->ib_cq.cqe);
 
 err_db:
-       if (!context && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
+       if (!udata && (hr_dev->caps.flags & HNS_ROCE_CAP_FLAG_RECORD_DB))
                hns_roce_free_db(hr_dev, &hr_cq->db);
 
 err_cq:
@@ -452,24 +444,27 @@ err_cq:
 }
 EXPORT_SYMBOL_GPL(hns_roce_ib_create_cq);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq)
+int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
        struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
        int ret = 0;
 
        if (hr_dev->hw->destroy_cq) {
-               ret = hr_dev->hw->destroy_cq(ib_cq);
+               ret = hr_dev->hw->destroy_cq(ib_cq, udata);
        } else {
                hns_roce_free_cq(hr_dev, hr_cq);
                hns_roce_mtt_cleanup(hr_dev, &hr_cq->hr_buf.hr_mtt);
 
-               if (ib_cq->uobject) {
+               if (udata) {
                        ib_umem_release(hr_cq->umem);
 
                        if (hr_cq->db_en == 1)
                                hns_roce_db_unmap_user(
-                                       to_hr_ucontext(ib_cq->uobject->context),
+                                       rdma_udata_to_drv_context(
+                                               udata,
+                                               struct hns_roce_ucontext,
+                                               ibucontext),
                                        &hr_cq->db);
                } else {
                        /* Free the buff of stored cq */
@@ -491,8 +486,7 @@ void hns_roce_cq_completion(struct hns_roce_dev *hr_dev, u32 cqn)
        struct device *dev = hr_dev->dev;
        struct hns_roce_cq *cq;
 
-       cq = radix_tree_lookup(&hr_dev->cq_table.tree,
-                              cqn & (hr_dev->caps.num_cqs - 1));
+       cq = xa_load(&hr_dev->cq_table.array, cqn & (hr_dev->caps.num_cqs - 1));
        if (!cq) {
                dev_warn(dev, "Completion event for bogus CQ 0x%08x\n", cqn);
                return;
@@ -509,8 +503,7 @@ void hns_roce_cq_event(struct hns_roce_dev *hr_dev, u32 cqn, int event_type)
        struct device *dev = hr_dev->dev;
        struct hns_roce_cq *cq;
 
-       cq = radix_tree_lookup(&cq_table->tree,
-                              cqn & (hr_dev->caps.num_cqs - 1));
+       cq = xa_load(&cq_table->array, cqn & (hr_dev->caps.num_cqs - 1));
        if (cq)
                atomic_inc(&cq->refcount);
 
@@ -530,8 +523,7 @@ int hns_roce_init_cq_table(struct hns_roce_dev *hr_dev)
 {
        struct hns_roce_cq_table *cq_table = &hr_dev->cq_table;
 
-       spin_lock_init(&cq_table->lock);
-       INIT_RADIX_TREE(&cq_table->tree, GFP_ATOMIC);
+       xa_init(&cq_table->array);
 
        return hns_roce_bitmap_init(&cq_table->bitmap, hr_dev->caps.num_cqs,
                                    hr_dev->caps.num_cqs - 1,
index 9ee86da..563cf39 100644 (file)
@@ -505,7 +505,6 @@ struct hns_roce_uar_table {
 
 struct hns_roce_qp_table {
        struct hns_roce_bitmap          bitmap;
-       spinlock_t                      lock;
        struct hns_roce_hem_table       qp_table;
        struct hns_roce_hem_table       irrl_table;
        struct hns_roce_hem_table       trrl_table;
@@ -515,8 +514,7 @@ struct hns_roce_qp_table {
 
 struct hns_roce_cq_table {
        struct hns_roce_bitmap          bitmap;
-       spinlock_t                      lock;
-       struct radix_tree_root          tree;
+       struct xarray                   array;
        struct hns_roce_hem_table       table;
 };
 
@@ -869,6 +867,11 @@ struct hns_roce_work {
        int sub_type;
 };
 
+struct hns_roce_dfx_hw {
+       int (*query_cqc_info)(struct hns_roce_dev *hr_dev, u32 cqn,
+                             int *buffer);
+};
+
 struct hns_roce_hw {
        int (*reset)(struct hns_roce_dev *hr_dev, bool enable);
        int (*cmq_init)(struct hns_roce_dev *hr_dev);
@@ -907,7 +910,7 @@ struct hns_roce_hw {
        int (*modify_qp)(struct ib_qp *ibqp, const struct ib_qp_attr *attr,
                         int attr_mask, enum ib_qp_state cur_state,
                         enum ib_qp_state new_state);
-       int (*destroy_qp)(struct ib_qp *ibqp);
+       int (*destroy_qp)(struct ib_qp *ibqp, struct ib_udata *udata);
        int (*qp_flow_control_init)(struct hns_roce_dev *hr_dev,
                         struct hns_roce_qp *hr_qp);
        int (*post_send)(struct ib_qp *ibqp, const struct ib_send_wr *wr,
@@ -916,8 +919,9 @@ struct hns_roce_hw {
                         const struct ib_recv_wr **bad_recv_wr);
        int (*req_notify_cq)(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
        int (*poll_cq)(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
-       int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr);
-       int (*destroy_cq)(struct ib_cq *ibcq);
+       int (*dereg_mr)(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr,
+                       struct ib_udata *udata);
+       int (*destroy_cq)(struct ib_cq *ibcq, struct ib_udata *udata);
        int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
        int (*init_eq)(struct hns_roce_dev *hr_dev);
        void (*cleanup_eq)(struct hns_roce_dev *hr_dev);
@@ -956,7 +960,7 @@ struct hns_roce_dev {
        int                     irq[HNS_ROCE_MAX_IRQ_NUM];
        u8 __iomem              *reg_base;
        struct hns_roce_caps    caps;
-       struct radix_tree_root  qp_table_tree;
+       struct xarray           qp_table_xa;
 
        unsigned char   dev_addr[HNS_ROCE_MAX_PORTS][MAC_ADDR_OCTET_NUM];
        u64                     sys_image_guid;
@@ -985,6 +989,7 @@ struct hns_roce_dev {
        const struct hns_roce_hw *hw;
        void                    *priv;
        struct workqueue_struct *irq_workq;
+       const struct hns_roce_dfx_hw *dfx;
 };
 
 static inline struct hns_roce_dev *to_hr_dev(struct ib_device *ib_dev)
@@ -1046,8 +1051,7 @@ static inline void hns_roce_write64_k(__le32 val[2], void __iomem *dest)
 static inline struct hns_roce_qp
        *__hns_roce_qp_lookup(struct hns_roce_dev *hr_dev, u32 qpn)
 {
-       return radix_tree_lookup(&hr_dev->qp_table_tree,
-                                qpn & (hr_dev->caps.num_qps - 1));
+       return xa_load(&hr_dev->qp_table_xa, qpn & (hr_dev->caps.num_qps - 1));
 }
 
 static inline void *hns_roce_buf_offset(struct hns_roce_buf *buf, int offset)
@@ -1107,16 +1111,13 @@ void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap,
                                unsigned long obj, int cnt,
                                int rr);
 
-struct ib_ah *hns_roce_create_ah(struct ib_pd *pd,
-                                struct rdma_ah_attr *ah_attr,
-                                u32 flags,
-                                struct ib_udata *udata);
+int hns_roce_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+                      u32 flags, struct ib_udata *udata);
 int hns_roce_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
+void hns_roce_destroy_ah(struct ib_ah *ah, u32 flags);
 
-int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                     struct ib_udata *udata);
-void hns_roce_dealloc_pd(struct ib_pd *pd);
+int hns_roce_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 struct ib_mr *hns_roce_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *hns_roce_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1126,10 +1127,10 @@ int hns_roce_rereg_user_mr(struct ib_mr *mr, int flags, u64 start, u64 length,
                           u64 virt_addr, int mr_access_flags, struct ib_pd *pd,
                           struct ib_udata *udata);
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                               u32 max_num_sg);
+                               u32 max_num_sg, struct ib_udata *udata);
 int hns_roce_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                       unsigned int *sg_offset);
-int hns_roce_dereg_mr(struct ib_mr *ibmr);
+int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int hns_roce_hw2sw_mpt(struct hns_roce_dev *hr_dev,
                       struct hns_roce_cmd_mailbox *mailbox,
                       unsigned long mpt_index);
@@ -1147,13 +1148,13 @@ int hns_roce_buf_alloc(struct hns_roce_dev *hr_dev, u32 size, u32 max_direct,
 int hns_roce_ib_umem_write_mtt(struct hns_roce_dev *hr_dev,
                               struct hns_roce_mtt *mtt, struct ib_umem *umem);
 
-struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
-                                  struct ib_srq_init_attr *srq_init_attr,
-                                  struct ib_udata *udata);
+int hns_roce_create_srq(struct ib_srq *srq,
+                       struct ib_srq_init_attr *srq_init_attr,
+                       struct ib_udata *udata);
 int hns_roce_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr,
                        enum ib_srq_attr_mask srq_attr_mask,
                        struct ib_udata *udata);
-int hns_roce_destroy_srq(struct ib_srq *ibsrq);
+void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 
 struct ib_qp *hns_roce_create_qp(struct ib_pd *ib_pd,
                                 struct ib_qp_init_attr *init_attr,
@@ -1179,10 +1180,9 @@ int to_hr_qp_type(int qp_type);
 
 struct ib_cq *hns_roce_ib_create_cq(struct ib_device *ib_dev,
                                    const struct ib_cq_init_attr *attr,
-                                   struct ib_ucontext *context,
                                    struct ib_udata *udata);
 
-int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq);
+int hns_roce_ib_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata);
 void hns_roce_free_cq(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq);
 
 int hns_roce_db_map_user(struct hns_roce_ucontext *context,
@@ -1202,4 +1202,6 @@ int hns_get_gid_index(struct hns_roce_dev *hr_dev, u8 port, int gid_index);
 int hns_roce_init(struct hns_roce_dev *hr_dev);
 void hns_roce_exit(struct hns_roce_dev *hr_dev);
 
+int hns_roce_fill_res_entry(struct sk_buff *msg,
+                           struct rdma_restrack_entry *res);
 #endif /* _HNS_ROCE_DEVICE_H */
index c8555f7..4c5d0f1 100644 (file)
@@ -730,7 +730,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
        /* Reserved cq for loop qp */
        cq_init_attr.cqe                = HNS_ROCE_MIN_WQE_NUM * 2;
        cq_init_attr.comp_vector        = 0;
-       cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL, NULL);
+       cq = hns_roce_ib_create_cq(&hr_dev->ib_dev, &cq_init_attr, NULL);
        if (IS_ERR(cq)) {
                dev_err(dev, "Create cq for reserved loop qp failed!");
                return -ENOMEM;
@@ -749,7 +749,7 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
                goto alloc_mem_failed;
 
        pd->device  = ibdev;
-       ret = hns_roce_alloc_pd(pd, NULL, NULL);
+       ret = hns_roce_alloc_pd(pd, NULL);
        if (ret)
                goto alloc_pd_failed;
 
@@ -855,17 +855,17 @@ static int hns_roce_v1_rsv_lp_qp(struct hns_roce_dev *hr_dev)
 create_lp_qp_failed:
        for (i -= 1; i >= 0; i--) {
                hr_qp = free_mr->mr_free_qp[i];
-               if (hns_roce_v1_destroy_qp(&hr_qp->ibqp))
+               if (hns_roce_v1_destroy_qp(&hr_qp->ibqp, NULL))
                        dev_err(dev, "Destroy qp %d for mr free failed!\n", i);
        }
 
-       hns_roce_dealloc_pd(pd);
+       hns_roce_dealloc_pd(pd, NULL);
 
 alloc_pd_failed:
        kfree(pd);
 
 alloc_mem_failed:
-       if (hns_roce_ib_destroy_cq(cq))
+       if (hns_roce_ib_destroy_cq(cq, NULL))
                dev_err(dev, "Destroy cq for create_lp_qp failed!\n");
 
        return ret;
@@ -888,17 +888,17 @@ static void hns_roce_v1_release_lp_qp(struct hns_roce_dev *hr_dev)
                if (!hr_qp)
                        continue;
 
-               ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp);
+               ret = hns_roce_v1_destroy_qp(&hr_qp->ibqp, NULL);
                if (ret)
                        dev_err(dev, "Destroy qp %d for mr free failed(%d)!\n",
                                i, ret);
        }
 
-       ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq);
+       ret = hns_roce_ib_destroy_cq(&free_mr->mr_free_cq->ib_cq, NULL);
        if (ret)
                dev_err(dev, "Destroy cq for mr_free failed(%d)!\n", ret);
 
-       hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd);
+       hns_roce_dealloc_pd(&free_mr->mr_free_pd->ibpd, NULL);
 }
 
 static int hns_roce_db_init(struct hns_roce_dev *hr_dev)
@@ -1096,7 +1096,7 @@ free_work:
 }
 
 static int hns_roce_v1_dereg_mr(struct hns_roce_dev *hr_dev,
-                               struct hns_roce_mr *mr)
+                               struct hns_roce_mr *mr, struct ib_udata *udata)
 {
        struct device *dev = &hr_dev->pdev->dev;
        struct hns_roce_mr_free_work *mr_work;
@@ -1511,38 +1511,6 @@ static int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool dereset)
        return ret;
 }
 
-static int hns_roce_des_qp_init(struct hns_roce_dev *hr_dev)
-{
-       struct device *dev = &hr_dev->pdev->dev;
-       struct hns_roce_v1_priv *priv;
-       struct hns_roce_des_qp *des_qp;
-
-       priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-       des_qp = &priv->des_qp;
-
-       des_qp->requeue_flag = 1;
-       des_qp->qp_wq = create_singlethread_workqueue("hns_roce_destroy_qp");
-       if (!des_qp->qp_wq) {
-               dev_err(dev, "Create destroy qp workqueue failed!\n");
-               return -ENOMEM;
-       }
-
-       return 0;
-}
-
-static void hns_roce_des_qp_free(struct hns_roce_dev *hr_dev)
-{
-       struct hns_roce_v1_priv *priv;
-       struct hns_roce_des_qp *des_qp;
-
-       priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-       des_qp = &priv->des_qp;
-
-       des_qp->requeue_flag = 0;
-       flush_workqueue(des_qp->qp_wq);
-       destroy_workqueue(des_qp->qp_wq);
-}
-
 static int hns_roce_v1_profile(struct hns_roce_dev *hr_dev)
 {
        int i = 0;
@@ -1661,12 +1629,6 @@ static int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
                goto error_failed_tptr_init;
        }
 
-       ret = hns_roce_des_qp_init(hr_dev);
-       if (ret) {
-               dev_err(dev, "des qp init failed!\n");
-               goto error_failed_des_qp_init;
-       }
-
        ret = hns_roce_free_mr_init(hr_dev);
        if (ret) {
                dev_err(dev, "free mr init failed!\n");
@@ -1678,9 +1640,6 @@ static int hns_roce_v1_init(struct hns_roce_dev *hr_dev)
        return 0;
 
 error_failed_free_mr_init:
-       hns_roce_des_qp_free(hr_dev);
-
-error_failed_des_qp_init:
        hns_roce_tptr_free(hr_dev);
 
 error_failed_tptr_init:
@@ -1698,7 +1657,6 @@ static void hns_roce_v1_exit(struct hns_roce_dev *hr_dev)
 {
        hns_roce_port_enable(hr_dev, HNS_ROCE_PORT_DOWN);
        hns_roce_free_mr_free(hr_dev);
-       hns_roce_des_qp_free(hr_dev);
        hns_roce_tptr_free(hr_dev);
        hns_roce_bt_free(hr_dev);
        hns_roce_raq_free(hr_dev);
@@ -3642,307 +3600,22 @@ static int hns_roce_v1_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                hns_roce_v1_q_qp(ibqp, qp_attr, qp_attr_mask, qp_init_attr);
 }
 
-static void hns_roce_check_sdb_status(struct hns_roce_dev *hr_dev,
-                                     u32 *old_send, u32 *old_retry,
-                                     u32 *tsp_st, u32 *success_flags)
-{
-       __le32 *old_send_tmp, *old_retry_tmp;
-       u32 sdb_retry_cnt;
-       u32 sdb_send_ptr;
-       u32 cur_cnt, old_cnt;
-       __le32 tmp, tmp1;
-       u32 send_ptr;
-
-       sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-       sdb_retry_cnt = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG);
-       tmp = cpu_to_le32(sdb_send_ptr);
-       tmp1 = cpu_to_le32(sdb_retry_cnt);
-       cur_cnt = roce_get_field(tmp, ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-                 roce_get_field(tmp1, ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-                                ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-
-       old_send_tmp = (__le32 *)old_send;
-       old_retry_tmp = (__le32 *)old_retry;
-       if (!roce_get_bit(*tsp_st, ROCEE_CNT_CLR_CE_CNT_CLR_CE_S)) {
-               old_cnt = roce_get_field(*old_send_tmp,
-                                        ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                        ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-                         roce_get_field(*old_retry_tmp,
-                                        ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-                                        ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-               if (cur_cnt - old_cnt > SDB_ST_CMP_VAL)
-                       *success_flags = 1;
-       } else {
-               old_cnt = roce_get_field(*old_send_tmp,
-                                        ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                        ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S);
-               if (cur_cnt - old_cnt > SDB_ST_CMP_VAL) {
-                       *success_flags = 1;
-               } else {
-                       send_ptr = roce_get_field(*old_send_tmp,
-                                           ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                           ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S) +
-                                  roce_get_field(tmp1,
-                                           ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_M,
-                                           ROCEE_SDB_RETRY_CNT_SDB_RETRY_CT_S);
-                       roce_set_field(*old_send_tmp,
-                                      ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                      ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S,
-                                      send_ptr);
-               }
-       }
-}
-
-static int check_qp_db_process_status(struct hns_roce_dev *hr_dev,
-                                     struct hns_roce_qp *hr_qp,
-                                     u32 sdb_issue_ptr,
-                                     u32 *sdb_inv_cnt,
-                                     u32 *wait_stage)
-{
-       struct device *dev = &hr_dev->pdev->dev;
-       u32 sdb_send_ptr, old_send;
-       __le32 sdb_issue_ptr_tmp;
-       __le32 sdb_send_ptr_tmp;
-       u32 success_flags = 0;
-       unsigned long end;
-       u32 old_retry;
-       u32 inv_cnt;
-       u32 tsp_st;
-       __le32 tmp;
-
-       if (*wait_stage > HNS_ROCE_V1_DB_STAGE2 ||
-           *wait_stage < HNS_ROCE_V1_DB_STAGE1) {
-               dev_err(dev, "QP(0x%lx) db status wait stage(%d) error!\n",
-                       hr_qp->qpn, *wait_stage);
-               return -EINVAL;
-       }
-
-       /* Calculate the total timeout for the entire verification process */
-       end = msecs_to_jiffies(HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS) + jiffies;
-
-       if (*wait_stage == HNS_ROCE_V1_DB_STAGE1) {
-               /* Query db process status, until hw process completely */
-               sdb_send_ptr = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-               while (roce_hw_index_cmp_lt(sdb_send_ptr, sdb_issue_ptr,
-                                           ROCEE_SDB_PTR_CMP_BITS)) {
-                       if (!time_before(jiffies, end)) {
-                               dev_dbg(dev, "QP(0x%lx) db process stage1 timeout. issue 0x%x send 0x%x.\n",
-                                       hr_qp->qpn, sdb_issue_ptr,
-                                       sdb_send_ptr);
-                               return 0;
-                       }
-
-                       msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-                       sdb_send_ptr = roce_read(hr_dev,
-                                                ROCEE_SDB_SEND_PTR_REG);
-               }
-
-               sdb_send_ptr_tmp = cpu_to_le32(sdb_send_ptr);
-               sdb_issue_ptr_tmp = cpu_to_le32(sdb_issue_ptr);
-               if (roce_get_field(sdb_issue_ptr_tmp,
-                                  ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_M,
-                                  ROCEE_SDB_ISSUE_PTR_SDB_ISSUE_PTR_S) ==
-                   roce_get_field(sdb_send_ptr_tmp,
-                                  ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_M,
-                                  ROCEE_SDB_SEND_PTR_SDB_SEND_PTR_S)) {
-                       old_send = roce_read(hr_dev, ROCEE_SDB_SEND_PTR_REG);
-                       old_retry = roce_read(hr_dev, ROCEE_SDB_RETRY_CNT_REG);
-
-                       do {
-                               tsp_st = roce_read(hr_dev, ROCEE_TSP_BP_ST_REG);
-                               tmp = cpu_to_le32(tsp_st);
-                               if (roce_get_bit(tmp,
-                                       ROCEE_TSP_BP_ST_QH_FIFO_ENTRY_S) == 1) {
-                                       *wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
-                                       return 0;
-                               }
-
-                               if (!time_before(jiffies, end)) {
-                                       dev_dbg(dev, "QP(0x%lx) db process stage1 timeout when send ptr equals issue ptr.\n"
-                                                    "issue 0x%x send 0x%x.\n",
-                                               hr_qp->qpn,
-                                               le32_to_cpu(sdb_issue_ptr_tmp),
-                                               le32_to_cpu(sdb_send_ptr_tmp));
-                                       return 0;
-                               }
-
-                               msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-
-                               hns_roce_check_sdb_status(hr_dev, &old_send,
-                                                         &old_retry, &tsp_st,
-                                                         &success_flags);
-                       } while (!success_flags);
-               }
-
-               *wait_stage = HNS_ROCE_V1_DB_STAGE2;
-
-               /* Get list pointer */
-               *sdb_inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-               dev_dbg(dev, "QP(0x%lx) db process stage2. inv cnt = 0x%x.\n",
-                       hr_qp->qpn, *sdb_inv_cnt);
-       }
-
-       if (*wait_stage == HNS_ROCE_V1_DB_STAGE2) {
-               /* Query db's list status, until hw reversal */
-               inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-               while (roce_hw_index_cmp_lt(inv_cnt,
-                                           *sdb_inv_cnt + SDB_INV_CNT_OFFSET,
-                                           ROCEE_SDB_CNT_CMP_BITS)) {
-                       if (!time_before(jiffies, end)) {
-                               dev_dbg(dev, "QP(0x%lx) db process stage2 timeout. inv cnt 0x%x.\n",
-                                       hr_qp->qpn, inv_cnt);
-                               return 0;
-                       }
-
-                       msleep(HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS);
-                       inv_cnt = roce_read(hr_dev, ROCEE_SDB_INV_CNT_REG);
-               }
-
-               *wait_stage = HNS_ROCE_V1_DB_WAIT_OK;
-       }
-
-       return 0;
-}
-
-static int check_qp_reset_state(struct hns_roce_dev *hr_dev,
-                               struct hns_roce_qp *hr_qp,
-                               struct hns_roce_qp_work *qp_work_entry,
-                               int *is_timeout)
-{
-       struct device *dev = &hr_dev->pdev->dev;
-       u32 sdb_issue_ptr;
-       int ret;
-
-       if (hr_qp->state != IB_QPS_RESET) {
-               /* Set qp to ERR, waiting for hw complete processing all dbs */
-               ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-                                           IB_QPS_ERR);
-               if (ret) {
-                       dev_err(dev, "Modify QP(0x%lx) to ERR failed!\n",
-                               hr_qp->qpn);
-                       return ret;
-               }
-
-               /* Record issued doorbell */
-               sdb_issue_ptr = roce_read(hr_dev, ROCEE_SDB_ISSUE_PTR_REG);
-               qp_work_entry->sdb_issue_ptr = sdb_issue_ptr;
-               qp_work_entry->db_wait_stage = HNS_ROCE_V1_DB_STAGE1;
-
-               /* Query db process status, until hw process completely */
-               ret = check_qp_db_process_status(hr_dev, hr_qp, sdb_issue_ptr,
-                                                &qp_work_entry->sdb_inv_cnt,
-                                                &qp_work_entry->db_wait_stage);
-               if (ret) {
-                       dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
-                               hr_qp->qpn);
-                       return ret;
-               }
-
-               if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK) {
-                       qp_work_entry->sche_cnt = 0;
-                       *is_timeout = 1;
-                       return 0;
-               }
-
-               /* Modify qp to reset before destroying qp */
-               ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-                                           IB_QPS_RESET);
-               if (ret) {
-                       dev_err(dev, "Modify QP(0x%lx) to RST failed!\n",
-                               hr_qp->qpn);
-                       return ret;
-               }
-       }
-
-       return 0;
-}
-
-static void hns_roce_v1_destroy_qp_work_fn(struct work_struct *work)
-{
-       struct hns_roce_qp_work *qp_work_entry;
-       struct hns_roce_v1_priv *priv;
-       struct hns_roce_dev *hr_dev;
-       struct hns_roce_qp *hr_qp;
-       struct device *dev;
-       unsigned long qpn;
-       int ret;
-
-       qp_work_entry = container_of(work, struct hns_roce_qp_work, work);
-       hr_dev = to_hr_dev(qp_work_entry->ib_dev);
-       dev = &hr_dev->pdev->dev;
-       priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-       hr_qp = qp_work_entry->qp;
-       qpn = hr_qp->qpn;
-
-       dev_dbg(dev, "Schedule destroy QP(0x%lx) work.\n", qpn);
-
-       qp_work_entry->sche_cnt++;
-
-       /* Query db process status, until hw process completely */
-       ret = check_qp_db_process_status(hr_dev, hr_qp,
-                                        qp_work_entry->sdb_issue_ptr,
-                                        &qp_work_entry->sdb_inv_cnt,
-                                        &qp_work_entry->db_wait_stage);
-       if (ret) {
-               dev_err(dev, "Check QP(0x%lx) db process status failed!\n",
-                       qpn);
-               return;
-       }
-
-       if (qp_work_entry->db_wait_stage != HNS_ROCE_V1_DB_WAIT_OK &&
-           priv->des_qp.requeue_flag) {
-               queue_work(priv->des_qp.qp_wq, work);
-               return;
-       }
-
-       /* Modify qp to reset before destroying qp */
-       ret = hns_roce_v1_modify_qp(&hr_qp->ibqp, NULL, 0, hr_qp->state,
-                                   IB_QPS_RESET);
-       if (ret) {
-               dev_err(dev, "Modify QP(0x%lx) to RST failed!\n", qpn);
-               return;
-       }
-
-       hns_roce_qp_remove(hr_dev, hr_qp);
-       hns_roce_qp_free(hr_dev, hr_qp);
-
-       if (hr_qp->ibqp.qp_type == IB_QPT_RC) {
-               /* RC QP, release QPN */
-               hns_roce_release_range_qp(hr_dev, qpn, 1);
-               kfree(hr_qp);
-       } else
-               kfree(hr_to_hr_sqp(hr_qp));
-
-       kfree(qp_work_entry);
-
-       dev_dbg(dev, "Accomplished destroy QP(0x%lx) work.\n", qpn);
-}
-
-int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
        struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
-       struct device *dev = &hr_dev->pdev->dev;
-       struct hns_roce_qp_work qp_work_entry;
-       struct hns_roce_qp_work *qp_work;
-       struct hns_roce_v1_priv *priv;
        struct hns_roce_cq *send_cq, *recv_cq;
-       bool is_user = ibqp->uobject;
-       int is_timeout = 0;
        int ret;
 
-       ret = check_qp_reset_state(hr_dev, hr_qp, &qp_work_entry, &is_timeout);
-       if (ret) {
-               dev_err(dev, "QP reset state check failed(%d)!\n", ret);
+       ret = hns_roce_v1_modify_qp(ibqp, NULL, 0, hr_qp->state, IB_QPS_RESET);
+       if (ret)
                return ret;
-       }
 
        send_cq = to_hr_cq(hr_qp->ibqp.send_cq);
        recv_cq = to_hr_cq(hr_qp->ibqp.recv_cq);
 
        hns_roce_lock_cqs(send_cq, recv_cq);
-       if (!is_user) {
+       if (!udata) {
                __hns_roce_v1_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
                                       to_hr_srq(hr_qp->ibqp.srq) : NULL);
                if (send_cq != recv_cq)
@@ -3950,18 +3623,16 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
        }
        hns_roce_unlock_cqs(send_cq, recv_cq);
 
-       if (!is_timeout) {
-               hns_roce_qp_remove(hr_dev, hr_qp);
-               hns_roce_qp_free(hr_dev, hr_qp);
+       hns_roce_qp_remove(hr_dev, hr_qp);
+       hns_roce_qp_free(hr_dev, hr_qp);
 
-               /* RC QP, release QPN */
-               if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-                       hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
-       }
+       /* RC QP, release QPN */
+       if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+               hns_roce_release_range_qp(hr_dev, hr_qp->qpn, 1);
 
        hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
 
-       if (is_user)
+       if (udata)
                ib_umem_release(hr_qp->umem);
        else {
                kfree(hr_qp->sq.wrid);
@@ -3970,33 +3641,14 @@ int hns_roce_v1_destroy_qp(struct ib_qp *ibqp)
                hns_roce_buf_free(hr_dev, hr_qp->buff_size, &hr_qp->hr_buf);
        }
 
-       if (!is_timeout) {
-               if (hr_qp->ibqp.qp_type == IB_QPT_RC)
-                       kfree(hr_qp);
-               else
-                       kfree(hr_to_hr_sqp(hr_qp));
-       } else {
-               qp_work = kzalloc(sizeof(*qp_work), GFP_KERNEL);
-               if (!qp_work)
-                       return -ENOMEM;
-
-               INIT_WORK(&qp_work->work, hns_roce_v1_destroy_qp_work_fn);
-               qp_work->ib_dev = &hr_dev->ib_dev;
-               qp_work->qp             = hr_qp;
-               qp_work->db_wait_stage  = qp_work_entry.db_wait_stage;
-               qp_work->sdb_issue_ptr  = qp_work_entry.sdb_issue_ptr;
-               qp_work->sdb_inv_cnt    = qp_work_entry.sdb_inv_cnt;
-               qp_work->sche_cnt       = qp_work_entry.sche_cnt;
-
-               priv = (struct hns_roce_v1_priv *)hr_dev->priv;
-               queue_work(priv->des_qp.qp_wq, &qp_work->work);
-               dev_dbg(dev, "Begin destroy QP(0x%lx) work.\n", hr_qp->qpn);
-       }
-
+       if (hr_qp->ibqp.qp_type == IB_QPT_RC)
+               kfree(hr_qp);
+       else
+               kfree(hr_to_hr_sqp(hr_qp));
        return 0;
 }
 
-static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq)
+static int hns_roce_v1_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibcq->device);
        struct hns_roce_cq *hr_cq = to_hr_cq(ibcq);
index 6644014..52307b2 100644 (file)
 #define HNS_ROCE_V1_EXT_ODB_ALFUL      \
        (HNS_ROCE_V1_EXT_ODB_DEPTH - HNS_ROCE_V1_DB_RSVD)
 
-#define HNS_ROCE_V1_DB_WAIT_OK                         0
-#define HNS_ROCE_V1_DB_STAGE1                          1
-#define HNS_ROCE_V1_DB_STAGE2                          2
-#define HNS_ROCE_V1_CHECK_DB_TIMEOUT_MSECS             10000
-#define HNS_ROCE_V1_CHECK_DB_SLEEP_MSECS               20
 #define HNS_ROCE_V1_FREE_MR_TIMEOUT_MSECS              50000
 #define HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS       10000
 #define HNS_ROCE_V1_FREE_MR_WAIT_VALUE                 5
 #define SQ_PSN_SHIFT                                   8
 #define QKEY_VAL                                       0x80010000
 #define SDB_INV_CNT_OFFSET                             8
-#define SDB_ST_CMP_VAL                                 8
 
 #define HNS_ROCE_CEQ_DEFAULT_INTERVAL                  0x10
 #define HNS_ROCE_CEQ_DEFAULT_BURST_NUM                 0x10
@@ -1068,11 +1062,6 @@ struct hns_roce_qp_work {
        u32     sche_cnt;
 };
 
-struct hns_roce_des_qp {
-       struct workqueue_struct *qp_wq;
-       int     requeue_flag;
-};
-
 struct hns_roce_mr_free_work {
        struct  work_struct work;
        struct  ib_device *ib_dev;
@@ -1100,12 +1089,11 @@ struct hns_roce_v1_priv {
        struct hns_roce_raq_table raq_table;
        struct hns_roce_bt_table  bt_table;
        struct hns_roce_tptr_table tptr_table;
-       struct hns_roce_des_qp des_qp;
        struct hns_roce_free_mr free_mr;
 };
 
 int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool dereset);
 int hns_roce_v1_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
-int hns_roce_v1_destroy_qp(struct ib_qp *ibqp);
+int hns_roce_v1_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 
 #endif
index 1c54390..b5392cb 100644 (file)
@@ -37,7 +37,9 @@
 #include <linux/types.h>
 #include <net/addrconf.h>
 #include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
 #include <rdma/ib_umem.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "hnae3.h"
 #include "hns_roce_common.h"
@@ -1086,7 +1088,7 @@ static int __hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
        return ret;
 }
 
-int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
+static int hns_roce_cmq_send(struct hns_roce_dev *hr_dev,
                             struct hns_roce_cmq_desc *desc, int num)
 {
        int retval;
@@ -1559,7 +1561,7 @@ static int hns_roce_v2_profile(struct hns_roce_dev *hr_dev)
        caps->qpc_hop_num       = HNS_ROCE_CONTEXT_HOP_NUM;
        caps->srqc_ba_pg_sz     = 0;
        caps->srqc_buf_pg_sz    = 0;
-       caps->srqc_hop_num      = HNS_ROCE_HOP_NUM_0;
+       caps->srqc_hop_num      = HNS_ROCE_CONTEXT_HOP_NUM;
        caps->cqc_ba_pg_sz      = 0;
        caps->cqc_buf_pg_sz     = 0;
        caps->cqc_hop_num       = HNS_ROCE_CONTEXT_HOP_NUM;
@@ -2150,7 +2152,7 @@ static int hns_roce_v2_write_mtpt(void *mb_buf, struct hns_roce_mr *mr,
                       V2_MPT_BYTE_4_PD_S, mr->pd);
 
        roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_RA_EN_S, 0);
-       roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 1);
+       roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_R_INV_EN_S, 0);
        roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_L_INV_EN_S, 1);
        roce_set_bit(mpt_entry->byte_8_mw_cnt_en, V2_MPT_BYTE_8_BIND_EN_S,
                     (mr->access & IB_ACCESS_MW_BIND ? 1 : 0));
@@ -3171,12 +3173,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
        roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CNP_TX_FLAG_S, 0);
        roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_CE_FLAG_S, 0);
 
-       if (attr_mask & IB_QP_QKEY) {
-               context->qkey_xrcd = attr->qkey;
-               qpc_mask->qkey_xrcd = 0;
-               hr_qp->qkey = attr->qkey;
-       }
-
        if (hr_qp->rdb_en) {
                roce_set_bit(context->byte_68_rq_db,
                             V2_QPC_BYTE_68_RQ_RECORD_EN_S, 1);
@@ -3388,7 +3384,6 @@ static void modify_qp_reset_to_init(struct ib_qp *ibqp,
                     0);
 
        hr_qp->access_flags = attr->qp_access_flags;
-       hr_qp->pkey_index = attr->pkey_index;
        roce_set_field(context->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
                       V2_QPC_BYTE_252_TX_CQN_S, to_hr_cq(ibqp->send_cq)->cqn);
        roce_set_field(qpc_mask->byte_252_err_txcqn, V2_QPC_BYTE_252_TX_CQN_M,
@@ -3512,11 +3507,6 @@ static void modify_qp_init_to_init(struct ib_qp *ibqp,
                               V2_QPC_BYTE_76_SRQN_M, V2_QPC_BYTE_76_SRQN_S, 0);
        }
 
-       if (attr_mask & IB_QP_QKEY) {
-               context->qkey_xrcd = attr->qkey;
-               qpc_mask->qkey_xrcd = 0;
-       }
-
        roce_set_field(context->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
                       V2_QPC_BYTE_4_SQPN_S, hr_qp->qpn);
        roce_set_field(qpc_mask->byte_4_sqpn_tst, V2_QPC_BYTE_4_SQPN_M,
@@ -3636,13 +3626,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
                       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_M,
                       V2_QPC_BYTE_16_WQE_SGE_BUF_PG_SZ_S, 0);
 
-       roce_set_field(context->byte_80_rnr_rx_cqn,
-                      V2_QPC_BYTE_80_MIN_RNR_TIME_M,
-                      V2_QPC_BYTE_80_MIN_RNR_TIME_S, attr->min_rnr_timer);
-       roce_set_field(qpc_mask->byte_80_rnr_rx_cqn,
-                      V2_QPC_BYTE_80_MIN_RNR_TIME_M,
-                      V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0);
-
        page_size = 1 << (hr_dev->caps.mtt_buf_pg_sz + PAGE_SHIFT);
        context->rq_cur_blk_addr = (u32)(mtts[hr_qp->rq.offset / page_size]
                                    >> PAGE_ADDR_SHIFT);
@@ -3670,13 +3653,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_M,
                       V2_QPC_BYTE_104_RQ_NXT_BLK_ADDR_S, 0);
 
-       roce_set_field(context->byte_108_rx_reqepsn,
-                      V2_QPC_BYTE_108_RX_REQ_EPSN_M,
-                      V2_QPC_BYTE_108_RX_REQ_EPSN_S, attr->rq_psn);
-       roce_set_field(qpc_mask->byte_108_rx_reqepsn,
-                      V2_QPC_BYTE_108_RX_REQ_EPSN_M,
-                      V2_QPC_BYTE_108_RX_REQ_EPSN_S, 0);
-
        roce_set_field(context->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
                       V2_QPC_BYTE_132_TRRL_BA_S, dma_handle_3 >> 4);
        roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_BA_M,
@@ -3715,15 +3691,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
                roce_set_bit(qpc_mask->byte_28_at_fl, V2_QPC_BYTE_28_LBI_S, 0);
        }
 
-       if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) &&
-            attr->max_dest_rd_atomic) {
-               roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-                              V2_QPC_BYTE_140_RR_MAX_S,
-                              fls(attr->max_dest_rd_atomic - 1));
-               roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
-                              V2_QPC_BYTE_140_RR_MAX_S, 0);
-       }
-
        if (attr_mask & IB_QP_DEST_QPN) {
                roce_set_field(context->byte_56_dqpn_err, V2_QPC_BYTE_56_DQPN_M,
                               V2_QPC_BYTE_56_DQPN_S, attr->dest_qp_num);
@@ -3784,11 +3751,6 @@ static int modify_qp_init_to_rtr(struct ib_qp *ibqp,
        context->rq_rnr_timer = 0;
        qpc_mask->rq_rnr_timer = 0;
 
-       roce_set_field(context->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-                      V2_QPC_BYTE_152_RAQ_PSN_S, attr->rq_psn - 1);
-       roce_set_field(qpc_mask->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
-                      V2_QPC_BYTE_152_RAQ_PSN_S, 0);
-
        roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_HEAD_MAX_M,
                       V2_QPC_BYTE_132_TRRL_HEAD_MAX_S, 0);
        roce_set_field(qpc_mask->byte_132_trrl, V2_QPC_BYTE_132_TRRL_TAIL_MAX_M,
@@ -3886,13 +3848,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
                       V2_QPC_BYTE_240_RX_ACK_MSN_M,
                       V2_QPC_BYTE_240_RX_ACK_MSN_S, 0);
 
-       roce_set_field(context->byte_244_rnr_rxack,
-                      V2_QPC_BYTE_244_RX_ACK_EPSN_M,
-                      V2_QPC_BYTE_244_RX_ACK_EPSN_S, attr->sq_psn);
-       roce_set_field(qpc_mask->byte_244_rnr_rxack,
-                      V2_QPC_BYTE_244_RX_ACK_EPSN_M,
-                      V2_QPC_BYTE_244_RX_ACK_EPSN_S, 0);
-
        roce_set_field(qpc_mask->byte_248_ack_psn,
                       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_M,
                       V2_QPC_BYTE_248_ACK_LAST_OPTYPE_S, 0);
@@ -3906,27 +3861,6 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
                       V2_QPC_BYTE_240_IRRL_TAIL_REAL_M,
                       V2_QPC_BYTE_240_IRRL_TAIL_REAL_S, 0);
 
-       roce_set_field(context->byte_220_retry_psn_msn,
-                      V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
-                      V2_QPC_BYTE_220_RETRY_MSG_PSN_S, attr->sq_psn);
-       roce_set_field(qpc_mask->byte_220_retry_psn_msn,
-                      V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
-                      V2_QPC_BYTE_220_RETRY_MSG_PSN_S, 0);
-
-       roce_set_field(context->byte_224_retry_msg,
-                      V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
-                      V2_QPC_BYTE_224_RETRY_MSG_PSN_S, attr->sq_psn >> 16);
-       roce_set_field(qpc_mask->byte_224_retry_msg,
-                      V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
-                      V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0);
-
-       roce_set_field(context->byte_224_retry_msg,
-                      V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
-                      V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, attr->sq_psn);
-       roce_set_field(qpc_mask->byte_224_retry_msg,
-                      V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
-                      V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, 0);
-
        roce_set_field(qpc_mask->byte_220_retry_psn_msn,
                       V2_QPC_BYTE_220_RETRY_MSG_MSN_M,
                       V2_QPC_BYTE_220_RETRY_MSG_MSN_S, 0);
@@ -3937,66 +3871,14 @@ static int modify_qp_rtr_to_rts(struct ib_qp *ibqp,
        roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_CHECK_FLG_M,
                       V2_QPC_BYTE_212_CHECK_FLG_S, 0);
 
-       roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M,
-                      V2_QPC_BYTE_212_RETRY_CNT_S, attr->retry_cnt);
-       roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_CNT_M,
-                      V2_QPC_BYTE_212_RETRY_CNT_S, 0);
-
-       roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
-                      V2_QPC_BYTE_212_RETRY_NUM_INIT_S, attr->retry_cnt);
-       roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
-                      V2_QPC_BYTE_212_RETRY_NUM_INIT_S, 0);
-
-       roce_set_field(context->byte_244_rnr_rxack,
-                      V2_QPC_BYTE_244_RNR_NUM_INIT_M,
-                      V2_QPC_BYTE_244_RNR_NUM_INIT_S, attr->rnr_retry);
-       roce_set_field(qpc_mask->byte_244_rnr_rxack,
-                      V2_QPC_BYTE_244_RNR_NUM_INIT_M,
-                      V2_QPC_BYTE_244_RNR_NUM_INIT_S, 0);
-
-       roce_set_field(context->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M,
-                      V2_QPC_BYTE_244_RNR_CNT_S, attr->rnr_retry);
-       roce_set_field(qpc_mask->byte_244_rnr_rxack, V2_QPC_BYTE_244_RNR_CNT_M,
-                      V2_QPC_BYTE_244_RNR_CNT_S, 0);
-
        roce_set_field(context->byte_212_lsn, V2_QPC_BYTE_212_LSN_M,
                       V2_QPC_BYTE_212_LSN_S, 0x100);
        roce_set_field(qpc_mask->byte_212_lsn, V2_QPC_BYTE_212_LSN_M,
                       V2_QPC_BYTE_212_LSN_S, 0);
 
-       if (attr_mask & IB_QP_TIMEOUT) {
-               if (attr->timeout < 31) {
-                       roce_set_field(context->byte_28_at_fl,
-                                      V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
-                                      attr->timeout);
-                       roce_set_field(qpc_mask->byte_28_at_fl,
-                                      V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
-                                      0);
-               } else {
-                       dev_warn(dev, "Local ACK timeout shall be 0 to 30.\n");
-               }
-       }
-
-       roce_set_field(context->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M,
-                      V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn);
-       roce_set_field(qpc_mask->byte_172_sq_psn, V2_QPC_BYTE_172_SQ_CUR_PSN_M,
-                      V2_QPC_BYTE_172_SQ_CUR_PSN_S, 0);
-
        roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_IRRL_HEAD_M,
                       V2_QPC_BYTE_196_IRRL_HEAD_S, 0);
-       roce_set_field(context->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M,
-                      V2_QPC_BYTE_196_SQ_MAX_PSN_S, attr->sq_psn);
-       roce_set_field(qpc_mask->byte_196_sq_psn, V2_QPC_BYTE_196_SQ_MAX_PSN_M,
-                      V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0);
 
-       if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) {
-               roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
-                              V2_QPC_BYTE_208_SR_MAX_S,
-                              fls(attr->max_rd_atomic - 1));
-               roce_set_field(qpc_mask->byte_208_irrl,
-                              V2_QPC_BYTE_208_SR_MAX_M,
-                              V2_QPC_BYTE_208_SR_MAX_S, 0);
-       }
        return 0;
 }
 
@@ -4090,7 +3972,6 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                const struct ib_global_route *grh =
                                            rdma_ah_read_grh(&attr->ah_attr);
                const struct ib_gid_attr *gid_attr = NULL;
-               u8 src_mac[ETH_ALEN];
                int is_roce_protocol;
                u16 vlan = 0xffff;
                u8 ib_port;
@@ -4104,11 +3985,12 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
 
                if (is_roce_protocol) {
                        gid_attr = attr->ah_attr.grh.sgid_attr;
-                       vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev);
-                       memcpy(src_mac, gid_attr->ndev->dev_addr, ETH_ALEN);
+                       ret = rdma_read_gid_l2_fields(gid_attr, &vlan, NULL);
+                       if (ret)
+                               goto out;
                }
 
-               if (is_vlan_dev(gid_attr->ndev)) {
+               if (vlan < VLAN_CFI_MASK) {
                        roce_set_bit(context->byte_76_srqn_op_en,
                                     V2_QPC_BYTE_76_RQ_VLAN_EN_S, 1);
                        roce_set_bit(qpc_mask->byte_76_srqn_op_en,
@@ -4190,9 +4072,152 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
                hr_qp->sl = rdma_ah_get_sl(&attr->ah_attr);
        }
 
+       if (attr_mask & IB_QP_TIMEOUT) {
+               if (attr->timeout < 31) {
+                       roce_set_field(context->byte_28_at_fl,
+                                      V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+                                      attr->timeout);
+                       roce_set_field(qpc_mask->byte_28_at_fl,
+                                      V2_QPC_BYTE_28_AT_M, V2_QPC_BYTE_28_AT_S,
+                                      0);
+               } else {
+                       dev_warn(dev, "Local ACK timeout shall be 0 to 30.\n");
+               }
+       }
+
+       if (attr_mask & IB_QP_RETRY_CNT) {
+               roce_set_field(context->byte_212_lsn,
+                              V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
+                              V2_QPC_BYTE_212_RETRY_NUM_INIT_S,
+                              attr->retry_cnt);
+               roce_set_field(qpc_mask->byte_212_lsn,
+                              V2_QPC_BYTE_212_RETRY_NUM_INIT_M,
+                              V2_QPC_BYTE_212_RETRY_NUM_INIT_S, 0);
+
+               roce_set_field(context->byte_212_lsn,
+                              V2_QPC_BYTE_212_RETRY_CNT_M,
+                              V2_QPC_BYTE_212_RETRY_CNT_S,
+                              attr->retry_cnt);
+               roce_set_field(qpc_mask->byte_212_lsn,
+                              V2_QPC_BYTE_212_RETRY_CNT_M,
+                              V2_QPC_BYTE_212_RETRY_CNT_S, 0);
+       }
+
+       if (attr_mask & IB_QP_RNR_RETRY) {
+               roce_set_field(context->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RNR_NUM_INIT_M,
+                              V2_QPC_BYTE_244_RNR_NUM_INIT_S, attr->rnr_retry);
+               roce_set_field(qpc_mask->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RNR_NUM_INIT_M,
+                              V2_QPC_BYTE_244_RNR_NUM_INIT_S, 0);
+
+               roce_set_field(context->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RNR_CNT_M,
+                              V2_QPC_BYTE_244_RNR_CNT_S, attr->rnr_retry);
+               roce_set_field(qpc_mask->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RNR_CNT_M,
+                              V2_QPC_BYTE_244_RNR_CNT_S, 0);
+       }
+
+       if (attr_mask & IB_QP_SQ_PSN) {
+               roce_set_field(context->byte_172_sq_psn,
+                              V2_QPC_BYTE_172_SQ_CUR_PSN_M,
+                              V2_QPC_BYTE_172_SQ_CUR_PSN_S, attr->sq_psn);
+               roce_set_field(qpc_mask->byte_172_sq_psn,
+                              V2_QPC_BYTE_172_SQ_CUR_PSN_M,
+                              V2_QPC_BYTE_172_SQ_CUR_PSN_S, 0);
+
+               roce_set_field(context->byte_196_sq_psn,
+                              V2_QPC_BYTE_196_SQ_MAX_PSN_M,
+                              V2_QPC_BYTE_196_SQ_MAX_PSN_S, attr->sq_psn);
+               roce_set_field(qpc_mask->byte_196_sq_psn,
+                              V2_QPC_BYTE_196_SQ_MAX_PSN_M,
+                              V2_QPC_BYTE_196_SQ_MAX_PSN_S, 0);
+
+               roce_set_field(context->byte_220_retry_psn_msn,
+                              V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
+                              V2_QPC_BYTE_220_RETRY_MSG_PSN_S, attr->sq_psn);
+               roce_set_field(qpc_mask->byte_220_retry_psn_msn,
+                              V2_QPC_BYTE_220_RETRY_MSG_PSN_M,
+                              V2_QPC_BYTE_220_RETRY_MSG_PSN_S, 0);
+
+               roce_set_field(context->byte_224_retry_msg,
+                              V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
+                              V2_QPC_BYTE_224_RETRY_MSG_PSN_S,
+                              attr->sq_psn >> 16);
+               roce_set_field(qpc_mask->byte_224_retry_msg,
+                              V2_QPC_BYTE_224_RETRY_MSG_PSN_M,
+                              V2_QPC_BYTE_224_RETRY_MSG_PSN_S, 0);
+
+               roce_set_field(context->byte_224_retry_msg,
+                              V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
+                              V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S,
+                              attr->sq_psn);
+               roce_set_field(qpc_mask->byte_224_retry_msg,
+                              V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_M,
+                              V2_QPC_BYTE_224_RETRY_MSG_FPKT_PSN_S, 0);
+
+               roce_set_field(context->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RX_ACK_EPSN_M,
+                              V2_QPC_BYTE_244_RX_ACK_EPSN_S, attr->sq_psn);
+               roce_set_field(qpc_mask->byte_244_rnr_rxack,
+                              V2_QPC_BYTE_244_RX_ACK_EPSN_M,
+                              V2_QPC_BYTE_244_RX_ACK_EPSN_S, 0);
+       }
+
+       if ((attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) &&
+            attr->max_dest_rd_atomic) {
+               roce_set_field(context->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+                              V2_QPC_BYTE_140_RR_MAX_S,
+                              fls(attr->max_dest_rd_atomic - 1));
+               roce_set_field(qpc_mask->byte_140_raq, V2_QPC_BYTE_140_RR_MAX_M,
+                              V2_QPC_BYTE_140_RR_MAX_S, 0);
+       }
+
+       if ((attr_mask & IB_QP_MAX_QP_RD_ATOMIC) && attr->max_rd_atomic) {
+               roce_set_field(context->byte_208_irrl, V2_QPC_BYTE_208_SR_MAX_M,
+                              V2_QPC_BYTE_208_SR_MAX_S,
+                              fls(attr->max_rd_atomic - 1));
+               roce_set_field(qpc_mask->byte_208_irrl,
+                              V2_QPC_BYTE_208_SR_MAX_M,
+                              V2_QPC_BYTE_208_SR_MAX_S, 0);
+       }
+
        if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
                set_access_flags(hr_qp, context, qpc_mask, attr, attr_mask);
 
+       if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+               roce_set_field(context->byte_80_rnr_rx_cqn,
+                              V2_QPC_BYTE_80_MIN_RNR_TIME_M,
+                              V2_QPC_BYTE_80_MIN_RNR_TIME_S,
+                              attr->min_rnr_timer);
+               roce_set_field(qpc_mask->byte_80_rnr_rx_cqn,
+                              V2_QPC_BYTE_80_MIN_RNR_TIME_M,
+                              V2_QPC_BYTE_80_MIN_RNR_TIME_S, 0);
+       }
+
+       /* RC&UC required attr */
+       if (attr_mask & IB_QP_RQ_PSN) {
+               roce_set_field(context->byte_108_rx_reqepsn,
+                              V2_QPC_BYTE_108_RX_REQ_EPSN_M,
+                              V2_QPC_BYTE_108_RX_REQ_EPSN_S, attr->rq_psn);
+               roce_set_field(qpc_mask->byte_108_rx_reqepsn,
+                              V2_QPC_BYTE_108_RX_REQ_EPSN_M,
+                              V2_QPC_BYTE_108_RX_REQ_EPSN_S, 0);
+
+               roce_set_field(context->byte_152_raq, V2_QPC_BYTE_152_RAQ_PSN_M,
+                              V2_QPC_BYTE_152_RAQ_PSN_S, attr->rq_psn - 1);
+               roce_set_field(qpc_mask->byte_152_raq,
+                              V2_QPC_BYTE_152_RAQ_PSN_M,
+                              V2_QPC_BYTE_152_RAQ_PSN_S, 0);
+       }
+
+       if (attr_mask & IB_QP_QKEY) {
+               context->qkey_xrcd = attr->qkey;
+               qpc_mask->qkey_xrcd = 0;
+               hr_qp->qkey = attr->qkey;
+       }
+
        roce_set_bit(context->byte_108_rx_reqepsn, V2_QPC_BYTE_108_INV_CREDIT_S,
                     ibqp->srq ? 1 : 0);
        roce_set_bit(qpc_mask->byte_108_rx_reqepsn,
@@ -4421,7 +4446,7 @@ out:
 
 static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
                                         struct hns_roce_qp *hr_qp,
-                                        bool is_user)
+                                        struct ib_udata *udata)
 {
        struct hns_roce_cq *send_cq, *recv_cq;
        struct device *dev = hr_dev->dev;
@@ -4443,7 +4468,7 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 
        hns_roce_lock_cqs(send_cq, recv_cq);
 
-       if (!is_user) {
+       if (!udata) {
                __hns_roce_v2_cq_clean(recv_cq, hr_qp->qpn, hr_qp->ibqp.srq ?
                                       to_hr_srq(hr_qp->ibqp.srq) : NULL);
                if (send_cq != recv_cq)
@@ -4464,16 +4489,18 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
 
        hns_roce_mtt_cleanup(hr_dev, &hr_qp->mtt);
 
-       if (is_user) {
+       if (udata) {
+               struct hns_roce_ucontext *context =
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct hns_roce_ucontext,
+                               ibucontext);
+
                if (hr_qp->sq.wqe_cnt && (hr_qp->sdb_en == 1))
-                       hns_roce_db_unmap_user(
-                               to_hr_ucontext(hr_qp->ibqp.uobject->context),
-                               &hr_qp->sdb);
+                       hns_roce_db_unmap_user(context, &hr_qp->sdb);
 
                if (hr_qp->rq.wqe_cnt && (hr_qp->rdb_en == 1))
-                       hns_roce_db_unmap_user(
-                               to_hr_ucontext(hr_qp->ibqp.uobject->context),
-                               &hr_qp->rdb);
+                       hns_roce_db_unmap_user(context, &hr_qp->rdb);
                ib_umem_release(hr_qp->umem);
        } else {
                kfree(hr_qp->sq.wrid);
@@ -4492,13 +4519,13 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
        return 0;
 }
 
-static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp)
+static int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
        struct hns_roce_qp *hr_qp = to_hr_qp(ibqp);
        int ret;
 
-       ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, ibqp->uobject);
+       ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
        if (ret) {
                dev_err(hr_dev->dev, "Destroy qp failed(%d)\n", ret);
                return ret;
@@ -6044,6 +6071,10 @@ static int hns_roce_v2_post_srq_recv(struct ib_srq *ibsrq,
        return ret;
 }
 
+static const struct hns_roce_dfx_hw hns_roce_dfx_hw_v2 = {
+       .query_cqc_info = hns_roce_v2_query_cqc_info,
+};
+
 static const struct ib_device_ops hns_roce_v2_dev_ops = {
        .destroy_qp = hns_roce_v2_destroy_qp,
        .modify_cq = hns_roce_v2_modify_cq,
@@ -6113,16 +6144,10 @@ static int hns_roce_hw_v2_get_cfg(struct hns_roce_dev *hr_dev,
                                  struct hnae3_handle *handle)
 {
        struct hns_roce_v2_priv *priv = hr_dev->priv;
-       const struct pci_device_id *id;
        int i;
 
-       id = pci_match_id(hns_roce_hw_v2_pci_tbl, hr_dev->pci_dev);
-       if (!id) {
-               dev_err(hr_dev->dev, "device is not compatible!\n");
-               return -ENXIO;
-       }
-
        hr_dev->hw = &hns_roce_hw_v2;
+       hr_dev->dfx = &hns_roce_dfx_hw_v2;
        hr_dev->sdb_offset = ROCEE_DB_SQ_L_0_REG;
        hr_dev->odb_offset = hr_dev->sdb_offset;
 
@@ -6209,6 +6234,7 @@ static void __hns_roce_hw_v2_uninit_instance(struct hnae3_handle *handle,
 static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
 {
        const struct hnae3_ae_ops *ops = handle->ae_algo->ops;
+       const struct pci_device_id *id;
        struct device *dev = &handle->pdev->dev;
        int ret;
 
@@ -6219,6 +6245,10 @@ static int hns_roce_hw_v2_init_instance(struct hnae3_handle *handle)
                goto reset_chk_err;
        }
 
+       id = pci_match_id(hns_roce_hw_v2_pci_tbl, handle->pdev);
+       if (!id)
+               return 0;
+
        ret = __hns_roce_hw_v2_init_instance(handle);
        if (ret) {
                handle->rinfo.instance_state = HNS_ROCE_STATE_NON_INIT;
index f1f1b75..edfdbe2 100644 (file)
@@ -719,8 +719,8 @@ struct hns_roce_v2_qp_context {
 #define        V2_QPC_BYTE_148_RAQ_SYNDROME_S 24
 #define V2_QPC_BYTE_148_RAQ_SYNDROME_M GENMASK(31, 24)
 
-#define        V2_QPC_BYTE_152_RAQ_PSN_S 8
-#define V2_QPC_BYTE_152_RAQ_PSN_M GENMASK(31, 8)
+#define        V2_QPC_BYTE_152_RAQ_PSN_S 0
+#define V2_QPC_BYTE_152_RAQ_PSN_M GENMASK(23, 0)
 
 #define        V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_S 24
 #define V2_QPC_BYTE_152_RAQ_TRRL_RTY_HEAD_M GENMASK(31, 24)
@@ -1799,6 +1799,9 @@ struct hns_roce_sccc_clr_done {
        __le32 rsv[5];
 };
 
+int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn,
+                              int *buffer);
+
 static inline void hns_roce_write64(struct hns_roce_dev *hr_dev, __le32 val[2],
                                    void __iomem *dest)
 {
diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c b/drivers/infiniband/hw/hns/hns_roce_hw_v2_dfx.c
new file mode 100644 (file)
index 0000000..5a97b5a
--- /dev/null
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+// Copyright (c) 2019 Hisilicon Limited.
+
+#include "hnae3.h"
+#include "hns_roce_device.h"
+#include "hns_roce_cmd.h"
+#include "hns_roce_hw_v2.h"
+
+/*
+ * Query the context of CQ @cqn with a HNS_ROCE_CMD_QUERY_CQC mailbox
+ * command and copy it into @buffer.
+ *
+ * @buffer receives sizeof(struct hns_roce_v2_cq_context) bytes, so the
+ * caller must supply storage at least that large.
+ *
+ * Returns 0 on success, the PTR_ERR() of a failed mailbox allocation, or
+ * the error returned by hns_roce_cmd_mbox().
+ */
+int hns_roce_v2_query_cqc_info(struct hns_roce_dev *hr_dev, u32 cqn,
+                              int *buffer)
+{
+       struct hns_roce_v2_cq_context *ctx;
+       struct hns_roce_cmd_mailbox *mbox;
+       int status;
+
+       mbox = hns_roce_alloc_cmd_mailbox(hr_dev);
+       if (IS_ERR(mbox))
+               return PTR_ERR(mbox);
+
+       ctx = mbox->buf;
+       status = hns_roce_cmd_mbox(hr_dev, 0, mbox->dma, cqn, 0,
+                                  HNS_ROCE_CMD_QUERY_CQC,
+                                  HNS_ROCE_CMD_TIMEOUT_MSECS);
+       if (status)
+               dev_err(hr_dev->dev, "QUERY cqc cmd process error\n");
+       else
+               memcpy(buffer, ctx, sizeof(*ctx));
+
+       /* The mailbox is released on both the success and the failure path. */
+       hns_roce_free_cmd_mailbox(hr_dev, mbox);
+
+       return status;
+}
index c929125..8da5f18 100644 (file)
@@ -234,25 +234,6 @@ static int hns_roce_query_device(struct ib_device *ib_dev,
        return 0;
 }
 
-static struct net_device *hns_roce_get_netdev(struct ib_device *ib_dev,
-                                             u8 port_num)
-{
-       struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
-       struct net_device *ndev;
-
-       if (port_num < 1 || port_num > hr_dev->caps.num_ports)
-               return NULL;
-
-       rcu_read_lock();
-
-       ndev = hr_dev->iboe.netdevs[port_num - 1];
-       if (ndev)
-               dev_hold(ndev);
-
-       rcu_read_unlock();
-       return ndev;
-}
-
 static int hns_roce_query_port(struct ib_device *ib_dev, u8 port_num,
                               struct ib_port_attr *props)
 {
@@ -455,9 +436,9 @@ static const struct ib_device_ops hns_roce_dev_ops = {
        .destroy_ah = hns_roce_destroy_ah,
        .destroy_cq = hns_roce_ib_destroy_cq,
        .disassociate_ucontext = hns_roce_disassociate_ucontext,
+       .fill_res_entry = hns_roce_fill_res_entry,
        .get_dma_mr = hns_roce_get_dma_mr,
        .get_link_layer = hns_roce_get_link_layer,
-       .get_netdev = hns_roce_get_netdev,
        .get_port_immutable = hns_roce_port_immutable,
        .mmap = hns_roce_mmap,
        .modify_device = hns_roce_modify_device,
@@ -468,6 +449,8 @@ static const struct ib_device_ops hns_roce_dev_ops = {
        .query_pkey = hns_roce_query_pkey,
        .query_port = hns_roce_query_port,
        .reg_user_mr = hns_roce_reg_user_mr,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, hns_roce_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, hns_roce_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, hns_roce_ucontext, ibucontext),
 };
@@ -489,6 +472,8 @@ static const struct ib_device_ops hns_roce_dev_frmr_ops = {
 static const struct ib_device_ops hns_roce_dev_srq_ops = {
        .create_srq = hns_roce_create_srq,
        .destroy_srq = hns_roce_destroy_srq,
+
+       INIT_RDMA_OBJ_SIZE(ib_srq, hns_roce_srq, ibsrq),
 };
 
 static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
@@ -497,6 +482,7 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
        struct hns_roce_ib_iboe *iboe = NULL;
        struct ib_device *ib_dev = NULL;
        struct device *dev = hr_dev->dev;
+       unsigned int i;
 
        iboe = &hr_dev->iboe;
        spin_lock_init(&iboe->lock);
@@ -562,6 +548,15 @@ static int hns_roce_register_device(struct hns_roce_dev *hr_dev)
        ib_dev->driver_id = RDMA_DRIVER_HNS;
        ib_set_device_ops(ib_dev, hr_dev->hw->hns_roce_dev_ops);
        ib_set_device_ops(ib_dev, &hns_roce_dev_ops);
+       for (i = 0; i < hr_dev->caps.num_ports; i++) {
+               if (!hr_dev->iboe.netdevs[i])
+                       continue;
+
+               ret = ib_device_set_netdev(ib_dev, hr_dev->iboe.netdevs[i],
+                                          i + 1);
+               if (ret)
+                       return ret;
+       }
        ret = ib_register_device(ib_dev, "hns_%d");
        if (ret) {
                dev_err(dev, "ib_register_device failed!\n");
index 08be0e4..6110ec4 100644 (file)
@@ -1282,14 +1282,14 @@ free_cmd_mbox:
        return ret;
 }
 
-int hns_roce_dereg_mr(struct ib_mr *ibmr)
+int hns_roce_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibmr->device);
        struct hns_roce_mr *mr = to_hr_mr(ibmr);
        int ret = 0;
 
        if (hr_dev->hw->dereg_mr) {
-               ret = hr_dev->hw->dereg_mr(hr_dev, mr);
+               ret = hr_dev->hw->dereg_mr(hr_dev, mr, udata);
        } else {
                hns_roce_mr_free(hr_dev, mr);
 
@@ -1303,7 +1303,7 @@ int hns_roce_dereg_mr(struct ib_mr *ibmr)
 }
 
 struct ib_mr *hns_roce_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                               u32 max_num_sg)
+                               u32 max_num_sg, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
        struct device *dev = hr_dev->dev;
index b9b97c5..8134013 100644 (file)
@@ -57,8 +57,7 @@ void hns_roce_cleanup_pd_table(struct hns_roce_dev *hr_dev)
        hns_roce_bitmap_cleanup(&hr_dev->pd_bitmap);
 }
 
-int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                     struct ib_udata *udata)
+int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ib_dev = ibpd->device;
        struct hns_roce_dev *hr_dev = to_hr_dev(ib_dev);
@@ -72,7 +71,7 @@ int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
                return ret;
        }
 
-       if (context) {
+       if (udata) {
                struct hns_roce_ib_alloc_pd_resp uresp = {.pdn = pd->pdn};
 
                if (ib_copy_to_udata(udata, &uresp, sizeof(uresp))) {
@@ -86,7 +85,7 @@ int hns_roce_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
 }
 EXPORT_SYMBOL_GPL(hns_roce_alloc_pd);
 
-void hns_roce_dealloc_pd(struct ib_pd *pd)
+void hns_roce_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        hns_roce_pd_free(to_hr_dev(pd->device), to_hr_pd(pd)->pdn);
 }
index 60cf9f0..8db2817 100644 (file)
 
 void hns_roce_qp_event(struct hns_roce_dev *hr_dev, u32 qpn, int event_type)
 {
-       struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
        struct device *dev = hr_dev->dev;
        struct hns_roce_qp *qp;
 
-       spin_lock(&qp_table->lock);
-
+       xa_lock(&hr_dev->qp_table_xa);
        qp = __hns_roce_qp_lookup(hr_dev, qpn);
        if (qp)
                atomic_inc(&qp->refcount);
-
-       spin_unlock(&qp_table->lock);
+       xa_unlock(&hr_dev->qp_table_xa);
 
        if (!qp) {
                dev_warn(dev, "Async event for bogus QP %08x\n", qpn);
@@ -147,29 +144,20 @@ EXPORT_SYMBOL_GPL(to_hns_roce_state);
 static int hns_roce_gsi_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
                                 struct hns_roce_qp *hr_qp)
 {
-       struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+       struct xarray *xa = &hr_dev->qp_table_xa;
        int ret;
 
        if (!qpn)
                return -EINVAL;
 
        hr_qp->qpn = qpn;
-
-       spin_lock_irq(&qp_table->lock);
-       ret = radix_tree_insert(&hr_dev->qp_table_tree,
-                               hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp);
-       spin_unlock_irq(&qp_table->lock);
-       if (ret) {
-               dev_err(hr_dev->dev, "QPC radix_tree_insert failed\n");
-               goto err_put_irrl;
-       }
-
        atomic_set(&hr_qp->refcount, 1);
        init_completion(&hr_qp->free);
 
-       return 0;
-
-err_put_irrl:
+       ret = xa_err(xa_store_irq(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1),
+                               hr_qp, GFP_KERNEL));
+       if (ret)
+               dev_err(hr_dev->dev, "QPC xa_store failed\n");
 
        return ret;
 }
@@ -220,17 +208,9 @@ static int hns_roce_qp_alloc(struct hns_roce_dev *hr_dev, unsigned long qpn,
                }
        }
 
-       spin_lock_irq(&qp_table->lock);
-       ret = radix_tree_insert(&hr_dev->qp_table_tree,
-                               hr_qp->qpn & (hr_dev->caps.num_qps - 1), hr_qp);
-       spin_unlock_irq(&qp_table->lock);
-       if (ret) {
-               dev_err(dev, "QPC radix_tree_insert failed\n");
+       ret = hns_roce_gsi_qp_alloc(hr_dev, qpn, hr_qp);
+       if (ret)
                goto err_put_sccc;
-       }
-
-       atomic_set(&hr_qp->refcount, 1);
-       init_completion(&hr_qp->free);
 
        return 0;
 
@@ -255,13 +235,12 @@ err_out:
 
 void hns_roce_qp_remove(struct hns_roce_dev *hr_dev, struct hns_roce_qp *hr_qp)
 {
-       struct hns_roce_qp_table *qp_table = &hr_dev->qp_table;
+       struct xarray *xa = &hr_dev->qp_table_xa;
        unsigned long flags;
 
-       spin_lock_irqsave(&qp_table->lock, flags);
-       radix_tree_delete(&hr_dev->qp_table_tree,
-                         hr_qp->qpn & (hr_dev->caps.num_qps - 1));
-       spin_unlock_irqrestore(&qp_table->lock, flags);
+       xa_lock_irqsave(xa, flags);
+       __xa_erase(xa, hr_qp->qpn & (hr_dev->caps.num_qps - 1));
+       xa_unlock_irqrestore(xa, flags);
 }
 EXPORT_SYMBOL_GPL(hns_roce_qp_remove);
 
@@ -1154,8 +1133,7 @@ int hns_roce_init_qp_table(struct hns_roce_dev *hr_dev)
        int ret;
 
        mutex_init(&qp_table->scc_mutex);
-       spin_lock_init(&qp_table->lock);
-       INIT_RADIX_TREE(&hr_dev->qp_table_tree, GFP_ATOMIC);
+       xa_init(&hr_dev->qp_table_xa);
 
        /* In hw v1, a port include two SQP, six ports total 12 */
        if (hr_dev->caps.max_sq_sg <= 2)
diff --git a/drivers/infiniband/hw/hns/hns_roce_restrack.c b/drivers/infiniband/hw/hns/hns_roce_restrack.c
new file mode 100644 (file)
index 0000000..0a31d0a
--- /dev/null
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: (GPL-2.0 OR BSD-2-Clause)
+// Copyright (c) 2019 Hisilicon Limited.
+
+#include <rdma/rdma_cm.h>
+#include <rdma/restrack.h>
+#include <uapi/rdma/rdma_netlink.h>
+#include "hnae3.h"
+#include "hns_roce_common.h"
+#include "hns_roce_device.h"
+#include "hns_roce_hw_v2.h"
+
+/*
+ * Emit the fields of a v2 CQ context as driver-specific u32 attributes.
+ *
+ * Each value is extracted from @context with roce_get_field() and added
+ * to @msg under its own name via rdma_nl_put_driver_u32().
+ *
+ * Returns 0 when every attribute was added, or -EMSGSIZE as soon as
+ * rdma_nl_put_driver_u32() reports a failure.
+ */
+static int hns_roce_fill_cq(struct sk_buff *msg,
+                           struct hns_roce_v2_cq_context *context)
+{
+       if (rdma_nl_put_driver_u32(msg, "state",
+                                  roce_get_field(context->byte_4_pg_ceqn,
+                                                 V2_CQC_BYTE_4_ARM_ST_M,
+                                                 V2_CQC_BYTE_4_ARM_ST_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(msg, "ceqn",
+                                  roce_get_field(context->byte_4_pg_ceqn,
+                                                 V2_CQC_BYTE_4_CEQN_M,
+                                                 V2_CQC_BYTE_4_CEQN_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(msg, "cqn",
+                                  roce_get_field(context->byte_8_cqn,
+                                                 V2_CQC_BYTE_8_CQN_M,
+                                                 V2_CQC_BYTE_8_CQN_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(msg, "hopnum",
+                                  roce_get_field(context->byte_16_hop_addr,
+                                                 V2_CQC_BYTE_16_CQE_HOP_NUM_M,
+                                                 V2_CQC_BYTE_16_CQE_HOP_NUM_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(
+                   msg, "pi",
+                   roce_get_field(context->byte_28_cq_pi,
+                                  V2_CQC_BYTE_28_CQ_PRODUCER_IDX_M,
+                                  V2_CQC_BYTE_28_CQ_PRODUCER_IDX_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(
+                   msg, "ci",
+                   roce_get_field(context->byte_32_cq_ci,
+                                  V2_CQC_BYTE_32_CQ_CONSUMER_IDX_M,
+                                  V2_CQC_BYTE_32_CQ_CONSUMER_IDX_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(
+                   msg, "coalesce",
+                   roce_get_field(context->byte_56_cqe_period_maxcnt,
+                                  V2_CQC_BYTE_56_CQ_MAX_CNT_M,
+                                  V2_CQC_BYTE_56_CQ_MAX_CNT_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(
+                   msg, "period",
+                   roce_get_field(context->byte_56_cqe_period_maxcnt,
+                                  V2_CQC_BYTE_56_CQ_PERIOD_M,
+                                  V2_CQC_BYTE_56_CQ_PERIOD_S)))
+               goto err;
+
+       if (rdma_nl_put_driver_u32(msg, "cnt",
+                                  roce_get_field(context->byte_52_cqe_cnt,
+                                                 V2_CQC_BYTE_52_CQE_CNT_M,
+                                                 V2_CQC_BYTE_52_CQE_CNT_S)))
+               goto err;
+
+       return 0;
+
+err:
+       return -EMSGSIZE;
+}
+
+/*
+ * Dump the hardware context of a CQ into a nested RDMA_NLDEV_ATTR_DRIVER
+ * attribute of @msg.
+ *
+ * The context is fetched through the optional dfx->query_cqc_info hook.
+ *
+ * Returns 0 on success, -EINVAL when no query hook is available, -ENOMEM
+ * when the scratch context cannot be allocated, the hook's own error code
+ * when the query fails, or -EMSGSIZE when the attributes cannot be added
+ * to @msg.
+ */
+static int hns_roce_fill_res_cq_entry(struct sk_buff *msg,
+                                     struct rdma_restrack_entry *res)
+{
+       struct ib_cq *ib_cq = container_of(res, struct ib_cq, res);
+       struct hns_roce_dev *hr_dev = to_hr_dev(ib_cq->device);
+       struct hns_roce_cq *hr_cq = to_hr_cq(ib_cq);
+       struct hns_roce_v2_cq_context *context;
+       struct nlattr *table_attr;
+       int ret;
+
+       /*
+        * NOTE(review): dfx is only seen being assigned by the hw v2 setup
+        * path, while this callback is registered in the common device ops,
+        * so tolerate a missing dfx table instead of dereferencing it.
+        */
+       if (!hr_dev->dfx || !hr_dev->dfx->query_cqc_info)
+               return -EINVAL;
+
+       context = kzalloc(sizeof(*context), GFP_KERNEL);
+       if (!context)
+               return -ENOMEM;
+
+       /* Keep the query's own error code for the caller. */
+       ret = hr_dev->dfx->query_cqc_info(hr_dev, hr_cq->cqn, (int *)context);
+       if (ret)
+               goto err;
+
+       table_attr = nla_nest_start(msg, RDMA_NLDEV_ATTR_DRIVER);
+       if (!table_attr) {
+               ret = -EMSGSIZE;
+               goto err;
+       }
+
+       ret = hns_roce_fill_cq(msg, context);
+       if (ret)
+               goto err_cancel_table;
+
+       nla_nest_end(msg, table_attr);
+       kfree(context);
+
+       return 0;
+
+err_cancel_table:
+       nla_nest_cancel(msg, table_attr);
+err:
+       kfree(context);
+       return ret;
+}
+
+/*
+ * Driver hook that adds hns-specific details for a tracked resource.
+ *
+ * Only RDMA_RESTRACK_CQ entries are extended; for every other resource
+ * type nothing is added and 0 is returned.
+ */
+int hns_roce_fill_res_entry(struct sk_buff *msg,
+                           struct rdma_restrack_entry *res)
+{
+       if (res->type == RDMA_RESTRACK_CQ)
+               return hns_roce_fill_res_cq_entry(msg, res);
+
+       return 0;
+}
index a8ee2f6..b3421b1 100644 (file)
@@ -206,13 +206,13 @@ static int hns_roce_create_idx_que(struct ib_pd *pd, struct hns_roce_srq *srq,
        return 0;
 }
 
-struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
-                                  struct ib_srq_init_attr *srq_init_attr,
-                                  struct ib_udata *udata)
+int hns_roce_create_srq(struct ib_srq *ib_srq,
+                       struct ib_srq_init_attr *srq_init_attr,
+                       struct ib_udata *udata)
 {
-       struct hns_roce_dev *hr_dev = to_hr_dev(pd->device);
+       struct hns_roce_dev *hr_dev = to_hr_dev(ib_srq->device);
        struct hns_roce_ib_create_srq_resp resp = {};
-       struct hns_roce_srq *srq;
+       struct hns_roce_srq *srq = to_hr_srq(ib_srq);
        int srq_desc_size;
        int srq_buf_size;
        u32 page_shift;
@@ -223,11 +223,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
        /* Check the actual SRQ wqe and SRQ sge num */
        if (srq_init_attr->attr.max_wr >= hr_dev->caps.max_srq_wrs ||
            srq_init_attr->attr.max_sge > hr_dev->caps.max_srq_sges)
-               return ERR_PTR(-EINVAL);
-
-       srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        mutex_init(&srq->mutex);
        spin_lock_init(&srq->lock);
@@ -249,17 +245,13 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
        if (udata) {
                struct hns_roce_ib_create_srq  ucmd;
 
-               if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
-                       ret = -EFAULT;
-                       goto err_srq;
-               }
+               if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+                       return -EFAULT;
 
                srq->umem =
                        ib_umem_get(udata, ucmd.buf_addr, srq_buf_size, 0, 0);
-               if (IS_ERR(srq->umem)) {
-                       ret = PTR_ERR(srq->umem);
-                       goto err_srq;
-               }
+               if (IS_ERR(srq->umem))
+                       return PTR_ERR(srq->umem);
 
                if (hr_dev->caps.srqwqe_buf_pg_sz) {
                        npages = (ib_umem_page_count(srq->umem) +
@@ -321,11 +313,9 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
        } else {
                page_shift = PAGE_SHIFT + hr_dev->caps.srqwqe_buf_pg_sz;
                if (hns_roce_buf_alloc(hr_dev, srq_buf_size,
-                                     (1 << page_shift) * 2,
-                                     &srq->buf, page_shift)) {
-                       ret = -ENOMEM;
-                       goto err_srq;
-               }
+                                      (1 << page_shift) * 2, &srq->buf,
+                                      page_shift))
+                       return -ENOMEM;
 
                srq->head = 0;
                srq->tail = srq->max - 1;
@@ -340,7 +330,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
                        goto err_srq_mtt;
 
                page_shift = PAGE_SHIFT + hr_dev->caps.idx_buf_pg_sz;
-               ret = hns_roce_create_idx_que(pd, srq, page_shift);
+               ret = hns_roce_create_idx_que(ib_srq->pd, srq, page_shift);
                if (ret) {
                        dev_err(hr_dev->dev, "Create idx queue fail(%d)!\n",
                                ret);
@@ -372,7 +362,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
 
        srq->db_reg_l = hr_dev->reg_base + SRQ_DB_REG;
 
-       ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(pd)->pdn, cqn, 0,
+       ret = hns_roce_srq_alloc(hr_dev, to_hr_pd(ib_srq->pd)->pdn, cqn, 0,
                                 &srq->mtt, 0, srq);
        if (ret)
                goto err_wrid;
@@ -389,7 +379,7 @@ struct ib_srq *hns_roce_create_srq(struct ib_pd *pd,
                }
        }
 
-       return &srq->ibsrq;
+       return 0;
 
 err_srqc_alloc:
        hns_roce_srq_free(hr_dev, srq);
@@ -418,12 +408,10 @@ err_buf:
        else
                hns_roce_buf_free(hr_dev, srq_buf_size, &srq->buf);
 
-err_srq:
-       kfree(srq);
-       return ERR_PTR(ret);
+       return ret;
 }
 
-int hns_roce_destroy_srq(struct ib_srq *ibsrq)
+void hns_roce_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
        struct hns_roce_dev *hr_dev = to_hr_dev(ibsrq->device);
        struct hns_roce_srq *srq = to_hr_srq(ibsrq);
@@ -440,10 +428,6 @@ int hns_roce_destroy_srq(struct ib_srq *ibsrq)
                hns_roce_buf_free(hr_dev, srq->max << srq->wqe_shift,
                                  &srq->buf);
        }
-
-       kfree(srq);
-
-       return 0;
 }
 
 int hns_roce_init_srq_table(struct hns_roce_dev *hr_dev)
index 2f2b442..8feec35 100644 (file)
@@ -552,7 +552,7 @@ enum i40iw_status_code i40iw_obj_aligned_mem(struct i40iw_device *iwdev,
 
 void i40iw_request_reset(struct i40iw_device *iwdev);
 void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev);
-void i40iw_setup_cm_core(struct i40iw_device *iwdev);
+int i40iw_setup_cm_core(struct i40iw_device *iwdev);
 void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core);
 void i40iw_process_ceq(struct i40iw_device *, struct i40iw_ceq *iwceq);
 void i40iw_process_aeq(struct i40iw_device *);
index 206cfb0..8233f5a 100644 (file)
@@ -3237,7 +3237,7 @@ void i40iw_receive_ilq(struct i40iw_sc_vsi *vsi, struct i40iw_puda_buf *rbuf)
  * core
  * @iwdev: iwarp device structure
  */
-void i40iw_setup_cm_core(struct i40iw_device *iwdev)
+int i40iw_setup_cm_core(struct i40iw_device *iwdev)
 {
        struct i40iw_cm_core *cm_core = &iwdev->cm_core;
 
@@ -3256,9 +3256,19 @@ void i40iw_setup_cm_core(struct i40iw_device *iwdev)
 
        cm_core->event_wq = alloc_ordered_workqueue("iwewq",
                                                    WQ_MEM_RECLAIM);
+       if (!cm_core->event_wq)
+               goto error;
 
        cm_core->disconn_wq = alloc_ordered_workqueue("iwdwq",
                                                      WQ_MEM_RECLAIM);
+       if (!cm_core->disconn_wq)
+               goto error;
+
+       return 0;
+error:
+       i40iw_cleanup_cm_core(&iwdev->cm_core);
+
+       return -ENOMEM;
 }
 
 /**
@@ -3278,8 +3288,10 @@ void i40iw_cleanup_cm_core(struct i40iw_cm_core *cm_core)
                del_timer_sync(&cm_core->tcp_timer);
        spin_unlock_irqrestore(&cm_core->ht_lock, flags);
 
-       destroy_workqueue(cm_core->event_wq);
-       destroy_workqueue(cm_core->disconn_wq);
+       if (cm_core->event_wq)
+               destroy_workqueue(cm_core->event_wq);
+       if (cm_core->disconn_wq)
+               destroy_workqueue(cm_core->disconn_wq);
 }
 
 /**
@@ -3478,7 +3490,8 @@ static void i40iw_qp_disconnect(struct i40iw_qp *iwqp)
                /* Need to free the Last Streaming Mode Message */
                if (iwqp->ietf_mem.va) {
                        if (iwqp->lsmm_mr)
-                               iwibdev->ibdev.ops.dereg_mr(iwqp->lsmm_mr);
+                               iwibdev->ibdev.ops.dereg_mr(iwqp->lsmm_mr,
+                                                           NULL);
                        i40iw_free_dma_mem(iwdev->sc_dev.hw, &iwqp->ietf_mem);
                }
        }
index 68095f0..10932ba 100644 (file)
@@ -1641,7 +1641,10 @@ static int i40iw_open(struct i40e_info *ldev, struct i40e_client *client)
        iwdev = &hdl->device;
        iwdev->hdl = hdl;
        dev = &iwdev->sc_dev;
-       i40iw_setup_cm_core(iwdev);
+       if (i40iw_setup_cm_core(iwdev)) {
+               kfree(iwdev->hdl);
+               return -ENOMEM;
+       }
 
        dev->back_dev = (void *)iwdev;
        iwdev->ldev = &hdl->ldev;
index a8352e3..5689d74 100644 (file)
@@ -291,18 +291,15 @@ static void i40iw_dealloc_push_page(struct i40iw_device *iwdev, struct i40iw_sc_
 /**
  * i40iw_alloc_pd - allocate protection domain
  * @pd: PD pointer
- * @context: user context created during alloc
  * @udata: user data
  */
-static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                         struct ib_udata *udata)
+static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct i40iw_pd *iwpd = to_iwpd(pd);
        struct i40iw_device *iwdev = to_iwdev(pd->device);
        struct i40iw_sc_dev *dev = &iwdev->sc_dev;
        struct i40iw_alloc_pd_resp uresp;
        struct i40iw_sc_pd *sc_pd;
-       struct i40iw_ucontext *ucontext;
        u32 pd_id = 0;
        int err;
 
@@ -318,8 +315,9 @@ static int i40iw_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
 
        sc_pd = &iwpd->sc_pd;
 
-       if (context) {
-               ucontext = to_ucontext(context);
+       if (udata) {
+               struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+                       udata, struct i40iw_ucontext, ibucontext);
                dev->iw_pd_ops->pd_init(dev, sc_pd, pd_id, ucontext->abi_ver);
                memset(&uresp, 0, sizeof(uresp));
                uresp.pd_id = pd_id;
@@ -342,8 +340,9 @@ error:
 /**
  * i40iw_dealloc_pd - deallocate pd
  * @ibpd: ptr of pd to be deallocated
+ * @udata: user data or null for kernel object
  */
-static void i40iw_dealloc_pd(struct ib_pd *ibpd)
+static void i40iw_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct i40iw_pd *iwpd = to_iwpd(ibpd);
        struct i40iw_device *iwdev = to_iwdev(ibpd->device);
@@ -413,7 +412,7 @@ static void i40iw_clean_cqes(struct i40iw_qp *iwqp, struct i40iw_cq *iwcq)
  * i40iw_destroy_qp - destroy qp
  * @ibqp: qp's ib pointer also to get to device's qp address
  */
-static int i40iw_destroy_qp(struct ib_qp *ibqp)
+static int i40iw_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct i40iw_qp *iwqp = to_iwqp(ibqp);
 
@@ -744,8 +743,8 @@ static struct ib_qp *i40iw_create_qp(struct ib_pd *ibpd,
                err_code = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
                if (err_code) {
                        i40iw_pr_err("copy_to_udata failed\n");
-                       i40iw_destroy_qp(&iwqp->ibqp);
-                          /* let the completion of the qp destroy free the qp */
+                       i40iw_destroy_qp(&iwqp->ibqp, udata);
+                       /* let the completion of the qp destroy free the qp */
                        return ERR_PTR(err_code);
                }
        }
@@ -1063,8 +1062,9 @@ void i40iw_cq_wq_destroy(struct i40iw_device *iwdev, struct i40iw_sc_cq *cq)
 /**
  * i40iw_destroy_cq - destroy cq
  * @ib_cq: cq pointer
+ * @udata: user data or NULL for kernel object
  */
-static int i40iw_destroy_cq(struct ib_cq *ib_cq)
+static int i40iw_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct i40iw_cq *iwcq;
        struct i40iw_device *iwdev;
@@ -1089,12 +1089,10 @@ static int i40iw_destroy_cq(struct ib_cq *ib_cq)
  * i40iw_create_cq - create cq
  * @ibdev: device pointer from stack
  * @attr: attributes for cq
- * @context: user context created during alloc
  * @udata: user data
  */
 static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
                                     const struct ib_cq_init_attr *attr,
-                                    struct ib_ucontext *context,
                                     struct ib_udata *udata)
 {
        struct i40iw_device *iwdev = to_iwdev(ibdev);
@@ -1144,14 +1142,14 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
        info.ceq_id_valid = true;
        info.ceqe_mask = 1;
        info.type = I40IW_CQ_TYPE_IWARP;
-       if (context) {
-               struct i40iw_ucontext *ucontext;
+       if (udata) {
+               struct i40iw_ucontext *ucontext = rdma_udata_to_drv_context(
+                       udata, struct i40iw_ucontext, ibucontext);
                struct i40iw_create_cq_req req;
                struct i40iw_cq_mr *cqmr;
 
                memset(&req, 0, sizeof(req));
                iwcq->user_mode = true;
-               ucontext = to_ucontext(context);
                if (ib_copy_from_udata(&req, udata, sizeof(struct i40iw_create_cq_req))) {
                        err_code = -EFAULT;
                        goto cq_free_resources;
@@ -1221,7 +1219,7 @@ static struct ib_cq *i40iw_create_cq(struct ib_device *ibdev,
                goto cq_free_resources;
        }
 
-       if (context) {
+       if (udata) {
                struct i40iw_create_cq_resp resp;
 
                memset(&resp, 0, sizeof(resp));
@@ -1340,52 +1338,21 @@ static void i40iw_copy_user_pgaddrs(struct i40iw_mr *iwmr,
        struct i40iw_pbl *iwpbl = &iwmr->iwpbl;
        struct i40iw_pble_alloc *palloc = &iwpbl->pble_alloc;
        struct i40iw_pble_info *pinfo;
-       struct sg_dma_page_iter sg_iter;
-       u64 pg_addr = 0;
+       struct ib_block_iter biter;
        u32 idx = 0;
-       bool first_pg = true;
 
        pinfo = (level == I40IW_LEVEL_1) ? NULL : palloc->level2.leaf;
 
        if (iwmr->type == IW_MEMREG_TYPE_QP)
                iwpbl->qp_mr.sq_page = sg_page(region->sg_head.sgl);
 
-       for_each_sg_dma_page (region->sg_head.sgl, &sg_iter, region->nmap, 0) {
-               pg_addr = sg_page_iter_dma_address(&sg_iter);
-               if (first_pg)
-                       *pbl = cpu_to_le64(pg_addr & iwmr->page_msk);
-               else if (!(pg_addr & ~iwmr->page_msk))
-                       *pbl = cpu_to_le64(pg_addr);
-               else
-                       continue;
-
-               first_pg = false;
+       rdma_for_each_block(region->sg_head.sgl, &biter, region->nmap,
+                           iwmr->page_size) {
+               *pbl = rdma_block_iter_dma_address(&biter);
                pbl = i40iw_next_pbl_addr(pbl, &pinfo, &idx);
        }
 }
 
-/**
- * i40iw_set_hugetlb_params - set MR pg size and mask to huge pg values.
- * @addr: virtual address
- * @iwmr: mr pointer for this memory registration
- */
-static void i40iw_set_hugetlb_values(u64 addr, struct i40iw_mr *iwmr)
-{
-       struct vm_area_struct *vma;
-       struct hstate *h;
-
-       down_read(&current->mm->mmap_sem);
-       vma = find_vma(current->mm, addr);
-       if (vma && is_vm_hugetlb_page(vma)) {
-               h = hstate_vma(vma);
-               if (huge_page_size(h) == 0x200000) {
-                       iwmr->page_size = huge_page_size(h);
-                       iwmr->page_msk = huge_page_mask(h);
-               }
-       }
-       up_read(&current->mm->mmap_sem);
-}
-
 /**
  * i40iw_check_mem_contiguous - check if pbls stored in arr are contiguous
  * @arr: lvl1 pbl array
@@ -1601,10 +1568,10 @@ static int i40iw_hw_alloc_stag(struct i40iw_device *iwdev, struct i40iw_mr *iwmr
  * @pd: ibpd pointer
  * @mr_type: memory for stag registrion
  * @max_num_sg: man number of pages
+ * @udata: user data or NULL for kernel objects
  */
-static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd,
-                                   enum ib_mr_type mr_type,
-                                   u32 max_num_sg)
+static struct ib_mr *i40iw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                                   u32 max_num_sg, struct ib_udata *udata)
 {
        struct i40iw_pd *iwpd = to_iwpd(pd);
        struct i40iw_device *iwdev = to_iwdev(pd->device);
@@ -1841,10 +1808,9 @@ static struct ib_mr *i40iw_reg_user_mr(struct ib_pd *pd,
        iwmr->ibmr.device = pd->device;
 
        iwmr->page_size = PAGE_SIZE;
-       iwmr->page_msk = PAGE_MASK;
-
-       if (region->hugetlb && (req.reg_type == IW_MEMREG_TYPE_MEM))
-               i40iw_set_hugetlb_values(start, iwmr);
+       if (req.reg_type == IW_MEMREG_TYPE_MEM)
+               iwmr->page_size = ib_umem_find_best_pgsz(region, SZ_4K | SZ_2M,
+                                                        virt);
 
        region_length = region->length + (start & (iwmr->page_size - 1));
        pg_shift = ffs(iwmr->page_size) - 1;
@@ -2038,7 +2004,7 @@ static void i40iw_del_memlist(struct i40iw_mr *iwmr,
  * i40iw_dereg_mr - deregister mr
  * @ib_mr: mr ptr for dereg
  */
-static int i40iw_dereg_mr(struct ib_mr *ib_mr)
+static int i40iw_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct ib_pd *ibpd = ib_mr->pd;
        struct i40iw_pd *iwpd = to_iwpd(ibpd);
@@ -2058,9 +2024,12 @@ static int i40iw_dereg_mr(struct ib_mr *ib_mr)
        if (iwmr->type != IW_MEMREG_TYPE_MEM) {
                /* region is released. only test for userness. */
                if (iwmr->region) {
-                       struct i40iw_ucontext *ucontext;
+                       struct i40iw_ucontext *ucontext =
+                               rdma_udata_to_drv_context(
+                                       udata,
+                                       struct i40iw_ucontext,
+                                       ibucontext);
 
-                       ucontext = to_ucontext(ibpd->uobject->context);
                        i40iw_del_memlist(iwmr, ucontext);
                }
                if (iwpbl->pbl_allocated && iwmr->type != IW_MEMREG_TYPE_QP)
@@ -2703,6 +2672,14 @@ static const struct ib_device_ops i40iw_dev_ops = {
        .get_dma_mr = i40iw_get_dma_mr,
        .get_hw_stats = i40iw_get_hw_stats,
        .get_port_immutable = i40iw_port_immutable,
+       .iw_accept = i40iw_accept,
+       .iw_add_ref = i40iw_add_ref,
+       .iw_connect = i40iw_connect,
+       .iw_create_listen = i40iw_create_listen,
+       .iw_destroy_listen = i40iw_destroy_listen,
+       .iw_get_qp = i40iw_get_qp,
+       .iw_reject = i40iw_reject,
+       .iw_rem_ref = i40iw_rem_ref,
        .map_mr_sg = i40iw_map_mr_sg,
        .mmap = i40iw_mmap,
        .modify_qp = i40iw_modify_qp,
@@ -2766,22 +2743,8 @@ static struct i40iw_ib_device *i40iw_init_rdma_device(struct i40iw_device *iwdev
        iwibdev->ibdev.phys_port_cnt = 1;
        iwibdev->ibdev.num_comp_vectors = iwdev->ceqs_count;
        iwibdev->ibdev.dev.parent = &pcidev->dev;
-       iwibdev->ibdev.iwcm = kzalloc(sizeof(*iwibdev->ibdev.iwcm), GFP_KERNEL);
-       if (!iwibdev->ibdev.iwcm) {
-               ib_dealloc_device(&iwibdev->ibdev);
-               return NULL;
-       }
-
-       iwibdev->ibdev.iwcm->add_ref = i40iw_add_ref;
-       iwibdev->ibdev.iwcm->rem_ref = i40iw_rem_ref;
-       iwibdev->ibdev.iwcm->get_qp = i40iw_get_qp;
-       iwibdev->ibdev.iwcm->connect = i40iw_connect;
-       iwibdev->ibdev.iwcm->accept = i40iw_accept;
-       iwibdev->ibdev.iwcm->reject = i40iw_reject;
-       iwibdev->ibdev.iwcm->create_listen = i40iw_create_listen;
-       iwibdev->ibdev.iwcm->destroy_listen = i40iw_destroy_listen;
-       memcpy(iwibdev->ibdev.iwcm->ifname, netdev->name,
-              sizeof(iwibdev->ibdev.iwcm->ifname));
+       memcpy(iwibdev->ibdev.iw_ifname, netdev->name,
+              sizeof(iwibdev->ibdev.iw_ifname));
        ib_set_device_ops(&iwibdev->ibdev, &i40iw_dev_ops);
 
        return iwibdev;
@@ -2812,8 +2775,6 @@ void i40iw_destroy_rdma_device(struct i40iw_ib_device *iwibdev)
                return;
 
        ib_unregister_device(&iwibdev->ibdev);
-       kfree(iwibdev->ibdev.iwcm);
-       iwibdev->ibdev.iwcm = NULL;
        wait_event_timeout(iwibdev->iwdev->close_wq,
                           !atomic64_read(&iwibdev->iwdev->use_count),
                           I40IW_EVENT_TIMEOUT);
@@ -2841,8 +2802,6 @@ int i40iw_register_rdma_device(struct i40iw_device *iwdev)
 
        return 0;
 error:
-       kfree(iwdev->iwibdev->ibdev.iwcm);
-       iwdev->iwibdev->ibdev.iwcm = NULL;
        ib_dealloc_device(&iwdev->iwibdev->ibdev);
        return ret;
 }
index 76cf173..3a41375 100644 (file)
@@ -94,8 +94,7 @@ struct i40iw_mr {
        struct ib_umem *region;
        u16 type;
        u32 page_cnt;
-       u32 page_size;
-       u64 page_msk;
+       u64 page_size;
        u32 npages;
        u32 stag;
        u64 length;
index 1672808..02a169f 100644 (file)
 
 #include "mlx4_ib.h"
 
-static struct ib_ah *create_ib_ah(struct ib_pd *pd,
-                                 struct rdma_ah_attr *ah_attr,
-                                 struct mlx4_ib_ah *ah)
+static void create_ib_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
 {
-       struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+       struct mlx4_ib_ah *ah = to_mah(ib_ah);
+       struct mlx4_dev *dev = to_mdev(ib_ah->device)->dev;
 
-       ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
+       ah->av.ib.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
                            (rdma_ah_get_port_num(ah_attr) << 24));
        ah->av.ib.g_slid  = rdma_ah_get_path_bits(ah_attr);
        ah->av.ib.sl_tclass_flowlabel =
@@ -73,15 +72,12 @@ static struct ib_ah *create_ib_ah(struct ib_pd *pd,
                        --static_rate;
                ah->av.ib.stat_rate = static_rate;
        }
-
-       return &ah->ibah;
 }
 
-static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
-                                   struct rdma_ah_attr *ah_attr,
-                                   struct mlx4_ib_ah *ah)
+static int create_iboe_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr)
 {
-       struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+       struct mlx4_ib_dev *ibdev = to_mdev(ib_ah->device);
+       struct mlx4_ib_ah *ah = to_mah(ib_ah);
        const struct ib_gid_attr *gid_attr;
        struct mlx4_dev *dev = ibdev->dev;
        int is_mcast = 0;
@@ -103,12 +99,14 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
         */
        gid_attr = ah_attr->grh.sgid_attr;
        if (gid_attr) {
-               if (is_vlan_dev(gid_attr->ndev))
-                       vlan_tag = vlan_dev_vlan_id(gid_attr->ndev);
-               memcpy(ah->av.eth.s_mac, gid_attr->ndev->dev_addr, ETH_ALEN);
+               ret = rdma_read_gid_l2_fields(gid_attr, &vlan_tag,
+                                             &ah->av.eth.s_mac[0]);
+               if (ret)
+                       return ret;
+
                ret = mlx4_ib_gid_index_to_real_index(ibdev, gid_attr);
                if (ret < 0)
-                       return ERR_PTR(ret);
+                       return ret;
                ah->av.eth.gid_index = ret;
        } else {
                /* mlx4_ib_create_ah_slave fills in the s_mac and the vlan */
@@ -117,7 +115,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
 
        if (vlan_tag < 0x1000)
                vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
-       ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn |
+       ah->av.eth.port_pd = cpu_to_be32(to_mpd(ib_ah->pd)->pdn |
                                         (rdma_ah_get_port_num(ah_attr) << 24));
        ah->av.eth.vlan = cpu_to_be16(vlan_tag);
        ah->av.eth.hop_limit = grh->hop_limit;
@@ -140,63 +138,45 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd,
        memcpy(ah->av.eth.dgid, grh->dgid.raw, 16);
        ah->av.eth.sl_tclass_flowlabel |= cpu_to_be32(rdma_ah_get_sl(ah_attr)
                                                      << 29);
-       return &ah->ibah;
+       return 0;
 }
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                               u32 flags, struct ib_udata *udata)
+int mlx4_ib_create_ah(struct ib_ah *ib_ah, struct rdma_ah_attr *ah_attr,
+                     u32 flags, struct ib_udata *udata)
 
 {
-       struct mlx4_ib_ah *ah;
-       struct ib_ah *ret;
-
-       ah = kzalloc(sizeof *ah, GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
-
        if (ah_attr->type == RDMA_AH_ATTR_TYPE_ROCE) {
-               if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH)) {
-                       ret = ERR_PTR(-EINVAL);
-               } else {
-                       /*
-                        * TBD: need to handle the case when we get
-                        * called in an atomic context and there we
-                        * might sleep.  We don't expect this
-                        * currently since we're working with link
-                        * local addresses which we can translate
-                        * without going to sleep.
-                        */
-                       ret = create_iboe_ah(pd, ah_attr, ah);
-               }
-
-               if (IS_ERR(ret))
-                       kfree(ah);
+               if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
+                       return -EINVAL;
+               /*
+                * TBD: need to handle the case when we get
+                * called in an atomic context and there we
+                * might sleep.  We don't expect this
+                * currently since we're working with link
+                * local addresses which we can translate
+                * without going to sleep.
+                */
+               return create_iboe_ah(ib_ah, ah_attr);
+       }
 
-               return ret;
-       } else
-               return create_ib_ah(pd, ah_attr, ah); /* never fails */
+       create_ib_ah(ib_ah, ah_attr);
+       return 0;
 }
 
-/* AH's created via this call must be free'd by mlx4_ib_destroy_ah. */
-struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
-                                     struct rdma_ah_attr *ah_attr,
-                                     int slave_sgid_index, u8 *s_mac,
-                                     u16 vlan_tag)
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+                           int slave_sgid_index, u8 *s_mac, u16 vlan_tag)
 {
        struct rdma_ah_attr slave_attr = *ah_attr;
-       struct mlx4_ib_ah *mah;
-       struct ib_ah *ah;
+       struct mlx4_ib_ah *mah = to_mah(ah);
+       int ret;
 
        slave_attr.grh.sgid_attr = NULL;
        slave_attr.grh.sgid_index = slave_sgid_index;
-       ah = mlx4_ib_create_ah(pd, &slave_attr, 0, NULL);
-       if (IS_ERR(ah))
-               return ah;
+       ret = mlx4_ib_create_ah(ah, &slave_attr, 0, NULL);
+       if (ret)
+               return ret;
 
-       ah->device = pd->device;
-       ah->pd = pd;
        ah->type = ah_attr->type;
-       mah = to_mah(ah);
 
        /* get rid of force-loopback bit */
        mah->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
@@ -208,7 +188,7 @@ struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
                vlan_tag |= (rdma_ah_get_sl(ah_attr) & 7) << 13;
        mah->av.eth.vlan = cpu_to_be16(vlan_tag);
 
-       return ah;
+       return 0;
 }
 
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -250,8 +230,7 @@ int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
        return 0;
 }
 
-int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
+void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-       kfree(to_mah(ah));
-       return 0;
+       return;
 }
index 8c79a48..ecd6cad 100644 (file)
@@ -168,20 +168,17 @@ static void id_map_ent_timeout(struct work_struct *work)
 {
        struct delayed_work *delay = to_delayed_work(work);
        struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
-       struct id_map_entry *db_ent, *found_ent;
+       struct id_map_entry *found_ent;
        struct mlx4_ib_dev *dev = ent->dev;
        struct mlx4_ib_sriov *sriov = &dev->sriov;
        struct rb_root *sl_id_map = &sriov->sl_id_map;
-       int pv_id = (int) ent->pv_cm_id;
 
        spin_lock(&sriov->id_map_lock);
-       db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
-       if (!db_ent)
+       if (!xa_erase(&sriov->pv_id_table, ent->pv_cm_id))
                goto out;
        found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
        if (found_ent && found_ent == ent)
                rb_erase(&found_ent->node, sl_id_map);
-       idr_remove(&sriov->pv_id_table, pv_id);
 
 out:
        list_del(&ent->list);
@@ -196,13 +193,12 @@ static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id)
        struct id_map_entry *ent, *found_ent;
 
        spin_lock(&sriov->id_map_lock);
-       ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+       ent = xa_erase(&sriov->pv_id_table, pv_cm_id);
        if (!ent)
                goto out;
        found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
        if (found_ent && found_ent == ent)
                rb_erase(&found_ent->node, sl_id_map);
-       idr_remove(&sriov->pv_id_table, pv_cm_id);
 out:
        spin_unlock(&sriov->id_map_lock);
 }
@@ -256,25 +252,19 @@ id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
        ent->dev = to_mdev(ibdev);
        INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
 
-       idr_preload(GFP_KERNEL);
-       spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
-
-       ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT);
+       ret = xa_alloc_cyclic(&sriov->pv_id_table, &ent->pv_cm_id, ent,
+                       xa_limit_32b, &sriov->pv_id_next, GFP_KERNEL);
        if (ret >= 0) {
-               ent->pv_cm_id = (u32)ret;
+               spin_lock(&sriov->id_map_lock);
                sl_id_map_add(ibdev, ent);
                list_add_tail(&ent->list, &sriov->cm_list);
-       }
-
-       spin_unlock(&sriov->id_map_lock);
-       idr_preload_end();
-
-       if (ret >= 0)
+               spin_unlock(&sriov->id_map_lock);
                return ent;
+       }
 
        /*error flow*/
        kfree(ent);
-       mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+       mlx4_ib_warn(ibdev, "Allocation failed (err:0x%x)\n", ret);
        return ERR_PTR(-ENOMEM);
 }
 
@@ -290,7 +280,7 @@ id_map_get(struct ib_device *ibdev, int *pv_cm_id, int slave_id, int sl_cm_id)
                if (ent)
                        *pv_cm_id = (int) ent->pv_cm_id;
        } else
-               ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+               ent = xa_load(&sriov->pv_id_table, *pv_cm_id);
        spin_unlock(&sriov->id_map_lock);
 
        return ent;
@@ -407,7 +397,7 @@ void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
        spin_lock_init(&dev->sriov.id_map_lock);
        INIT_LIST_HEAD(&dev->sriov.cm_list);
        dev->sriov.sl_id_map = RB_ROOT;
-       idr_init(&dev->sriov.pv_id_table);
+       xa_init_flags(&dev->sriov.pv_id_table, XA_FLAGS_ALLOC);
 }
 
 /* slave = -1 ==> all slaves */
@@ -444,7 +434,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
                                         struct id_map_entry, node);
 
                        rb_erase(&ent->node, sl_id_map);
-                       idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+                       xa_erase(&sriov->pv_id_table, ent->pv_cm_id);
                }
                list_splice_init(&dev->sriov.cm_list, &lh);
        } else {
@@ -460,7 +450,7 @@ void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
                /* remove those nodes from databases */
                list_for_each_entry_safe(map, tmp_map, &lh, list) {
                        rb_erase(&map->node, sl_id_map);
-                       idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+                       xa_erase(&sriov->pv_id_table, map->pv_cm_id);
                }
 
                /* add remaining nodes from cm_list */
index 03ac723..022a0b4 100644 (file)
@@ -38,6 +38,7 @@
 
 #include "mlx4_ib.h"
 #include <rdma/mlx4-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
 {
@@ -173,7 +174,6 @@ err_buf:
 #define CQ_CREATE_FLAGS_SUPPORTED IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -183,6 +183,8 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
        struct mlx4_uar *uar;
        void *buf_addr;
        int err;
+       struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx4_ib_ucontext, ibucontext);
 
        if (entries < 1 || entries > dev->dev->caps.max_cqes)
                return ERR_PTR(-EINVAL);
@@ -204,7 +206,7 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
        INIT_LIST_HEAD(&cq->send_qp_list);
        INIT_LIST_HEAD(&cq->recv_qp_list);
 
-       if (context) {
+       if (udata) {
                struct mlx4_ib_create_cq ucmd;
 
                if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
@@ -218,12 +220,11 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
                if (err)
                        goto err_cq;
 
-               err = mlx4_ib_db_map_user(to_mucontext(context), udata,
-                                         ucmd.db_addr, &cq->db);
+               err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &cq->db);
                if (err)
                        goto err_mtt;
 
-               uar = &to_mucontext(context)->uar;
+               uar = &context->uar;
                cq->mcq.usage = MLX4_RES_USAGE_USER_VERBS;
        } else {
                err = mlx4_db_alloc(dev->dev, &cq->db, 1);
@@ -248,21 +249,21 @@ struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
        if (dev->eq_table)
                vector = dev->eq_table[vector % ibdev->num_comp_vectors];
 
-       err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
-                           cq->db.dma, &cq->mcq, vector, 0,
+       err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar, cq->db.dma,
+                           &cq->mcq, vector, 0,
                            !!(cq->create_flags &
                               IB_UVERBS_CQ_FLAGS_TIMESTAMP_COMPLETION),
-                           buf_addr, !!context);
+                           buf_addr, !!udata);
        if (err)
                goto err_dbmap;
 
-       if (context)
+       if (udata)
                cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
        else
                cq->mcq.comp = mlx4_ib_cq_comp;
        cq->mcq.event = mlx4_ib_cq_event;
 
-       if (context)
+       if (udata)
                if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
                        err = -EFAULT;
                        goto err_cq_free;
@@ -274,19 +275,19 @@ err_cq_free:
        mlx4_cq_free(dev->dev, &cq->mcq);
 
 err_dbmap:
-       if (context)
-               mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+       if (udata)
+               mlx4_ib_db_unmap_user(context, &cq->db);
 
 err_mtt:
        mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
 
-       if (context)
+       if (udata)
                ib_umem_release(cq->umem);
        else
                mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
 
 err_db:
-       if (!context)
+       if (!udata)
                mlx4_db_free(dev->dev, &cq->db);
 
 err_cq:
@@ -485,7 +486,7 @@ out:
        return err;
 }
 
-int mlx4_ib_destroy_cq(struct ib_cq *cq)
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(cq->device);
        struct mlx4_ib_cq *mcq = to_mcq(cq);
@@ -493,8 +494,13 @@ int mlx4_ib_destroy_cq(struct ib_cq *cq)
        mlx4_cq_free(dev->dev, &mcq->mcq);
        mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
 
-       if (cq->uobject) {
-               mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+       if (udata) {
+               mlx4_ib_db_unmap_user(
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mlx4_ib_ucontext,
+                               ibucontext),
+                       &mcq->db);
                ib_umem_release(mcq->umem);
        } else {
                mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
index 3aab71b..0f39035 100644 (file)
@@ -31,6 +31,7 @@
  */
 
 #include <linux/slab.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "mlx4_ib.h"
 
@@ -41,12 +42,13 @@ struct mlx4_ib_user_db_page {
        int                     refcnt;
 };
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
-                       struct ib_udata *udata, unsigned long virt,
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
                        struct mlx4_db *db)
 {
        struct mlx4_ib_user_db_page *page;
        int err = 0;
+       struct mlx4_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx4_ib_ucontext, ibucontext);
 
        mutex_lock(&context->db_page_mutex);
 
index 936ee13..68c9514 100644 (file)
@@ -1371,9 +1371,9 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
        struct ib_ah *ah;
        struct ib_qp *send_qp = NULL;
        unsigned wire_tx_ix = 0;
-       int ret = 0;
        u16 wire_pkey_ix;
        int src_qpnum;
+       int ret;
 
        sqp_ctx = dev->sriov.sqps[port-1];
 
@@ -1393,12 +1393,20 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
 
        send_qp = sqp->qp;
 
-       /* create ah */
-       ah = mlx4_ib_create_ah_slave(sqp_ctx->pd, attr,
-                                    rdma_ah_retrieve_grh(attr)->sgid_index,
-                                    s_mac, vlan_id);
-       if (IS_ERR(ah))
+       ah = rdma_zalloc_drv_obj(sqp_ctx->pd->device, ib_ah);
+       if (!ah)
                return -ENOMEM;
+
+       ah->device = sqp_ctx->pd->device;
+       ah->pd = sqp_ctx->pd;
+
+       /* create ah */
+       ret = mlx4_ib_create_ah_slave(ah, attr,
+                                     rdma_ah_retrieve_grh(attr)->sgid_index,
+                                     s_mac, vlan_id);
+       if (ret)
+               goto out;
+
        spin_lock(&sqp->tx_lock);
        if (sqp->tx_ix_head - sqp->tx_ix_tail >=
            (MLX4_NUM_TUNNEL_BUFS - 1))
@@ -1410,8 +1418,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
                goto out;
 
        sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
-       if (sqp->tx_ring[wire_tx_ix].ah)
-               mlx4_ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah, 0);
+       kfree(sqp->tx_ring[wire_tx_ix].ah);
        sqp->tx_ring[wire_tx_ix].ah = ah;
        ib_dma_sync_single_for_cpu(&dev->ib_dev,
                                   sqp->tx_ring[wire_tx_ix].buf.map,
@@ -1450,7 +1457,7 @@ int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
        spin_unlock(&sqp->tx_lock);
        sqp->tx_ring[wire_tx_ix].ah = NULL;
 out:
-       mlx4_ib_destroy_ah(ah, 0);
+       kfree(ah);
        return ret;
 }
 
@@ -1902,8 +1909,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
                if (wc.status == IB_WC_SUCCESS) {
                        switch (wc.opcode) {
                        case IB_WC_SEND:
-                               mlx4_ib_destroy_ah(sqp->tx_ring[wc.wr_id &
-                                             (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
+                               kfree(sqp->tx_ring[wc.wr_id &
+                                     (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
                                sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
                                        = NULL;
                                spin_lock(&sqp->tx_lock);
@@ -1931,8 +1938,8 @@ static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
                                 " status = %d, wrid = 0x%llx\n",
                                 ctx->slave, wc.status, wc.wr_id);
                        if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
-                               mlx4_ib_destroy_ah(sqp->tx_ring[wc.wr_id &
-                                             (MLX4_NUM_TUNNEL_BUFS - 1)].ah, 0);
+                               kfree(sqp->tx_ring[wc.wr_id &
+                                     (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
                                sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
                                        = NULL;
                                spin_lock(&sqp->tx_lock);
index 733f7bb..25d09d5 100644 (file)
@@ -1177,8 +1177,7 @@ static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
        }
 }
 
-static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                           struct ib_udata *udata)
+static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct mlx4_ib_pd *pd = to_mpd(ibpd);
        struct ib_device *ibdev = ibpd->device;
@@ -1188,20 +1187,19 @@ static int mlx4_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        if (err)
                return err;
 
-       if (context && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
+       if (udata && ib_copy_to_udata(udata, &pd->pdn, sizeof(__u32))) {
                mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
                return -EFAULT;
        }
        return 0;
 }
 
-static void mlx4_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx4_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
 }
 
 static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
-                                         struct ib_ucontext *context,
                                          struct ib_udata *udata)
 {
        struct mlx4_ib_xrcd *xrcd;
@@ -1243,7 +1241,7 @@ err1:
        return ERR_PTR(err);
 }
 
-static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
        ib_destroy_cq(to_mxrcd(xrcd)->cq);
        ib_dealloc_pd(to_mxrcd(xrcd)->pd);
@@ -2560,7 +2558,10 @@ static const struct ib_device_ops mlx4_ib_dev_ops = {
        .req_notify_cq = mlx4_ib_arm_cq,
        .rereg_user_mr = mlx4_ib_rereg_user_mr,
        .resize_cq = mlx4_ib_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, mlx4_ib_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, mlx4_ib_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, mlx4_ib_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx4_ib_ucontext, ibucontext),
 };
 
index 60dc134..2689710 100644 (file)
@@ -492,10 +492,11 @@ struct mlx4_ib_sriov {
        struct mlx4_sriov_alias_guid alias_guid;
 
        /* CM paravirtualization fields */
-       struct list_head cm_list;
+       struct xarray pv_id_table;
+       u32 pv_id_next;
        spinlock_t id_map_lock;
        struct rb_root sl_id_map;
-       struct idr pv_id_table;
+       struct list_head cm_list;
 };
 
 struct gid_cache_context {
@@ -722,8 +723,7 @@ static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
 int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
 void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
 
-int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context,
-                       struct ib_udata *udata, unsigned long virt,
+int mlx4_ib_db_map_user(struct ib_udata *udata, unsigned long virt,
                        struct mlx4_db *db);
 void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
 
@@ -733,43 +733,38 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                  u64 virt_addr, int access_flags,
                                  struct ib_udata *udata);
-int mlx4_ib_dereg_mr(struct ib_mr *mr);
+int mlx4_ib_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
                               struct ib_udata *udata);
 int mlx4_ib_dealloc_mw(struct ib_mw *mw);
-struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
-                              enum ib_mr_type mr_type,
-                              u32 max_num_sg);
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata);
 int mlx4_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset);
 int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
 struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata);
-int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
 void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
 
-struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                               u32 flags, struct ib_udata *udata);
-struct ib_ah *mlx4_ib_create_ah_slave(struct ib_pd *pd,
-                                     struct rdma_ah_attr *ah_attr,
-                                     int slave_sgid_index, u8 *s_mac,
-                                     u16 vlan_tag);
+int mlx4_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+                     struct ib_udata *udata);
+int mlx4_ib_create_ah_slave(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+                           int slave_sgid_index, u8 *s_mac, u16 vlan_tag);
 int mlx4_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags);
+void mlx4_ib_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
-                                 struct ib_srq_init_attr *init_attr,
-                                 struct ib_udata *udata);
+int mlx4_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+                      struct ib_udata *udata);
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
 int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                          const struct ib_recv_wr **bad_wr);
@@ -777,7 +772,7 @@ int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
 struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
                                struct ib_qp_init_attr *init_attr,
                                struct ib_udata *udata);
-int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 void mlx4_ib_drain_sq(struct ib_qp *qp);
 void mlx4_ib_drain_rq(struct ib_qp *qp);
 int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
@@ -912,7 +907,7 @@ void mlx4_ib_sl2vl_update(struct mlx4_ib_dev *mdev, int port);
 struct ib_wq *mlx4_ib_create_wq(struct ib_pd *pd,
                                struct ib_wq_init_attr *init_attr,
                                struct ib_udata *udata);
-int mlx4_ib_destroy_wq(struct ib_wq *wq);
+int mlx4_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx4_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
                      u32 wq_attr_mask, struct ib_udata *udata);
 
index 395379a..355205a 100644 (file)
@@ -595,7 +595,7 @@ mlx4_free_priv_pages(struct mlx4_ib_mr *mr)
        }
 }
 
-int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct mlx4_ib_mr *mr = to_mmr(ibmr);
        int ret;
@@ -655,9 +655,8 @@ int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
        return 0;
 }
 
-struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd,
-                              enum ib_mr_type mr_type,
-                              u32 max_num_sg)
+struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(pd->device);
        struct mlx4_ib_mr *mr;
index 9426936..5221c07 100644 (file)
@@ -1041,11 +1041,11 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
                        goto err_mtt;
 
                if (qp_has_rq(init_attr)) {
-                       err = mlx4_ib_db_map_user(
-                               context, udata,
-                               (src == MLX4_IB_QP_SRC) ? ucmd.qp.db_addr :
+                       err = mlx4_ib_db_map_user(udata,
+                                                 (src == MLX4_IB_QP_SRC) ?
+                                                         ucmd.qp.db_addr :
                                                          ucmd.wq.db_addr,
-                               &qp->db);
+                                                 &qp->db);
                        if (err)
                                goto err_mtt;
                }
@@ -1338,7 +1338,8 @@ static void destroy_qp_rss(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
 }
 
 static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
-                             enum mlx4_ib_source_type src, bool is_user)
+                             enum mlx4_ib_source_type src,
+                             struct ib_udata *udata)
 {
        struct mlx4_ib_cq *send_cq, *recv_cq;
        unsigned long flags;
@@ -1380,7 +1381,7 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
        list_del(&qp->qps_list);
        list_del(&qp->cq_send_list);
        list_del(&qp->cq_recv_list);
-       if (!is_user) {
+       if (!udata) {
                __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
                                 qp->ibqp.srq ? to_msrq(qp->ibqp.srq): NULL);
                if (send_cq != recv_cq)
@@ -1398,19 +1399,26 @@ static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
                if (qp->flags & MLX4_IB_QP_NETIF)
                        mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
                else if (src == MLX4_IB_RWQ_SRC)
-                       mlx4_ib_release_wqn(to_mucontext(
-                                           qp->ibwq.uobject->context), qp, 1);
+                       mlx4_ib_release_wqn(
+                               rdma_udata_to_drv_context(
+                                       udata,
+                                       struct mlx4_ib_ucontext,
+                                       ibucontext),
+                               qp, 1);
                else
                        mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
        }
 
        mlx4_mtt_cleanup(dev->dev, &qp->mtt);
 
-       if (is_user) {
+       if (udata) {
                if (qp->rq.wqe_cnt) {
-                       struct mlx4_ib_ucontext *mcontext = !src ?
-                               to_mucontext(qp->ibqp.uobject->context) :
-                               to_mucontext(qp->ibwq.uobject->context);
+                       struct mlx4_ib_ucontext *mcontext =
+                               rdma_udata_to_drv_context(
+                                       udata,
+                                       struct mlx4_ib_ucontext,
+                                       ibucontext);
+
                        mlx4_ib_db_unmap_user(mcontext, &qp->db);
                }
                ib_umem_release(qp->umem);
@@ -1594,7 +1602,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
        return ibqp;
 }
 
-static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
+static int _mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(qp->device);
        struct mlx4_ib_qp *mqp = to_mqp(qp);
@@ -1615,7 +1623,7 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
        if (qp->rwq_ind_tbl) {
                destroy_qp_rss(dev, mqp);
        } else {
-               destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, qp->uobject);
+               destroy_qp_common(dev, mqp, MLX4_IB_QP_SRC, udata);
        }
 
        if (is_sqp(dev, mqp))
@@ -1626,7 +1634,7 @@ static int _mlx4_ib_destroy_qp(struct ib_qp *qp)
        return 0;
 }
 
-int mlx4_ib_destroy_qp(struct ib_qp *qp)
+int mlx4_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
        struct mlx4_ib_qp *mqp = to_mqp(qp);
 
@@ -1637,7 +1645,7 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp)
                        ib_destroy_qp(sqp->roce_v2_gsi);
        }
 
-       return _mlx4_ib_destroy_qp(qp);
+       return _mlx4_ib_destroy_qp(qp, udata);
 }
 
 static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
@@ -2240,8 +2248,10 @@ static int __mlx4_ib_modify_qp(void *src, enum mlx4_ib_source_type src_type,
 
                if (is_eth) {
                        gid_attr = attr->ah_attr.grh.sgid_attr;
-                       vlan = rdma_vlan_dev_vlan_id(gid_attr->ndev);
-                       memcpy(smac, gid_attr->ndev->dev_addr, ETH_ALEN);
+                       err = rdma_read_gid_l2_fields(gid_attr, &vlan,
+                                                     &smac[0]);
+                       if (err)
+                               goto out;
                }
 
                if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
@@ -4238,7 +4248,7 @@ int mlx4_ib_modify_wq(struct ib_wq *ibwq, struct ib_wq_attr *wq_attr,
        return err;
 }
 
-int mlx4_ib_destroy_wq(struct ib_wq *ibwq)
+int mlx4_ib_destroy_wq(struct ib_wq *ibwq, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(ibwq->device);
        struct mlx4_ib_qp *qp = to_mqp((struct ib_qp *)ibwq);
@@ -4246,7 +4256,7 @@ int mlx4_ib_destroy_wq(struct ib_wq *ibwq)
        if (qp->counter_index)
                mlx4_ib_free_qp_counter(dev, qp);
 
-       destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, 1);
+       destroy_qp_common(dev, qp, MLX4_IB_RWQ_SRC, udata);
 
        kfree(qp);
 
index 381cf89..4bf2946 100644 (file)
@@ -69,14 +69,14 @@ static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
        }
 }
 
-struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
-                                 struct ib_srq_init_attr *init_attr,
-                                 struct ib_udata *udata)
+int mlx4_ib_create_srq(struct ib_srq *ib_srq,
+                      struct ib_srq_init_attr *init_attr,
+                      struct ib_udata *udata)
 {
-       struct mlx4_ib_dev *dev = to_mdev(pd->device);
+       struct mlx4_ib_dev *dev = to_mdev(ib_srq->device);
        struct mlx4_ib_ucontext *ucontext = rdma_udata_to_drv_context(
                udata, struct mlx4_ib_ucontext, ibucontext);
-       struct mlx4_ib_srq *srq;
+       struct mlx4_ib_srq *srq = to_msrq(ib_srq);
        struct mlx4_wqe_srq_next_seg *next;
        struct mlx4_wqe_data_seg *scatter;
        u32 cqn;
@@ -89,11 +89,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
        /* Sanity check SRQ size before proceeding */
        if (init_attr->attr.max_wr  >= dev->dev->caps.max_srq_wqes ||
            init_attr->attr.max_sge >  dev->dev->caps.max_srq_sge)
-               return ERR_PTR(-EINVAL);
-
-       srq = kmalloc(sizeof *srq, GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        mutex_init(&srq->mutex);
        spin_lock_init(&srq->lock);
@@ -111,16 +107,12 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
        if (udata) {
                struct mlx4_ib_create_srq ucmd;
 
-               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
-                       err = -EFAULT;
-                       goto err_srq;
-               }
+               if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+                       return -EFAULT;
 
                srq->umem = ib_umem_get(udata, ucmd.buf_addr, buf_size, 0, 0);
-               if (IS_ERR(srq->umem)) {
-                       err = PTR_ERR(srq->umem);
-                       goto err_srq;
-               }
+               if (IS_ERR(srq->umem))
+                       return PTR_ERR(srq->umem);
 
                err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
                                    srq->umem->page_shift, &srq->mtt);
@@ -131,14 +123,13 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
                if (err)
                        goto err_mtt;
 
-               err = mlx4_ib_db_map_user(ucontext, udata, ucmd.db_addr,
-                                         &srq->db);
+               err = mlx4_ib_db_map_user(udata, ucmd.db_addr, &srq->db);
                if (err)
                        goto err_mtt;
        } else {
                err = mlx4_db_alloc(dev->dev, &srq->db, 0);
                if (err)
-                       goto err_srq;
+                       return err;
 
                *srq->db.db = 0;
 
@@ -185,8 +176,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
        xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
                to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
                (u16) dev->dev->caps.reserved_xrcds;
-       err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
-                            srq->db.dma, &srq->msrq);
+       err = mlx4_srq_alloc(dev->dev, to_mpd(ib_srq->pd)->pdn, cqn, xrcdn,
+                            &srq->mtt, srq->db.dma, &srq->msrq);
        if (err)
                goto err_wrid;
 
@@ -201,7 +192,7 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
 
        init_attr->attr.max_wr = srq->msrq.max - 1;
 
-       return &srq->ibsrq;
+       return 0;
 
 err_wrid:
        if (udata)
@@ -222,10 +213,7 @@ err_db:
        if (!udata)
                mlx4_db_free(dev->dev, &srq->db);
 
-err_srq:
-       kfree(srq);
-
-       return ERR_PTR(err);
+       return err;
 }
 
 int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -272,7 +260,7 @@ int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
        return 0;
 }
 
-int mlx4_ib_destroy_srq(struct ib_srq *srq)
+void mlx4_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
        struct mlx4_ib_dev *dev = to_mdev(srq->device);
        struct mlx4_ib_srq *msrq = to_msrq(srq);
@@ -280,8 +268,13 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq)
        mlx4_srq_free(dev->dev, &msrq->msrq);
        mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
 
-       if (srq->uobject) {
-               mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+       if (udata) {
+               mlx4_ib_db_unmap_user(
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mlx4_ib_ucontext,
+                               ibucontext),
+                       &msrq->db);
                ib_umem_release(msrq->umem);
        } else {
                kvfree(msrq->wrid);
@@ -289,10 +282,6 @@ int mlx4_ib_destroy_srq(struct ib_srq *srq)
                              &msrq->buf);
                mlx4_db_free(dev->dev, &msrq->db);
        }
-
-       kfree(msrq);
-
-       return 0;
 }
 
 void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
index 420ae08..80642dd 100644 (file)
@@ -32,9 +32,8 @@
 
 #include "mlx5_ib.h"
 
-static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
-                                 struct mlx5_ib_ah *ah,
-                                 struct rdma_ah_attr *ah_attr)
+static void create_ib_ah(struct mlx5_ib_dev *dev, struct mlx5_ib_ah *ah,
+                        struct rdma_ah_attr *ah_attr)
 {
        enum ib_gid_type gid_type;
 
@@ -67,21 +66,19 @@ static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev,
                ah->av.fl_mlid = rdma_ah_get_path_bits(ah_attr) & 0x7f;
                ah->av.stat_rate_sl |= (rdma_ah_get_sl(ah_attr) & 0xf);
        }
-
-       return &ah->ibah;
 }
 
-struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                               u32 flags, struct ib_udata *udata)
+int mlx5_ib_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+                     u32 flags, struct ib_udata *udata)
 
 {
-       struct mlx5_ib_ah *ah;
-       struct mlx5_ib_dev *dev = to_mdev(pd->device);
+       struct mlx5_ib_ah *ah = to_mah(ibah);
+       struct mlx5_ib_dev *dev = to_mdev(ibah->device);
        enum rdma_ah_attr_type ah_type = ah_attr->type;
 
        if ((ah_type == RDMA_AH_ATTR_TYPE_ROCE) &&
            !(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (ah_type == RDMA_AH_ATTR_TYPE_ROCE && udata) {
                int err;
@@ -90,21 +87,18 @@ struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
                                   sizeof(resp.dmac);
 
                if (udata->outlen < min_resp_len)
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
 
                resp.response_length = min_resp_len;
 
                memcpy(resp.dmac, ah_attr->roce.dmac, ETH_ALEN);
                err = ib_copy_to_udata(udata, &resp, resp.response_length);
                if (err)
-                       return ERR_PTR(err);
+                       return err;
        }
 
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
-
-       return create_ib_ah(dev, ah, ah_attr); /* never fails */
+       create_ib_ah(dev, ah, ah_attr);
+       return 0;
 }
 
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
@@ -131,8 +125,7 @@ int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr)
        return 0;
 }
 
-int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
+void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags)
 {
-       kfree(to_mah(ah));
-       return 0;
+       return; /* nothing is released here any more: the kfree() of the AH is dropped (presumably the caller now owns the allocation - confirm) */
 }
index be95ac5..e3ec79b 100644 (file)
@@ -82,10 +82,10 @@ int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *dev,
        return mlx5_cmd_exec(dev, in, in_size, out, sizeof(out));
 }
 
-int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
-                         u64 length, u32 alignment)
+int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
+                        u64 length, u32 alignment)
 {
-       struct mlx5_core_dev *dev = memic->dev;
+       struct mlx5_core_dev *dev = dm->dev;
        u64 num_memic_hw_pages = MLX5_CAP_DEV_MEM(dev, memic_bar_size)
                                        >> PAGE_SHIFT;
        u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
@@ -115,17 +115,17 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
                 mlx5_alignment);
 
        while (page_idx < num_memic_hw_pages) {
-               spin_lock(&memic->memic_lock);
-               page_idx = bitmap_find_next_zero_area(memic->memic_alloc_pages,
+               spin_lock(&dm->lock);
+               page_idx = bitmap_find_next_zero_area(dm->memic_alloc_pages,
                                                      num_memic_hw_pages,
                                                      page_idx,
                                                      num_pages, 0);
 
                if (page_idx < num_memic_hw_pages)
-                       bitmap_set(memic->memic_alloc_pages,
+                       bitmap_set(dm->memic_alloc_pages,
                                   page_idx, num_pages);
 
-               spin_unlock(&memic->memic_lock);
+               spin_unlock(&dm->lock);
 
                if (page_idx >= num_memic_hw_pages)
                        break;
@@ -135,10 +135,10 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
 
                ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
                if (ret) {
-                       spin_lock(&memic->memic_lock);
-                       bitmap_clear(memic->memic_alloc_pages,
+                       spin_lock(&dm->lock);
+                       bitmap_clear(dm->memic_alloc_pages,
                                     page_idx, num_pages);
-                       spin_unlock(&memic->memic_lock);
+                       spin_unlock(&dm->lock);
 
                        if (ret == -EAGAIN) {
                                page_idx++;
@@ -157,9 +157,9 @@ int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
        return -ENOMEM;
 }
 
-int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length)
+int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length)
 {
-       struct mlx5_core_dev *dev = memic->dev;
+       struct mlx5_core_dev *dev = dm->dev;
        u64 hw_start_addr = MLX5_CAP64_DEV_MEM(dev, memic_bar_start_addr);
        u32 num_pages = DIV_ROUND_UP(length, PAGE_SIZE);
        u32 out[MLX5_ST_SZ_DW(dealloc_memic_out)] = {0};
@@ -177,15 +177,140 @@ int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length)
        err =  mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
 
        if (!err) {
-               spin_lock(&memic->memic_lock);
-               bitmap_clear(memic->memic_alloc_pages,
+               spin_lock(&dm->lock);
+               bitmap_clear(dm->memic_alloc_pages,
                             start_page_idx, num_pages);
-               spin_unlock(&memic->memic_lock);
+               spin_unlock(&dm->lock);
        }
 
        return err;
 }
 
+int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
+                         u16 uid, phys_addr_t *addr, u32 *obj_id)
+{ /* Reserve free blocks in the per-type bitmap, then create a SW_ICM general object covering them. */
+       struct mlx5_core_dev *dev = dm->dev;
+       u32 num_blocks = DIV_ROUND_UP(length, MLX5_SW_ICM_BLOCK_SIZE(dev));
+       u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+       u32 in[MLX5_ST_SZ_DW(create_sw_icm_in)] = {};
+       unsigned long *block_map; /* allocation bitmap selected by type below */
+       u64 icm_start_addr;
+       u32 log_icm_size;
+       u32 max_blocks;
+       u64 block_idx;
+       void *sw_icm;
+       int ret;
+
+       MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
+                MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
+       MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM);
+       MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid);
+
+       switch (type) {
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+               icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
+                                               steering_sw_icm_start_address);
+               log_icm_size = MLX5_CAP_DEV_MEM(dev, log_steering_sw_icm_size);
+               block_map = dm->steering_sw_icm_alloc_blocks;
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               icm_start_addr = MLX5_CAP64_DEV_MEM(dev,
+                                       header_modify_sw_icm_start_address);
+               log_icm_size = MLX5_CAP_DEV_MEM(dev,
+                                               log_header_modify_sw_icm_size);
+               block_map = dm->header_modify_sw_icm_alloc_blocks;
+               break;
+       default:
+               return -EINVAL; /* type is neither of the two SW ICM kinds handled above */
+       }
+
+       max_blocks = BIT(log_icm_size - MLX5_LOG_SW_ICM_BLOCK_SIZE(dev));
+       spin_lock(&dm->lock); /* the search and the reservation are both done under dm->lock */
+       block_idx = bitmap_find_next_zero_area(block_map,
+                                              max_blocks,
+                                              0, /* start the search at bit 0 */
+                                              num_blocks, 0);
+
+       if (block_idx < max_blocks)
+               bitmap_set(block_map,
+                          block_idx, num_blocks);
+
+       spin_unlock(&dm->lock);
+
+       if (block_idx >= max_blocks)
+               return -ENOMEM; /* no free run of num_blocks blocks was found */
+
+       sw_icm = MLX5_ADDR_OF(create_sw_icm_in, in, sw_icm);
+       icm_start_addr += block_idx << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
+       MLX5_SET64(sw_icm, sw_icm, sw_icm_start_addr,
+                  icm_start_addr);
+       MLX5_SET(sw_icm, sw_icm, log_sw_icm_size, ilog2(length)); /* NOTE(review): ilog2() rounds down while num_blocks rounds up; assumes length is a power of two - confirm callers enforce this */
+
+       ret = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+       if (ret) { /* command failed: release the blocks reserved above */
+               spin_lock(&dm->lock);
+               bitmap_clear(block_map,
+                            block_idx, num_blocks);
+               spin_unlock(&dm->lock);
+
+               return ret;
+       }
+
+       *addr = icm_start_addr; /* address of the first reserved block */
+       *obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
+
+       return 0;
+}
+
+int mlx5_cmd_dealloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
+                           u16 uid, phys_addr_t addr, u32 obj_id)
+{ /* Destroy the SW_ICM general object obj_id; on success clear its blocks in the per-type bitmap. */
+       struct mlx5_core_dev *dev = dm->dev;
+       u32 num_blocks = DIV_ROUND_UP(length, MLX5_SW_ICM_BLOCK_SIZE(dev));
+       u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
+       u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
+       unsigned long *block_map; /* allocation bitmap selected by type below */
+       u64 start_idx;
+       int err;
+
+       switch (type) {
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+               start_idx = /* block index: byte offset from the ICM base address >> log2(block size) */
+                       (addr - MLX5_CAP64_DEV_MEM(
+                                       dev, steering_sw_icm_start_address)) >>
+                       MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
+               block_map = dm->steering_sw_icm_alloc_blocks;
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               start_idx = /* same computation against the header-modify ICM base address */
+                       (addr -
+                        MLX5_CAP64_DEV_MEM(
+                                dev, header_modify_sw_icm_start_address)) >>
+                       MLX5_LOG_SW_ICM_BLOCK_SIZE(dev);
+               block_map = dm->header_modify_sw_icm_alloc_blocks;
+               break;
+       default:
+               return -EINVAL; /* type is neither of the two SW ICM kinds handled above */
+       }
+
+       MLX5_SET(general_obj_in_cmd_hdr, in, opcode,
+                MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
+       MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_SW_ICM);
+       MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, obj_id);
+       MLX5_SET(general_obj_in_cmd_hdr, in, uid, uid);
+
+       err =  mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+       if (err)
+               return err; /* the bitmap is left untouched when the destroy command fails */
+
+       spin_lock(&dm->lock); /* bitmap updates are done under dm->lock */
+       bitmap_clear(block_map,
+                    start_idx, num_blocks);
+       spin_unlock(&dm->lock);
+
+       return 0;
+}
+
 int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out)
 {
        u32 in[MLX5_ST_SZ_DW(ppcnt_reg)] = {};
index 923a7b9..0572dcb 100644 (file)
@@ -44,9 +44,9 @@ int mlx5_cmd_query_cong_params(struct mlx5_core_dev *dev, int cong_point,
 int mlx5_cmd_query_ext_ppcnt_counters(struct mlx5_core_dev *dev, void *out);
 int mlx5_cmd_modify_cong_params(struct mlx5_core_dev *mdev,
                                void *in, int in_size);
-int mlx5_cmd_alloc_memic(struct mlx5_memic *memic, phys_addr_t *addr,
+int mlx5_cmd_alloc_memic(struct mlx5_dm *dm, phys_addr_t *addr,
                         u64 length, u32 alignment);
-int mlx5_cmd_dealloc_memic(struct mlx5_memic *memic, u64 addr, u64 length);
+int mlx5_cmd_dealloc_memic(struct mlx5_dm *dm, phys_addr_t addr, u64 length);
 void mlx5_cmd_dealloc_pd(struct mlx5_core_dev *dev, u32 pdn, u16 uid);
 void mlx5_cmd_destroy_tir(struct mlx5_core_dev *dev, u32 tirn, u16 uid);
 void mlx5_cmd_destroy_tis(struct mlx5_core_dev *dev, u32 tisn, u16 uid);
@@ -65,4 +65,8 @@ int mlx5_cmd_alloc_q_counter(struct mlx5_core_dev *dev, u16 *counter_id,
                             u16 uid);
 int mlx5_cmd_mad_ifc(struct mlx5_core_dev *dev, const void *inb, void *outb,
                     u16 opmod, u8 port);
+int mlx5_cmd_alloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
+                         u16 uid, phys_addr_t *addr, u32 *obj_id);
+int mlx5_cmd_dealloc_sw_icm(struct mlx5_dm *dm, int type, u64 length,
+                           u16 uid, phys_addr_t addr, u32 obj_id);
 #endif /* MLX5_IB_CMD_H */
index 18704e5..2e2e65f 100644 (file)
@@ -679,8 +679,7 @@ static int mini_cqe_res_format_to_hw(struct mlx5_ib_dev *dev, u8 format)
 }
 
 static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
-                         struct ib_ucontext *context, struct mlx5_ib_cq *cq,
-                         int entries, u32 **cqb,
+                         struct mlx5_ib_cq *cq, int entries, u32 **cqb,
                          int *cqe_size, int *index, int *inlen)
 {
        struct mlx5_ib_create_cq ucmd = {};
@@ -691,6 +690,8 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
        int ncont;
        void *cqc;
        int err;
+       struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
 
        ucmdlen = udata->inlen < sizeof(ucmd) ?
                  (sizeof(ucmd) - sizeof(ucmd.flags)) : sizeof(ucmd);
@@ -715,8 +716,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
                return err;
        }
 
-       err = mlx5_ib_db_map_user(to_mucontext(context), udata, ucmd.db_addr,
-                                 &cq->db);
+       err = mlx5_ib_db_map_user(context, udata, ucmd.db_addr, &cq->db);
        if (err)
                goto err_umem;
 
@@ -740,7 +740,7 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
        MLX5_SET(cqc, cqc, log_page_size,
                 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 
-       *index = to_mucontext(context)->bfregi.sys_pages[0];
+       *index = context->bfregi.sys_pages[0];
 
        if (ucmd.cqe_comp_en == 1) {
                int mini_cqe_format;
@@ -782,23 +782,26 @@ static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
                cq->private_flags |= MLX5_IB_CQ_PR_FLAGS_CQE_128_PAD;
        }
 
-       MLX5_SET(create_cq_in, *cqb, uid, to_mucontext(context)->devx_uid);
+       MLX5_SET(create_cq_in, *cqb, uid, context->devx_uid);
        return 0;
 
 err_cqb:
        kvfree(*cqb);
 
 err_db:
-       mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+       mlx5_ib_db_unmap_user(context, &cq->db);
 
 err_umem:
        ib_umem_release(cq->buf.umem);
        return err;
 }
 
-static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)
+static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_udata *udata)
 {
-       mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+       struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
+
+       mlx5_ib_db_unmap_user(context, &cq->db);
        ib_umem_release(cq->buf.umem);
 }
 
@@ -883,7 +886,6 @@ static void notify_soft_wc_handler(struct work_struct *work)
 
 struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -923,9 +925,9 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
        INIT_LIST_HEAD(&cq->list_send_qp);
        INIT_LIST_HEAD(&cq->list_recv_qp);
 
-       if (context) {
-               err = create_cq_user(dev, udata, context, cq, entries,
-                                    &cqb, &cqe_size, &index, &inlen);
+       if (udata) {
+               err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
+                                    &index, &inlen);
                if (err)
                        goto err_create;
        } else {
@@ -962,7 +964,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
        mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
        cq->mcq.irqn = irqn;
-       if (context)
+       if (udata)
                cq->mcq.tasklet_ctx.comp = mlx5_ib_cq_comp;
        else
                cq->mcq.comp  = mlx5_ib_cq_comp;
@@ -970,7 +972,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
 
        INIT_LIST_HEAD(&cq->wc_list);
 
-       if (context)
+       if (udata)
                if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
                        err = -EFAULT;
                        goto err_cmd;
@@ -985,8 +987,8 @@ err_cmd:
 
 err_cqb:
        kvfree(cqb);
-       if (context)
-               destroy_cq_user(cq, context);
+       if (udata)
+               destroy_cq_user(cq, udata);
        else
                destroy_cq_kernel(dev, cq);
 
@@ -996,19 +998,14 @@ err_create:
        return ERR_PTR(err);
 }
 
-
-int mlx5_ib_destroy_cq(struct ib_cq *cq)
+int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(cq->device);
        struct mlx5_ib_cq *mcq = to_mcq(cq);
-       struct ib_ucontext *context = NULL;
-
-       if (cq->uobject)
-               context = cq->uobject->context;
 
        mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
-       if (context)
-               destroy_cq_user(mcq, context);
+       if (udata)
+               destroy_cq_user(mcq, udata);
        else
                destroy_cq_kernel(dev, mcq);
 
index 9e08df7..169ffff 100644 (file)
@@ -85,6 +85,10 @@ int mlx5_ib_devx_create(struct mlx5_ib_dev *dev, bool is_user)
        if (is_user && capable(CAP_NET_RAW) &&
            (MLX5_CAP_GEN(dev->mdev, uctx_cap) & MLX5_UCTX_CAP_RAW_TX))
                cap |= MLX5_UCTX_CAP_RAW_TX;
+       if (is_user && capable(CAP_SYS_RAWIO) &&
+           (MLX5_CAP_GEN(dev->mdev, uctx_cap) &
+            MLX5_UCTX_CAP_INTERNAL_DEV_RES))
+               cap |= MLX5_UCTX_CAP_INTERNAL_DEV_RES;
 
        MLX5_SET(create_uctx_in, in, opcode, MLX5_CMD_OP_CREATE_UCTX);
        MLX5_SET(uctx, uctx, cap, cap);
@@ -373,8 +377,10 @@ static u64 devx_get_obj_id(const void *in)
        return obj_id;
 }
 
-static bool devx_is_valid_obj_id(struct ib_uobject *uobj, const void *in)
+static bool devx_is_valid_obj_id(struct uverbs_attr_bundle *attrs,
+                                struct ib_uobject *uobj, const void *in)
 {
+       struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
        u64 obj_id = devx_get_obj_id(in);
 
        if (!obj_id)
@@ -389,7 +395,6 @@ static bool devx_is_valid_obj_id(struct ib_uobject *uobj, const void *in)
        case UVERBS_OBJECT_SRQ:
        {
                struct mlx5_core_srq *srq = &(to_msrq(uobj->object)->msrq);
-               struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
                u16 opcode;
 
                switch (srq->common.res) {
@@ -681,6 +686,7 @@ static bool devx_is_whitelist_cmd(void *in)
        switch (opcode) {
        case MLX5_CMD_OP_QUERY_HCA_CAP:
        case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+       case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
                return true;
        default:
                return false;
@@ -718,6 +724,7 @@ static bool devx_is_general_cmd(void *in)
        switch (opcode) {
        case MLX5_CMD_OP_QUERY_HCA_CAP:
        case MLX5_CMD_OP_QUERY_HCA_VPORT_CONTEXT:
+       case MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT:
        case MLX5_CMD_OP_QUERY_VPORT_STATE:
        case MLX5_CMD_OP_QUERY_ADAPTER:
        case MLX5_CMD_OP_QUERY_ISSI:
@@ -1117,7 +1124,8 @@ static void devx_cleanup_mkey(struct devx_obj *obj)
 }
 
 static int devx_obj_cleanup(struct ib_uobject *uobject,
-                           enum rdma_remove_reason why)
+                           enum rdma_remove_reason why,
+                           struct uverbs_attr_bundle *attrs)
 {
        u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
        struct devx_obj *obj = uobject->object;
@@ -1135,7 +1143,8 @@ static int devx_obj_cleanup(struct ib_uobject *uobject,
                return ret;
 
        if (obj->flags & DEVX_OBJ_FLAGS_INDIRECT_MKEY) {
-               struct mlx5_ib_dev *dev = to_mdev(uobject->context->device);
+               struct mlx5_ib_dev *dev =
+                       mlx5_udata_to_mdev(&attrs->driver_udata);
 
                call_srcu(&dev->mr_srcu, &obj->devx_mr.rcu,
                          devx_free_indirect_mkey);
@@ -1260,7 +1269,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_MODIFY)(
        if (!devx_is_obj_modify_cmd(cmd_in))
                return -EINVAL;
 
-       if (!devx_is_valid_obj_id(uobj, cmd_in))
+       if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
                return -EINVAL;
 
        cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -1302,7 +1311,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_QUERY)(
        if (!devx_is_obj_query_cmd(cmd_in))
                return -EINVAL;
 
-       if (!devx_is_valid_obj_id(uobj, cmd_in))
+       if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
                return -EINVAL;
 
        cmd_out = uverbs_zalloc(attrs, cmd_out_len);
@@ -1350,7 +1359,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_ASYNC_CMD_FD_ALLOC)(
 
        struct ib_uobject *uobj = uverbs_attr_get_uobject(
                attrs, MLX5_IB_ATTR_DEVX_ASYNC_CMD_FD_ALLOC_HANDLE);
-       struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+       struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
 
        ev_file = container_of(uobj, struct devx_async_cmd_event_file,
                               uobj);
@@ -1412,7 +1421,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_DEVX_OBJ_ASYNC_QUERY)(
        if (err)
                return err;
 
-       if (!devx_is_valid_obj_id(uobj, cmd_in))
+       if (!devx_is_valid_obj_id(attrs, uobj, cmd_in))
                return -EINVAL;
 
        fd_uobj = uverbs_attr_get_uobject(attrs,
@@ -1599,7 +1608,8 @@ err_obj_free:
 }
 
 static int devx_umem_cleanup(struct ib_uobject *uobject,
-                            enum rdma_remove_reason why)
+                            enum rdma_remove_reason why,
+                            struct uverbs_attr_bundle *attrs)
 {
        struct devx_umem *obj = uobject->object;
        u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
@@ -1704,7 +1714,7 @@ static __poll_t devx_async_cmd_event_poll(struct file *filp,
        return pollflags;
 }
 
-const struct file_operations devx_async_cmd_event_fops = {
+static const struct file_operations devx_async_cmd_event_fops = {
        .owner   = THIS_MODULE,
        .read    = devx_async_cmd_event_read,
        .poll    = devx_async_cmd_event_poll,
@@ -1900,7 +1910,7 @@ static bool devx_is_supported(struct ib_device *device)
 {
        struct mlx5_ib_dev *dev = to_mdev(device);
 
-       return !dev->rep && MLX5_CAP_GEN(dev->mdev, log_max_uctx);
+       return MLX5_CAP_GEN(dev->mdev, log_max_uctx);
 }
 
 const struct uapi_definition mlx5_ib_devx_defs[] = {
index 798591a..1fc302d 100644 (file)
@@ -29,6 +29,9 @@ mlx5_ib_ft_type_to_namespace(enum mlx5_ib_uapi_flow_table_type table_type,
        case MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX:
                *namespace = MLX5_FLOW_NAMESPACE_EGRESS;
                break;
+       case MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB:
+               *namespace = MLX5_FLOW_NAMESPACE_FDB;
+               break;
        default:
                return -EINVAL;
        }
@@ -75,7 +78,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
        struct ib_qp *qp = NULL;
        struct ib_uobject *uobj =
                uverbs_attr_get_uobject(attrs, MLX5_IB_ATTR_CREATE_FLOW_HANDLE);
-       struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+       struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
        int len, ret, i;
        u32 counter_id = 0;
 
@@ -93,6 +96,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
            ((dest_devx && dest_qp) || (!dest_devx && !dest_qp)))
                return -EINVAL;
 
+       /* Allow only DEVX object as dest when inserting to FDB */
+       if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB && !dest_devx)
+               return -EINVAL;
+
        if (dest_devx) {
                devx_obj = uverbs_attr_get_obj(
                        attrs, MLX5_IB_ATTR_CREATE_FLOW_DEST_DEVX);
@@ -104,6 +111,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_CREATE_FLOW)(
                 */
                if (!mlx5_ib_devx_is_flow_dest(devx_obj, &dest_id, &dest_type))
                        return -EINVAL;
+               /* Allow only flow table as dest when inserting to FDB */
+               if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB &&
+                   dest_type != MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE)
+                       return -EINVAL;
        } else if (dest_qp) {
                struct mlx5_ib_qp *mqp;
 
@@ -189,7 +200,8 @@ err_out:
 }
 
 static int flow_matcher_cleanup(struct ib_uobject *uobject,
-                               enum rdma_remove_reason why)
+                               enum rdma_remove_reason why,
+                               struct uverbs_attr_bundle *attrs)
 {
        struct mlx5_ib_flow_matcher *obj = uobject->object;
        int ret;
@@ -202,21 +214,67 @@ static int flow_matcher_cleanup(struct ib_uobject *uobject,
        return 0;
 }
 
+static int mlx5_ib_matcher_ns(struct uverbs_attr_bundle *attrs,
+                             struct mlx5_ib_flow_matcher *obj)
+{
+       enum mlx5_ib_uapi_flow_table_type ft_type =
+               MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX;
+       u32 flags;
+       int err;
+
+       /* New users should use MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE and older
+        * users should switch to it. We leave this to not break userspace
+        */
+       if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE) &&
+           uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS))
+               return -EINVAL;
+
+       if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE)) {
+               err = uverbs_get_const(&ft_type, attrs,
+                                      MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE);
+               if (err)
+                       return err;
+
+               err = mlx5_ib_ft_type_to_namespace(ft_type, &obj->ns_type);
+               if (err)
+                       return err;
+
+               return 0;
+       }
+
+       if (uverbs_attr_is_valid(attrs, MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS)) {
+               err = uverbs_get_flags32(&flags, attrs,
+                                        MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+                                        IB_FLOW_ATTR_FLAGS_EGRESS);
+               if (err)
+                       return err;
+
+               if (flags) {
+                       mlx5_ib_ft_type_to_namespace(
+                               MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX,
+                               &obj->ns_type);
+                       return 0;
+               }
+       }
+
+       obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
+
+       return 0;
+}
+
 static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
        struct uverbs_attr_bundle *attrs)
 {
        struct ib_uobject *uobj = uverbs_attr_get_uobject(
                attrs, MLX5_IB_ATTR_FLOW_MATCHER_CREATE_HANDLE);
-       struct mlx5_ib_dev *dev = to_mdev(uobj->context->device);
+       struct mlx5_ib_dev *dev = mlx5_udata_to_mdev(&attrs->driver_udata);
        struct mlx5_ib_flow_matcher *obj;
-       u32 flags;
        int err;
 
        obj = kzalloc(sizeof(struct mlx5_ib_flow_matcher), GFP_KERNEL);
        if (!obj)
                return -ENOMEM;
 
-       obj->ns_type = MLX5_FLOW_NAMESPACE_BYPASS;
        obj->mask_len = uverbs_attr_get_len(
                attrs, MLX5_IB_ATTR_FLOW_MATCHER_MATCH_MASK);
        err = uverbs_copy_from(&obj->matcher_mask,
@@ -242,19 +300,10 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_MATCHER_CREATE)(
        if (err)
                goto end;
 
-       err = uverbs_get_flags32(&flags, attrs,
-                                MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
-                                IB_FLOW_ATTR_FLAGS_EGRESS);
+       err = mlx5_ib_matcher_ns(attrs, obj);
        if (err)
                goto end;
 
-       if (flags) {
-               err = mlx5_ib_ft_type_to_namespace(
-                       MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX, &obj->ns_type);
-               if (err)
-                       goto end;
-       }
-
        uobj->object = obj;
        obj->mdev = dev->mdev;
        atomic_set(&obj->usecnt, 0);
@@ -326,7 +375,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
 {
        struct ib_uobject *uobj = uverbs_attr_get_uobject(
                attrs, MLX5_IB_ATTR_CREATE_MODIFY_HEADER_HANDLE);
-       struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+       struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
        enum mlx5_ib_uapi_flow_table_type ft_type;
        struct ib_flow_action *action;
        int num_actions;
@@ -353,7 +402,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_MODIFY_HEADER)(
        if (IS_ERR(action))
                return PTR_ERR(action);
 
-       uverbs_flow_action_fill_action(action, uobj, uobj->context->device,
+       uverbs_flow_action_fill_action(action, uobj, &mdev->ib_dev,
                                       IB_FLOW_ACTION_UNSPECIFIED);
 
        return 0;
@@ -445,7 +494,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
 {
        struct ib_uobject *uobj = uverbs_attr_get_uobject(attrs,
                MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_HANDLE);
-       struct mlx5_ib_dev *mdev = to_mdev(uobj->context->device);
+       struct mlx5_ib_dev *mdev = mlx5_udata_to_mdev(&attrs->driver_udata);
        enum mlx5_ib_uapi_flow_action_packet_reformat_type dv_prt;
        enum mlx5_ib_uapi_flow_table_type ft_type;
        struct mlx5_ib_flow_action *maction;
@@ -493,8 +542,7 @@ static int UVERBS_HANDLER(MLX5_IB_METHOD_FLOW_ACTION_CREATE_PACKET_REFORMAT)(
                        goto free_maction;
        }
 
-       uverbs_flow_action_fill_action(&maction->ib_action, uobj,
-                                      uobj->context->device,
+       uverbs_flow_action_fill_action(&maction->ib_action, uobj, &mdev->ib_dev,
                                       IB_FLOW_ACTION_UNSPECIFIED);
        return 0;
 
@@ -605,6 +653,9 @@ DECLARE_UVERBS_NAMED_METHOD(
                           UA_MANDATORY),
        UVERBS_ATTR_FLAGS_IN(MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
                             enum ib_flow_flags,
+                            UA_OPTIONAL),
+       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
+                            enum mlx5_ib_uapi_flow_table_type,
                             UA_OPTIONAL));
 
 DECLARE_UVERBS_NAMED_METHOD_DESTROY(
@@ -619,15 +670,9 @@ DECLARE_UVERBS_NAMED_OBJECT(MLX5_IB_OBJECT_FLOW_MATCHER,
                            &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_CREATE),
                            &UVERBS_METHOD(MLX5_IB_METHOD_FLOW_MATCHER_DESTROY));
 
-static bool flow_is_supported(struct ib_device *device)
-{
-       return !to_mdev(device)->rep;
-}
-
 const struct uapi_definition mlx5_ib_flow_defs[] = {
        UAPI_DEF_CHAIN_OBJ_TREE_NAMED(
-               MLX5_IB_OBJECT_FLOW_MATCHER,
-               UAPI_DEF_IS_OBJ_SUPPORTED(flow_is_supported)),
+               MLX5_IB_OBJECT_FLOW_MATCHER),
        UAPI_DEF_CHAIN_OBJ_TREE(
                UVERBS_OBJECT_FLOW,
                &mlx5_ib_fs),
index b8639ac..cbcc40d 100644 (file)
@@ -7,69 +7,59 @@
 #include "ib_rep.h"
 #include "srq.h"
 
-static const struct mlx5_ib_profile vf_rep_profile = {
-       STAGE_CREATE(MLX5_IB_STAGE_INIT,
-                    mlx5_ib_stage_init_init,
-                    mlx5_ib_stage_init_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_FLOW_DB,
-                    mlx5_ib_stage_rep_flow_db_init,
-                    NULL),
-       STAGE_CREATE(MLX5_IB_STAGE_CAPS,
-                    mlx5_ib_stage_caps_init,
-                    NULL),
-       STAGE_CREATE(MLX5_IB_STAGE_NON_DEFAULT_CB,
-                    mlx5_ib_stage_rep_non_default_cb,
-                    NULL),
-       STAGE_CREATE(MLX5_IB_STAGE_ROCE,
-                    mlx5_ib_stage_rep_roce_init,
-                    mlx5_ib_stage_rep_roce_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_SRQ,
-                    mlx5_init_srq_table,
-                    mlx5_cleanup_srq_table),
-       STAGE_CREATE(MLX5_IB_STAGE_DEVICE_RESOURCES,
-                    mlx5_ib_stage_dev_res_init,
-                    mlx5_ib_stage_dev_res_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_COUNTERS,
-                    mlx5_ib_stage_counters_init,
-                    mlx5_ib_stage_counters_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_BFREG,
-                    mlx5_ib_stage_bfrag_init,
-                    mlx5_ib_stage_bfrag_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
-                    NULL,
-                    mlx5_ib_stage_pre_ib_reg_umr_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
-                    mlx5_ib_stage_ib_reg_init,
-                    mlx5_ib_stage_ib_reg_cleanup),
-       STAGE_CREATE(MLX5_IB_STAGE_POST_IB_REG_UMR,
-                    mlx5_ib_stage_post_ib_reg_umr_init,
-                    NULL),
-};
+static int
+mlx5_ib_set_vport_rep(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
+{
+       struct mlx5_ib_dev *ibdev;
+       int vport_index;
+
+       ibdev = mlx5_ib_get_uplink_ibdev(dev->priv.eswitch);
+       vport_index = ibdev->free_port++;
+
+       ibdev->port[vport_index].rep = rep;
+       write_lock(&ibdev->port[vport_index].roce.netdev_lock);
+       ibdev->port[vport_index].roce.netdev =
+               mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
+       write_unlock(&ibdev->port[vport_index].roce.netdev_lock);
+
+       return 0;
+}
 
 static int
 mlx5_ib_vport_rep_load(struct mlx5_core_dev *dev, struct mlx5_eswitch_rep *rep)
 {
+       int num_ports = MLX5_TOTAL_VPORTS(dev);
        const struct mlx5_ib_profile *profile;
        struct mlx5_ib_dev *ibdev;
+       int vport_index;
 
        if (rep->vport == MLX5_VPORT_UPLINK)
                profile = &uplink_rep_profile;
        else
-               profile = &vf_rep_profile;
+               return mlx5_ib_set_vport_rep(dev, rep);
 
        ibdev = ib_alloc_device(mlx5_ib_dev, ib_dev);
        if (!ibdev)
                return -ENOMEM;
 
-       ibdev->rep = rep;
-       ibdev->mdev = dev;
-       ibdev->num_ports = max(MLX5_CAP_GEN(dev, num_ports),
-                              MLX5_CAP_GEN(dev, num_vhca_ports));
-       if (!__mlx5_ib_add(ibdev, profile)) {
+       ibdev->port = kcalloc(num_ports, sizeof(*ibdev->port),
+                             GFP_KERNEL);
+       if (!ibdev->port) {
                ib_dealloc_device(&ibdev->ib_dev);
-               return -EINVAL;
+               return -ENOMEM;
        }
 
+       ibdev->is_rep = true;
+       vport_index = ibdev->free_port++;
+       ibdev->port[vport_index].rep = rep;
+       ibdev->port[vport_index].roce.netdev =
+               mlx5_ib_get_rep_netdev(dev->priv.eswitch, rep->vport);
+       ibdev->mdev = dev;
+       ibdev->num_ports = num_ports;
+
+       if (!__mlx5_ib_add(ibdev, profile))
+               return -EINVAL;
+
        rep->rep_if[REP_IB].priv = ibdev;
 
        return 0;
@@ -80,13 +70,13 @@ mlx5_ib_vport_rep_unload(struct mlx5_eswitch_rep *rep)
 {
        struct mlx5_ib_dev *dev;
 
-       if (!rep->rep_if[REP_IB].priv)
+       if (!rep->rep_if[REP_IB].priv ||
+           rep->vport != MLX5_VPORT_UPLINK)
                return;
 
        dev = mlx5_ib_rep_to_dev(rep);
        __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
        rep->rep_if[REP_IB].priv = NULL;
-       ib_dealloc_device(&dev->ib_dev);
 }
 
 static void *mlx5_ib_vport_get_proto_dev(struct mlx5_eswitch_rep *rep)
@@ -140,22 +130,21 @@ struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw, int vport)
        return mlx5_eswitch_vport_rep(esw, vport);
 }
 
-int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-                             struct mlx5_ib_sq *sq)
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+                                                  struct mlx5_ib_sq *sq,
+                                                  u16 port)
 {
-       struct mlx5_flow_handle *flow_rule;
        struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+       struct mlx5_eswitch_rep *rep;
 
-       if (!dev->rep)
-               return 0;
+       if (!dev->is_rep || !port)
+               return NULL;
 
-       flow_rule =
-               mlx5_eswitch_add_send_to_vport_rule(esw,
-                                                   dev->rep->vport,
-                                                   sq->base.mqp.qpn);
-       if (IS_ERR(flow_rule))
-               return PTR_ERR(flow_rule);
-       sq->flow_rule = flow_rule;
+       if (!dev->port[port - 1].rep)
+               return ERR_PTR(-EINVAL);
 
-       return 0;
+       rep = dev->port[port - 1].rep;
+
+       return mlx5_eswitch_add_send_to_vport_rule(esw, rep->vport,
+                                                  sq->base.mqp.qpn);
 }
index 798d41e..1d9778d 100644 (file)
@@ -20,8 +20,9 @@ struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
                                           int vport_index);
 void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev);
 void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev);
-int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-                             struct mlx5_ib_sq *sq);
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+                                                  struct mlx5_ib_sq *sq,
+                                                  u16 port);
 struct net_device *mlx5_ib_get_rep_netdev(struct mlx5_eswitch *esw,
                                          int vport_index);
 #else /* CONFIG_MLX5_ESWITCH */
@@ -52,10 +53,12 @@ struct mlx5_eswitch_rep *mlx5_ib_vport_rep(struct mlx5_eswitch *esw,
 
 static inline void mlx5_ib_register_vport_reps(struct mlx5_core_dev *mdev) {}
 static inline void mlx5_ib_unregister_vport_reps(struct mlx5_core_dev *mdev) {}
-static inline int create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-                                           struct mlx5_ib_sq *sq)
+static inline
+struct mlx5_flow_handle *create_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
+                                                  struct mlx5_ib_sq *sq,
+                                                  u16 port)
 {
-       return 0;
+       return NULL;
 }
 
 static inline
index 1aaa205..abac70a 100644 (file)
@@ -156,6 +156,34 @@ static int get_port_state(struct ib_device *ibdev,
        return ret;
 }
 
+static struct mlx5_roce *mlx5_get_rep_roce(struct mlx5_ib_dev *dev,
+                                          struct net_device *ndev,
+                                          u8 *port_num)
+{
+       struct mlx5_eswitch *esw = dev->mdev->priv.eswitch;
+       struct net_device *rep_ndev;
+       struct mlx5_ib_port *port;
+       int i;
+
+       for (i = 0; i < dev->num_ports; i++) {
+               port  = &dev->port[i];
+               if (!port->rep)
+                       continue;
+
+               read_lock(&port->roce.netdev_lock);
+               rep_ndev = mlx5_ib_get_rep_netdev(esw,
+                                                 port->rep->vport);
+               if (rep_ndev == ndev) {
+                       read_unlock(&port->roce.netdev_lock);
+                       *port_num = i + 1;
+                       return &port->roce;
+               }
+               read_unlock(&port->roce.netdev_lock);
+       }
+
+       return NULL;
+}
+
 static int mlx5_netdev_event(struct notifier_block *this,
                             unsigned long event, void *ptr)
 {
@@ -172,22 +200,17 @@ static int mlx5_netdev_event(struct notifier_block *this,
 
        switch (event) {
        case NETDEV_REGISTER:
+               /* Should already be registered during the load */
+               if (ibdev->is_rep)
+                       break;
                write_lock(&roce->netdev_lock);
-               if (ibdev->rep) {
-                       struct mlx5_eswitch *esw = ibdev->mdev->priv.eswitch;
-                       struct net_device *rep_ndev;
-
-                       rep_ndev = mlx5_ib_get_rep_netdev(esw,
-                                                         ibdev->rep->vport);
-                       if (rep_ndev == ndev)
-                               roce->netdev = ndev;
-               } else if (ndev->dev.parent == mdev->device) {
+               if (ndev->dev.parent == mdev->device)
                        roce->netdev = ndev;
-               }
                write_unlock(&roce->netdev_lock);
                break;
 
        case NETDEV_UNREGISTER:
+               /* In case of reps, ib device goes away before the netdevs */
                write_lock(&roce->netdev_lock);
                if (roce->netdev == ndev)
                        roce->netdev = NULL;
@@ -205,6 +228,10 @@ static int mlx5_netdev_event(struct notifier_block *this,
                        dev_put(lag_ndev);
                }
 
+               if (ibdev->is_rep)
+                       roce = mlx5_get_rep_roce(ibdev, ndev, &port_num);
+               if (!roce)
+                       return NOTIFY_DONE;
                if ((upper == ndev || (!upper && ndev == roce->netdev))
                    && ibdev->ib_active) {
                        struct ib_event ibev = { };
@@ -257,11 +284,11 @@ static struct net_device *mlx5_ib_get_netdev(struct ib_device *device,
 
        /* Ensure ndev does not disappear before we invoke dev_hold()
         */
-       read_lock(&ibdev->roce[port_num - 1].netdev_lock);
-       ndev = ibdev->roce[port_num - 1].netdev;
+       read_lock(&ibdev->port[port_num - 1].roce.netdev_lock);
+       ndev = ibdev->port[port_num - 1].roce.netdev;
        if (ndev)
                dev_hold(ndev);
-       read_unlock(&ibdev->roce[port_num - 1].netdev_lock);
+       read_unlock(&ibdev->port[port_num - 1].roce.netdev_lock);
 
 out:
        mlx5_ib_put_native_port_mdev(ibdev, port_num);
@@ -479,9 +506,14 @@ static int mlx5_query_port_roce(struct ib_device *device, u8 port_num,
 
        /* Possible bad flows are checked before filling out props so in case
         * of an error it will still be zeroed out.
+        * Use native port in case of reps
         */
-       err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
-                                  mdev_port_num);
+       if (dev->is_rep)
+               err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+                                          1);
+       else
+               err = mlx5_query_port_ptys(mdev, out, sizeof(out), MLX5_PTYS_EN,
+                                          mdev_port_num);
        if (err)
                goto out;
        ext = MLX5_CAP_PCAM_FEATURE(dev->mdev, ptys_extended_ethernet);
@@ -542,52 +574,22 @@ out:
        return err;
 }
 
-struct mlx5_ib_vlan_info {
-       u16 vlan_id;
-       bool vlan;
-};
-
-static int get_lower_dev_vlan(struct net_device *lower_dev, void *data)
-{
-       struct mlx5_ib_vlan_info *vlan_info = data;
-
-       if (is_vlan_dev(lower_dev)) {
-               vlan_info->vlan = true;
-               vlan_info->vlan_id = vlan_dev_vlan_id(lower_dev);
-       }
-       /* We are interested only in first level vlan device, so
-        * always return 1 to stop iterating over next level devices.
-        */
-       return 1;
-}
-
 static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
                         unsigned int index, const union ib_gid *gid,
                         const struct ib_gid_attr *attr)
 {
        enum ib_gid_type gid_type = IB_GID_TYPE_IB;
-       struct mlx5_ib_vlan_info vlan_info = { };
+       u16 vlan_id = 0xffff;
        u8 roce_version = 0;
        u8 roce_l3_type = 0;
        u8 mac[ETH_ALEN];
+       int ret;
 
        if (gid) {
                gid_type = attr->gid_type;
-               ether_addr_copy(mac, attr->ndev->dev_addr);
-
-               if (is_vlan_dev(attr->ndev)) {
-                       vlan_info.vlan = true;
-                       vlan_info.vlan_id = vlan_dev_vlan_id(attr->ndev);
-               } else {
-                       /* If the netdev is upper device and if it's lower
-                        * lower device is vlan device, consider vlan id of
-                        * the lower vlan device for this gid entry.
-                        */
-                       rcu_read_lock();
-                       netdev_walk_all_lower_dev_rcu(attr->ndev,
-                                       get_lower_dev_vlan, &vlan_info);
-                       rcu_read_unlock();
-               }
+               ret = rdma_read_gid_l2_fields(attr, &vlan_id, &mac[0]);
+               if (ret)
+                       return ret;
        }
 
        switch (gid_type) {
@@ -608,7 +610,7 @@ static int set_roce_addr(struct mlx5_ib_dev *dev, u8 port_num,
 
        return mlx5_core_roce_gid_set(dev->mdev, index, roce_version,
                                      roce_l3_type, gid->raw, mac,
-                                     vlan_info.vlan, vlan_info.vlan_id,
+                                     vlan_id < VLAN_CFI_MASK, vlan_id,
                                      port_num);
 }
 
@@ -1407,7 +1409,9 @@ static int mlx5_ib_rep_query_port(struct ib_device *ibdev, u8 port,
 {
        int ret;
 
-       /* Only link layer == ethernet is valid for representors */
+       /* Only link layer == ethernet is valid for representors
+        * and we always use port 1
+        */
        ret = mlx5_query_port_roce(ibdev, port, props);
        if (ret || !props)
                return ret;
@@ -1954,11 +1958,11 @@ static int mlx5_ib_alloc_ucontext(struct ib_ucontext *uctx,
        print_lib_caps(dev, context->lib_caps);
 
        if (dev->lag_active) {
-               u8 port = mlx5_core_native_port_num(dev->mdev);
+               u8 port = mlx5_core_native_port_num(dev->mdev) - 1;
 
                atomic_set(&context->tx_port_affinity,
                           atomic_add_return(
-                                  1, &dev->roce[port].tx_port_affinity));
+                                  1, &dev->port[port].roce.tx_port_affinity));
        }
 
        return 0;
@@ -2060,21 +2064,22 @@ static int mlx5_ib_mmap_clock_info_page(struct mlx5_ib_dev *dev,
                                        struct vm_area_struct *vma,
                                        struct mlx5_ib_ucontext *context)
 {
-       if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+       if ((vma->vm_end - vma->vm_start != PAGE_SIZE) ||
+           !(vma->vm_flags & VM_SHARED))
                return -EINVAL;
 
        if (get_index(vma->vm_pgoff) != MLX5_IB_CLOCK_INFO_V1)
                return -EOPNOTSUPP;
 
-       if (vma->vm_flags & VM_WRITE)
+       if (vma->vm_flags & (VM_WRITE | VM_EXEC))
                return -EPERM;
        vma->vm_flags &= ~VM_MAYWRITE;
 
-       if (!dev->mdev->clock_info_page)
+       if (!dev->mdev->clock_info)
                return -EOPNOTSUPP;
 
-       return rdma_user_mmap_page(&context->ibucontext, vma,
-                                  dev->mdev->clock_info_page, PAGE_SIZE);
+       return vm_insert_page(vma, vma->vm_start,
+                             virt_to_page(dev->mdev->clock_info));
 }
 
 static int uar_mmap(struct mlx5_ib_dev *dev, enum mlx5_ib_mmap_cmd cmd,
@@ -2259,89 +2264,200 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm
        return 0;
 }
 
-struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
-                              struct ib_ucontext *context,
-                              struct ib_dm_alloc_attr *attr,
-                              struct uverbs_attr_bundle *attrs)
+static inline int check_dm_type_support(struct mlx5_ib_dev *dev,
+                                       u32 type)
 {
-       u64 act_size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
-       struct mlx5_memic *memic = &to_mdev(ibdev)->memic;
-       phys_addr_t memic_addr;
-       struct mlx5_ib_dm *dm;
+       switch (type) {
+       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+               if (!MLX5_CAP_DEV_MEM(dev->mdev, memic))
+                       return -EOPNOTSUPP;
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+               if (!capable(CAP_SYS_RAWIO) ||
+                   !capable(CAP_NET_RAW))
+                       return -EPERM;
+
+               if (!(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner) ||
+                     MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, sw_owner)))
+                       return -EOPNOTSUPP;
+               break;
+       }
+
+       return 0;
+}
+
+static int handle_alloc_dm_memic(struct ib_ucontext *ctx,
+                                struct mlx5_ib_dm *dm,
+                                struct ib_dm_alloc_attr *attr,
+                                struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
        u64 start_offset;
        u32 page_idx;
        int err;
 
-       dm = kzalloc(sizeof(*dm), GFP_KERNEL);
-       if (!dm)
-               return ERR_PTR(-ENOMEM);
-
-       mlx5_ib_dbg(to_mdev(ibdev), "alloc_memic req: user_length=0x%llx act_length=0x%llx log_alignment=%d\n",
-                   attr->length, act_size, attr->alignment);
+       dm->size = roundup(attr->length, MLX5_MEMIC_BASE_SIZE);
 
-       err = mlx5_cmd_alloc_memic(memic, &memic_addr,
-                                  act_size, attr->alignment);
+       err = mlx5_cmd_alloc_memic(dm_db, &dm->dev_addr,
+                                  dm->size, attr->alignment);
        if (err)
-               goto err_free;
+               return err;
 
-       start_offset = memic_addr & ~PAGE_MASK;
-       page_idx = (memic_addr - memic->dev->bar_addr -
-                   MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
+       page_idx = (dm->dev_addr - pci_resource_start(dm_db->dev->pdev, 0) -
+                   MLX5_CAP64_DEV_MEM(dm_db->dev, memic_bar_start_addr)) >>
                    PAGE_SHIFT;
 
+       err = uverbs_copy_to(attrs,
+                            MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
+                            &page_idx, sizeof(page_idx));
+       if (err)
+               goto err_dealloc;
+
+       start_offset = dm->dev_addr & ~PAGE_MASK;
        err = uverbs_copy_to(attrs,
                             MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
                             &start_offset, sizeof(start_offset));
        if (err)
                goto err_dealloc;
 
+       bitmap_set(to_mucontext(ctx)->dm_pages, page_idx,
+                  DIV_ROUND_UP(dm->size, PAGE_SIZE));
+
+       return 0;
+
+err_dealloc:
+       mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
+
+       return err;
+}
+
+static int handle_alloc_dm_sw_icm(struct ib_ucontext *ctx,
+                                 struct mlx5_ib_dm *dm,
+                                 struct ib_dm_alloc_attr *attr,
+                                 struct uverbs_attr_bundle *attrs,
+                                 int type)
+{
+       struct mlx5_dm *dm_db = &to_mdev(ctx->device)->dm;
+       u64 act_size;
+       int err;
+
+       /* Allocation size must be a multiple of the basic block size
+        * and a power of 2.
+        */
+       act_size = roundup(attr->length, MLX5_SW_ICM_BLOCK_SIZE(dm_db->dev));
+       act_size = roundup_pow_of_two(act_size);
+
+       dm->size = act_size;
+       err = mlx5_cmd_alloc_sw_icm(dm_db, type, act_size,
+                                   to_mucontext(ctx)->devx_uid, &dm->dev_addr,
+                                   &dm->icm_dm.obj_id);
+       if (err)
+               return err;
+
        err = uverbs_copy_to(attrs,
-                            MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
-                            &page_idx, sizeof(page_idx));
+                            MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET,
+                            &dm->dev_addr, sizeof(dm->dev_addr));
        if (err)
-               goto err_dealloc;
+               mlx5_cmd_dealloc_sw_icm(dm_db, type, dm->size,
+                                       to_mucontext(ctx)->devx_uid,
+                                       dm->dev_addr, dm->icm_dm.obj_id);
+
+       return err;
+}
+
+struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
+                              struct ib_ucontext *context,
+                              struct ib_dm_alloc_attr *attr,
+                              struct uverbs_attr_bundle *attrs)
+{
+       struct mlx5_ib_dm *dm;
+       enum mlx5_ib_uapi_dm_type type;
+       int err;
 
-       bitmap_set(to_mucontext(context)->dm_pages, page_idx,
-                  DIV_ROUND_UP(act_size, PAGE_SIZE));
+       err = uverbs_get_const_default(&type, attrs,
+                                      MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+                                      MLX5_IB_UAPI_DM_TYPE_MEMIC);
+       if (err)
+               return ERR_PTR(err);
 
-       dm->dev_addr = memic_addr;
+       mlx5_ib_dbg(to_mdev(ibdev), "alloc_dm req: dm_type=%d user_length=0x%llx log_alignment=%d\n",
+                   type, attr->length, attr->alignment);
+
+       err = check_dm_type_support(to_mdev(ibdev), type);
+       if (err)
+               return ERR_PTR(err);
+
+       dm = kzalloc(sizeof(*dm), GFP_KERNEL);
+       if (!dm)
+               return ERR_PTR(-ENOMEM);
+
+       dm->type = type;
+
+       switch (type) {
+       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+               err = handle_alloc_dm_memic(context, dm,
+                                           attr,
+                                           attrs);
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               err = handle_alloc_dm_sw_icm(context, dm, attr, attrs, type);
+               break;
+       default:
+               err = -EOPNOTSUPP;
+       }
+
+       if (err)
+               goto err_free;
 
        return &dm->ibdm;
 
-err_dealloc:
-       mlx5_cmd_dealloc_memic(memic, memic_addr,
-                              act_size);
 err_free:
        kfree(dm);
        return ERR_PTR(err);
 }
 
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm)
+int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs)
 {
-       struct mlx5_memic *memic = &to_mdev(ibdm->device)->memic;
+       struct mlx5_ib_ucontext *ctx = rdma_udata_to_drv_context(
+               &attrs->driver_udata, struct mlx5_ib_ucontext, ibucontext);
+       struct mlx5_dm *dm_db = &to_mdev(ibdm->device)->dm;
        struct mlx5_ib_dm *dm = to_mdm(ibdm);
-       u64 act_size = roundup(dm->ibdm.length, MLX5_MEMIC_BASE_SIZE);
        u32 page_idx;
        int ret;
 
-       ret = mlx5_cmd_dealloc_memic(memic, dm->dev_addr, act_size);
-       if (ret)
-               return ret;
+       switch (dm->type) {
+       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+               ret = mlx5_cmd_dealloc_memic(dm_db, dm->dev_addr, dm->size);
+               if (ret)
+                       return ret;
 
-       page_idx = (dm->dev_addr - memic->dev->bar_addr -
-                   MLX5_CAP64_DEV_MEM(memic->dev, memic_bar_start_addr)) >>
-                   PAGE_SHIFT;
-       bitmap_clear(to_mucontext(ibdm->uobject->context)->dm_pages,
-                    page_idx,
-                    DIV_ROUND_UP(act_size, PAGE_SIZE));
+               page_idx = (dm->dev_addr -
+                           pci_resource_start(dm_db->dev->pdev, 0) -
+                           MLX5_CAP64_DEV_MEM(dm_db->dev,
+                                              memic_bar_start_addr)) >>
+                          PAGE_SHIFT;
+               bitmap_clear(ctx->dm_pages, page_idx,
+                            DIV_ROUND_UP(dm->size, PAGE_SIZE));
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               ret = mlx5_cmd_dealloc_sw_icm(dm_db, dm->type, dm->size,
+                                             ctx->devx_uid, dm->dev_addr,
+                                             dm->icm_dm.obj_id);
+               if (ret)
+                       return ret;
+               break;
+       default:
+               return -EOPNOTSUPP;
+       }
 
        kfree(dm);
 
        return 0;
 }
 
-static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                           struct ib_udata *udata)
+static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct mlx5_ib_pd *pd = to_mpd(ibpd);
        struct ib_device *ibdev = ibpd->device;
@@ -2350,8 +2466,10 @@ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        u32 out[MLX5_ST_SZ_DW(alloc_pd_out)] = {};
        u32 in[MLX5_ST_SZ_DW(alloc_pd_in)]   = {};
        u16 uid = 0;
+       struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
 
-       uid = context ? to_mucontext(context)->devx_uid : 0;
+       uid = context ? context->devx_uid : 0;
        MLX5_SET(alloc_pd_in, in, opcode, MLX5_CMD_OP_ALLOC_PD);
        MLX5_SET(alloc_pd_in, in, uid, uid);
        err = mlx5_cmd_exec(to_mdev(ibdev)->mdev, in, sizeof(in),
@@ -2361,7 +2479,7 @@ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
 
        pd->pdn = MLX5_GET(alloc_pd_out, out, pd);
        pd->uid = uid;
-       if (context) {
+       if (udata) {
                resp.pdn = pd->pdn;
                if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
                        mlx5_cmd_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn, uid);
@@ -2372,7 +2490,7 @@ static int mlx5_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        return 0;
 }
 
-static void mlx5_ib_dealloc_pd(struct ib_pd *pd)
+static void mlx5_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *mdev = to_mdev(pd->device);
        struct mlx5_ib_pd *mpd = to_mpd(pd);
@@ -3151,10 +3269,10 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
                if (ft_type == MLX5_IB_FT_RX) {
                        fn_type = MLX5_FLOW_NAMESPACE_BYPASS;
                        prio = &dev->flow_db->prios[priority];
-                       if (!dev->rep &&
+                       if (!dev->is_rep &&
                            MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, decap))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_DECAP;
-                       if (!dev->rep &&
+                       if (!dev->is_rep &&
                            MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                        reformat_l3_tunnel_to_l2))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
@@ -3164,7 +3282,7 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
                                                              log_max_ft_size));
                        fn_type = MLX5_FLOW_NAMESPACE_EGRESS;
                        prio = &dev->flow_db->egress_prios[priority];
-                       if (!dev->rep &&
+                       if (!dev->is_rep &&
                            MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
                                flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
                }
@@ -3197,12 +3315,11 @@ static struct mlx5_ib_flow_prio *get_flow_table(struct mlx5_ib_dev *dev,
        if (!ns)
                return ERR_PTR(-ENOTSUPP);
 
-       if (num_entries > max_table_size)
-               return ERR_PTR(-ENOMEM);
+       max_table_size = min_t(int, num_entries, max_table_size);
 
        ft = prio->flow_table;
        if (!ft)
-               return _get_prio(ns, prio, priority, num_entries, num_groups,
+               return _get_prio(ns, prio, priority, max_table_size, num_groups,
                                 flags);
 
        return prio;
@@ -3370,7 +3487,7 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
        if (!is_valid_attr(dev->mdev, flow_attr))
                return ERR_PTR(-EINVAL);
 
-       if (dev->rep && is_egress)
+       if (dev->is_rep && is_egress)
                return ERR_PTR(-EINVAL);
 
        spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
@@ -3401,13 +3518,17 @@ static struct mlx5_ib_flow_handler *_create_flow_rule(struct mlx5_ib_dev *dev,
        if (!flow_is_multicast_only(flow_attr))
                set_underlay_qp(dev, spec, underlay_qpn);
 
-       if (dev->rep) {
+       if (dev->is_rep) {
                void *misc;
 
+               if (!dev->port[flow_attr->port - 1].rep) {
+                       err = -EINVAL;
+                       goto free;
+               }
                misc = MLX5_ADDR_OF(fte_match_param, spec->match_value,
                                    misc_parameters);
                MLX5_SET(fte_match_set_misc, misc, source_port,
-                        dev->rep->vport);
+                        dev->port[flow_attr->port - 1].rep->vport);
                misc = MLX5_ADDR_OF(fte_match_param, spec->match_criteria,
                                    misc_parameters);
                MLX5_SET_TO_ONES(fte_match_set_misc, misc, source_port);
@@ -3769,11 +3890,16 @@ _get_flow_table(struct mlx5_ib_dev *dev,
                bool mcast)
 {
        struct mlx5_flow_namespace *ns = NULL;
-       struct mlx5_ib_flow_prio *prio;
-       int max_table_size;
+       struct mlx5_ib_flow_prio *prio = NULL;
+       int max_table_size = 0;
        u32 flags = 0;
        int priority;
 
+       if (mcast)
+               priority = MLX5_IB_FLOW_MCAST_PRIO;
+       else
+               priority = ib_prio_to_core_prio(fs_matcher->priority, false);
+
        if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS) {
                max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                        log_max_ft_size));
@@ -3782,20 +3908,18 @@ _get_flow_table(struct mlx5_ib_dev *dev,
                if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev,
                                              reformat_l3_tunnel_to_l2))
                        flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
-       } else { /* Can only be MLX5_FLOW_NAMESPACE_EGRESS */
-               max_table_size = BIT(MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev,
-                                       log_max_ft_size));
+       } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS) {
+               max_table_size = BIT(
+                       MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, log_max_ft_size));
                if (MLX5_CAP_FLOWTABLE_NIC_TX(dev->mdev, reformat))
                        flags |= MLX5_FLOW_TABLE_TUNNEL_EN_REFORMAT;
+       } else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB) {
+               max_table_size = BIT(
+                       MLX5_CAP_ESW_FLOWTABLE_FDB(dev->mdev, log_max_ft_size));
+               priority = FDB_BYPASS_PATH;
        }
 
-       if (max_table_size < MLX5_FS_MAX_ENTRIES)
-               return ERR_PTR(-ENOMEM);
-
-       if (mcast)
-               priority = MLX5_IB_FLOW_MCAST_PRIO;
-       else
-               priority = ib_prio_to_core_prio(fs_matcher->priority, false);
+       max_table_size = min_t(int, max_table_size, MLX5_FS_MAX_ENTRIES);
 
        ns = mlx5_get_flow_namespace(dev->mdev, fs_matcher->ns_type);
        if (!ns)
@@ -3803,13 +3927,18 @@ _get_flow_table(struct mlx5_ib_dev *dev,
 
        if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_BYPASS)
                prio = &dev->flow_db->prios[priority];
-       else
+       else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_EGRESS)
                prio = &dev->flow_db->egress_prios[priority];
+       else if (fs_matcher->ns_type == MLX5_FLOW_NAMESPACE_FDB)
+               prio = &dev->flow_db->fdb;
+
+       if (!prio)
+               return ERR_PTR(-EINVAL);
 
        if (prio->flow_table)
                return prio;
 
-       return _get_prio(ns, prio, priority, MLX5_FS_MAX_ENTRIES,
+       return _get_prio(ns, prio, priority, max_table_size,
                         MLX5_FS_MAX_TYPES, flags);
 }
 
@@ -4509,7 +4638,7 @@ static int set_has_smi_cap(struct mlx5_ib_dev *dev)
        int err;
        int port;
 
-       for (port = 1; port <= dev->num_ports; port++) {
+       for (port = 1; port <= ARRAY_SIZE(dev->mdev->port_caps); port++) {
                dev->mdev->port_caps[port - 1].has_smi = false;
                if (MLX5_CAP_GEN(dev->mdev, port_type) ==
                    MLX5_CAP_PORT_TYPE_IB) {
@@ -4540,7 +4669,7 @@ static void get_ext_port_caps(struct mlx5_ib_dev *dev)
                mlx5_query_ext_port_caps(dev, port);
 }
 
-static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
+static int __get_port_caps(struct mlx5_ib_dev *dev, u8 port)
 {
        struct ib_device_attr *dprops = NULL;
        struct ib_port_attr *pprops = NULL;
@@ -4555,10 +4684,6 @@ static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
        if (!dprops)
                goto out;
 
-       err = set_has_smi_cap(dev);
-       if (err)
-               goto out;
-
        err = mlx5_ib_query_device(&dev->ib_dev, dprops, &uhw);
        if (err) {
                mlx5_ib_warn(dev, "query_device failed %d\n", err);
@@ -4587,6 +4712,16 @@ out:
        return err;
 }
 
+static int get_port_caps(struct mlx5_ib_dev *dev, u8 port)
+{
+       /* For representors use port 1, as this is the only native
+        * port
+        */
+       if (dev->is_rep)
+               return __get_port_caps(dev, 1);
+       return __get_port_caps(dev, port);
+}
+
 static void destroy_umrc_res(struct mlx5_ib_dev *dev)
 {
        int err;
@@ -4596,7 +4731,7 @@ static void destroy_umrc_res(struct mlx5_ib_dev *dev)
                mlx5_ib_warn(dev, "mr cache cleanup failed\n");
 
        if (dev->umrc.qp)
-               mlx5_ib_destroy_qp(dev->umrc.qp);
+               mlx5_ib_destroy_qp(dev->umrc.qp, NULL);
        if (dev->umrc.cq)
                ib_free_cq(dev->umrc.cq);
        if (dev->umrc.pd)
@@ -4701,7 +4836,7 @@ static int create_umr_res(struct mlx5_ib_dev *dev)
        return 0;
 
 error_4:
-       mlx5_ib_destroy_qp(qp);
+       mlx5_ib_destroy_qp(qp, NULL);
        dev->umrc.qp = NULL;
 
 error_3:
@@ -4752,11 +4887,11 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        devr->p0->uobject = NULL;
        atomic_set(&devr->p0->usecnt, 0);
 
-       ret = mlx5_ib_alloc_pd(devr->p0, NULL, NULL);
+       ret = mlx5_ib_alloc_pd(devr->p0, NULL);
        if (ret)
                goto error0;
 
-       devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL, NULL);
+       devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, &cq_attr, NULL);
        if (IS_ERR(devr->c0)) {
                ret = PTR_ERR(devr->c0);
                goto error1;
@@ -4768,7 +4903,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        devr->c0->cq_context    = NULL;
        atomic_set(&devr->c0->usecnt, 0);
 
-       devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+       devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
        if (IS_ERR(devr->x0)) {
                ret = PTR_ERR(devr->x0);
                goto error2;
@@ -4779,7 +4914,7 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        mutex_init(&devr->x0->tgt_qp_mutex);
        INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
 
-       devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+       devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL);
        if (IS_ERR(devr->x1)) {
                ret = PTR_ERR(devr->x1);
                goto error3;
@@ -4797,19 +4932,21 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        attr.ext.cq = devr->c0;
        attr.ext.xrc.xrcd = devr->x0;
 
-       devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
-       if (IS_ERR(devr->s0)) {
-               ret = PTR_ERR(devr->s0);
+       devr->s0 = rdma_zalloc_drv_obj(ibdev, ib_srq);
+       if (!devr->s0) {
+               ret = -ENOMEM;
                goto error4;
        }
+
        devr->s0->device        = &dev->ib_dev;
        devr->s0->pd            = devr->p0;
-       devr->s0->uobject       = NULL;
-       devr->s0->event_handler = NULL;
-       devr->s0->srq_context   = NULL;
        devr->s0->srq_type      = IB_SRQT_XRC;
        devr->s0->ext.xrc.xrcd  = devr->x0;
        devr->s0->ext.cq        = devr->c0;
+       ret = mlx5_ib_create_srq(devr->s0, &attr, NULL);
+       if (ret)
+               goto err_create;
+
        atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
        atomic_inc(&devr->s0->ext.cq->usecnt);
        atomic_inc(&devr->p0->usecnt);
@@ -4819,18 +4956,21 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
        attr.attr.max_sge = 1;
        attr.attr.max_wr = 1;
        attr.srq_type = IB_SRQT_BASIC;
-       devr->s1 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
-       if (IS_ERR(devr->s1)) {
-               ret = PTR_ERR(devr->s1);
+       devr->s1 = rdma_zalloc_drv_obj(ibdev, ib_srq);
+       if (!devr->s1) {
+               ret = -ENOMEM;
                goto error5;
        }
+
        devr->s1->device        = &dev->ib_dev;
        devr->s1->pd            = devr->p0;
-       devr->s1->uobject       = NULL;
-       devr->s1->event_handler = NULL;
-       devr->s1->srq_context   = NULL;
        devr->s1->srq_type      = IB_SRQT_BASIC;
        devr->s1->ext.cq        = devr->c0;
+
+       ret = mlx5_ib_create_srq(devr->s1, &attr, NULL);
+       if (ret)
+               goto error6;
+
        atomic_inc(&devr->p0->usecnt);
        atomic_set(&devr->s1->usecnt, 0);
 
@@ -4842,16 +4982,20 @@ static int create_dev_resources(struct mlx5_ib_resources *devr)
 
        return 0;
 
+error6:
+       kfree(devr->s1);
 error5:
-       mlx5_ib_destroy_srq(devr->s0);
+       mlx5_ib_destroy_srq(devr->s0, NULL);
+err_create:
+       kfree(devr->s0);
 error4:
-       mlx5_ib_dealloc_xrcd(devr->x1);
+       mlx5_ib_dealloc_xrcd(devr->x1, NULL);
 error3:
-       mlx5_ib_dealloc_xrcd(devr->x0);
+       mlx5_ib_dealloc_xrcd(devr->x0, NULL);
 error2:
-       mlx5_ib_destroy_cq(devr->c0);
+       mlx5_ib_destroy_cq(devr->c0, NULL);
 error1:
-       mlx5_ib_dealloc_pd(devr->p0);
+       mlx5_ib_dealloc_pd(devr->p0, NULL);
 error0:
        kfree(devr->p0);
        return ret;
@@ -4859,20 +5003,20 @@ error0:
 
 static void destroy_dev_resources(struct mlx5_ib_resources *devr)
 {
-       struct mlx5_ib_dev *dev =
-               container_of(devr, struct mlx5_ib_dev, devr);
        int port;
 
-       mlx5_ib_destroy_srq(devr->s1);
-       mlx5_ib_destroy_srq(devr->s0);
-       mlx5_ib_dealloc_xrcd(devr->x0);
-       mlx5_ib_dealloc_xrcd(devr->x1);
-       mlx5_ib_destroy_cq(devr->c0);
-       mlx5_ib_dealloc_pd(devr->p0);
+       mlx5_ib_destroy_srq(devr->s1, NULL);
+       kfree(devr->s1);
+       mlx5_ib_destroy_srq(devr->s0, NULL);
+       kfree(devr->s0);
+       mlx5_ib_dealloc_xrcd(devr->x0, NULL);
+       mlx5_ib_dealloc_xrcd(devr->x1, NULL);
+       mlx5_ib_destroy_cq(devr->c0, NULL);
+       mlx5_ib_dealloc_pd(devr->p0, NULL);
        kfree(devr->p0);
 
        /* Make sure no change P_Key work items are still executing */
-       for (port = 0; port < dev->num_ports; ++port)
+       for (port = 0; port < ARRAY_SIZE(devr->ports); ++port)
                cancel_work_sync(&devr->ports[port].pkey_change_work);
 }
 
@@ -5015,10 +5159,10 @@ static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
        int err;
 
-       dev->roce[port_num].nb.notifier_call = mlx5_netdev_event;
-       err = register_netdevice_notifier(&dev->roce[port_num].nb);
+       dev->port[port_num].roce.nb.notifier_call = mlx5_netdev_event;
+       err = register_netdevice_notifier(&dev->port[port_num].roce.nb);
        if (err) {
-               dev->roce[port_num].nb.notifier_call = NULL;
+               dev->port[port_num].roce.nb.notifier_call = NULL;
                return err;
        }
 
@@ -5027,9 +5171,9 @@ static int mlx5_add_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 
 static void mlx5_remove_netdev_notifier(struct mlx5_ib_dev *dev, u8 port_num)
 {
-       if (dev->roce[port_num].nb.notifier_call) {
-               unregister_netdevice_notifier(&dev->roce[port_num].nb);
-               dev->roce[port_num].nb.notifier_call = NULL;
+       if (dev->port[port_num].roce.nb.notifier_call) {
+               unregister_netdevice_notifier(&dev->port[port_num].roce.nb);
+               dev->port[port_num].roce.nb.notifier_call = NULL;
        }
 }
 
@@ -5578,7 +5722,7 @@ static void mlx5_ib_unbind_slave_port(struct mlx5_ib_dev *ibdev,
                mlx5_ib_err(ibdev, "Failed to unaffiliate port %u\n",
                            port_num + 1);
 
-       ibdev->roce[port_num].last_port_state = IB_PORT_DOWN;
+       ibdev->port[port_num].roce.last_port_state = IB_PORT_DOWN;
 }
 
 /* The mlx5_ib_multiport_mutex should be held when calling this function */
@@ -5738,7 +5882,10 @@ ADD_UVERBS_ATTRIBUTES_SIMPLE(
                            UA_MANDATORY),
        UVERBS_ATTR_PTR_OUT(MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
                            UVERBS_ATTR_TYPE(u16),
-                           UA_MANDATORY));
+                           UA_OPTIONAL),
+       UVERBS_ATTR_CONST_IN(MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
+                            enum mlx5_ib_uapi_dm_type,
+                            UA_OPTIONAL));
 
 ADD_UVERBS_ATTRIBUTES_SIMPLE(
        mlx5_ib_flow_action,
@@ -5829,35 +5976,58 @@ static struct ib_counters *mlx5_ib_create_counters(struct ib_device *device,
        return &mcounters->ibcntrs;
 }
 
-void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev)
 {
+       struct mlx5_core_dev *mdev = dev->mdev;
+
        mlx5_ib_cleanup_multiport_master(dev);
        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                srcu_barrier(&dev->mr_srcu);
                cleanup_srcu_struct(&dev->mr_srcu);
        }
-       kfree(dev->port);
+
+       WARN_ON(!bitmap_empty(dev->dm.memic_alloc_pages, MLX5_MAX_MEMIC_PAGES));
+
+       WARN_ON(dev->dm.steering_sw_icm_alloc_blocks &&
+               !bitmap_empty(
+                       dev->dm.steering_sw_icm_alloc_blocks,
+                       BIT(MLX5_CAP_DEV_MEM(mdev, log_steering_sw_icm_size) -
+                           MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
+
+       kfree(dev->dm.steering_sw_icm_alloc_blocks);
+
+       WARN_ON(dev->dm.header_modify_sw_icm_alloc_blocks &&
+               !bitmap_empty(dev->dm.header_modify_sw_icm_alloc_blocks,
+                             BIT(MLX5_CAP_DEV_MEM(
+                                         mdev, log_header_modify_sw_icm_size) -
+                                 MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev))));
+
+       kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
 }
 
-int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
 {
        struct mlx5_core_dev *mdev = dev->mdev;
+       u64 header_modify_icm_blocks = 0;
+       u64 steering_icm_blocks = 0;
        int err;
        int i;
 
-       dev->port = kcalloc(dev->num_ports, sizeof(*dev->port),
-                           GFP_KERNEL);
-       if (!dev->port)
-               return -ENOMEM;
-
        for (i = 0; i < dev->num_ports; i++) {
                spin_lock_init(&dev->port[i].mp.mpi_lock);
-               rwlock_init(&dev->roce[i].netdev_lock);
+               rwlock_init(&dev->port[i].roce.netdev_lock);
+               dev->port[i].roce.dev = dev;
+               dev->port[i].roce.native_port_num = i + 1;
+               dev->port[i].roce.last_port_state = IB_PORT_DOWN;
        }
 
        err = mlx5_ib_init_multiport_master(dev);
        if (err)
-               goto err_free_port;
+               return err;
+
+       err = set_has_smi_cap(dev);
+       if (err)
+               return err;
 
        if (!mlx5_core_mp_enabled(mdev)) {
                for (i = 1; i <= dev->num_ports; i++) {
@@ -5885,22 +6055,54 @@ int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev)
        INIT_LIST_HEAD(&dev->qp_list);
        spin_lock_init(&dev->reset_flow_resource_lock);
 
-       spin_lock_init(&dev->memic.memic_lock);
-       dev->memic.dev = mdev;
+       if (MLX5_CAP_GEN_64(mdev, general_obj_types) &
+           MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM) {
+               if (MLX5_CAP64_DEV_MEM(mdev, steering_sw_icm_start_address)) {
+                       steering_icm_blocks =
+                               BIT(MLX5_CAP_DEV_MEM(mdev,
+                                                    log_steering_sw_icm_size) -
+                                   MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
+
+                       dev->dm.steering_sw_icm_alloc_blocks =
+                               kcalloc(BITS_TO_LONGS(steering_icm_blocks),
+                                       sizeof(unsigned long), GFP_KERNEL);
+                       if (!dev->dm.steering_sw_icm_alloc_blocks)
+                               goto err_mp;
+               }
+
+               if (MLX5_CAP64_DEV_MEM(mdev,
+                                      header_modify_sw_icm_start_address)) {
+                       header_modify_icm_blocks = BIT(
+                               MLX5_CAP_DEV_MEM(
+                                       mdev, log_header_modify_sw_icm_size) -
+                               MLX5_LOG_SW_ICM_BLOCK_SIZE(mdev));
+
+                       dev->dm.header_modify_sw_icm_alloc_blocks =
+                               kcalloc(BITS_TO_LONGS(header_modify_icm_blocks),
+                                       sizeof(unsigned long), GFP_KERNEL);
+                       if (!dev->dm.header_modify_sw_icm_alloc_blocks)
+                               goto err_dm;
+               }
+       }
+
+       spin_lock_init(&dev->dm.lock);
+       dev->dm.dev = mdev;
 
        if (IS_ENABLED(CONFIG_INFINIBAND_ON_DEMAND_PAGING)) {
                err = init_srcu_struct(&dev->mr_srcu);
                if (err)
-                       goto err_mp;
+                       goto err_dm;
        }
 
        return 0;
+
+err_dm:
+       kfree(dev->dm.steering_sw_icm_alloc_blocks);
+       kfree(dev->dm.header_modify_sw_icm_alloc_blocks);
+
 err_mp:
        mlx5_ib_cleanup_multiport_master(dev);
 
-err_free_port:
-       kfree(dev->port);
-
        return -ENOMEM;
 }
 
@@ -5916,20 +6118,6 @@ static int mlx5_ib_stage_flow_db_init(struct mlx5_ib_dev *dev)
        return 0;
 }
 
-int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev)
-{
-       struct mlx5_ib_dev *nic_dev;
-
-       nic_dev = mlx5_ib_get_uplink_ibdev(dev->mdev->priv.eswitch);
-
-       if (!nic_dev)
-               return -EINVAL;
-
-       dev->flow_db = nic_dev->flow_db;
-
-       return 0;
-}
-
 static void mlx5_ib_stage_flow_db_cleanup(struct mlx5_ib_dev *dev)
 {
        kfree(dev->flow_db);
@@ -5989,7 +6177,10 @@ static const struct ib_device_ops mlx5_ib_dev_ops = {
        .req_notify_cq = mlx5_ib_arm_cq,
        .rereg_user_mr = mlx5_ib_rereg_user_mr,
        .resize_cq = mlx5_ib_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, mlx5_ib_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, mlx5_ib_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, mlx5_ib_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mlx5_ib_ucontext, ibucontext),
 };
 
@@ -6025,7 +6216,7 @@ static const struct ib_device_ops mlx5_ib_dev_dm_ops = {
        .reg_dm_mr = mlx5_ib_reg_dm_mr,
 };
 
-int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
 {
        struct mlx5_core_dev *mdev = dev->mdev;
        int err;
@@ -6091,7 +6282,9 @@ int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev)
                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_xrc_ops);
        }
 
-       if (MLX5_CAP_DEV_MEM(mdev, memic))
+       if (MLX5_CAP_DEV_MEM(mdev, memic) ||
+           MLX5_CAP_GEN_64(dev->mdev, general_obj_types) &
+           MLX5_GENERAL_OBJ_TYPES_CAP_SW_ICM)
                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_dm_ops);
 
        if (mlx5_accel_ipsec_device_caps(dev->mdev) &
@@ -6131,7 +6324,7 @@ static const struct ib_device_ops mlx5_ib_dev_port_rep_ops = {
        .query_port = mlx5_ib_rep_query_port,
 };
 
-int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev)
 {
        ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_port_rep_ops);
        return 0;
@@ -6149,13 +6342,6 @@ static const struct ib_device_ops mlx5_ib_dev_common_roce_ops = {
 static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
 {
        u8 port_num;
-       int i;
-
-       for (i = 0; i < dev->num_ports; i++) {
-               dev->roce[i].dev = dev;
-               dev->roce[i].native_port_num = i + 1;
-               dev->roce[i].last_port_state = IB_PORT_DOWN;
-       }
 
        dev->ib_dev.uverbs_ex_cmd_mask |=
                        (1ull << IB_USER_VERBS_EX_CMD_CREATE_WQ) |
@@ -6167,6 +6353,7 @@ static int mlx5_ib_stage_common_roce_init(struct mlx5_ib_dev *dev)
 
        port_num = mlx5_core_native_port_num(dev->mdev) - 1;
 
+       /* Register only for native ports */
        return mlx5_add_netdev_notifier(dev, port_num);
 }
 
@@ -6177,7 +6364,7 @@ static void mlx5_ib_stage_common_roce_cleanup(struct mlx5_ib_dev *dev)
        mlx5_remove_netdev_notifier(dev, port_num);
 }
 
-int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
 {
        struct mlx5_core_dev *mdev = dev->mdev;
        enum rdma_link_layer ll;
@@ -6193,7 +6380,7 @@ int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev)
        return err;
 }
 
-void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev)
 {
        mlx5_ib_stage_common_roce_cleanup(dev);
 }
@@ -6240,12 +6427,12 @@ static void mlx5_ib_stage_roce_cleanup(struct mlx5_ib_dev *dev)
        }
 }
 
-int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev)
 {
        return create_dev_resources(&dev->devr);
 }
 
-void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev)
 {
        destroy_dev_resources(&dev->devr);
 }
@@ -6267,7 +6454,7 @@ static const struct ib_device_ops mlx5_ib_dev_hw_stats_ops = {
        .get_hw_stats = mlx5_ib_get_hw_stats,
 };
 
-int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
 {
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt)) {
                ib_set_device_ops(&dev->ib_dev, &mlx5_ib_dev_hw_stats_ops);
@@ -6278,7 +6465,7 @@ int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev)
        return 0;
 }
 
-void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev)
 {
        if (MLX5_CAP_GEN(dev->mdev, max_qp_cnt))
                mlx5_ib_dealloc_counters(dev);
@@ -6308,7 +6495,7 @@ static void mlx5_ib_stage_uar_cleanup(struct mlx5_ib_dev *dev)
        mlx5_put_uars_page(dev->mdev, dev->mdev->priv.uar);
 }
 
-int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
 {
        int err;
 
@@ -6323,13 +6510,13 @@ int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev)
        return err;
 }
 
-void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev)
 {
        mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg);
        mlx5_free_bfreg(dev->mdev, &dev->bfreg);
 }
 
-int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
 {
        const char *name;
 
@@ -6341,17 +6528,17 @@ int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev)
        return ib_register_device(&dev->ib_dev, name);
 }
 
-void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev)
 {
        destroy_umrc_res(dev);
 }
 
-void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
+static void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev)
 {
        ib_unregister_device(&dev->ib_dev);
 }
 
-int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
+static int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev)
 {
        return create_umr_res(dev);
 }
@@ -6406,6 +6593,9 @@ void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
                if (profile->stage[stage].cleanup)
                        profile->stage[stage].cleanup(dev);
        }
+
+       kfree(dev->port);
+       ib_dealloc_device(&dev->ib_dev);
 }
 
 void *__mlx5_ib_add(struct mlx5_ib_dev *dev,
@@ -6527,6 +6717,9 @@ const struct mlx5_ib_profile uplink_rep_profile = {
        STAGE_CREATE(MLX5_IB_STAGE_PRE_IB_REG_UMR,
                     NULL,
                     mlx5_ib_stage_pre_ib_reg_umr_cleanup),
+       STAGE_CREATE(MLX5_IB_STAGE_WHITELIST_UID,
+                    mlx5_ib_stage_devx_init,
+                    mlx5_ib_stage_devx_cleanup),
        STAGE_CREATE(MLX5_IB_STAGE_IB_REG,
                     mlx5_ib_stage_ib_reg_init,
                     mlx5_ib_stage_ib_reg_cleanup),
@@ -6581,12 +6774,14 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        enum rdma_link_layer ll;
        struct mlx5_ib_dev *dev;
        int port_type_cap;
+       int num_ports;
 
        printk_once(KERN_INFO "%s", mlx5_version);
 
        if (MLX5_ESWITCH_MANAGER(mdev) &&
            mlx5_ib_eswitch_mode(mdev->priv.eswitch) == SRIOV_OFFLOADS) {
-               mlx5_ib_register_vport_reps(mdev);
+               if (!mlx5_core_mp_enabled(mdev))
+                       mlx5_ib_register_vport_reps(mdev);
                return mdev;
        }
 
@@ -6596,13 +6791,20 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
        if (mlx5_core_is_mp_slave(mdev) && ll == IB_LINK_LAYER_ETHERNET)
                return mlx5_ib_add_slave_port(mdev);
 
+       num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
+                       MLX5_CAP_GEN(mdev, num_vhca_ports));
        dev = ib_alloc_device(mlx5_ib_dev, ib_dev);
        if (!dev)
                return NULL;
+       dev->port = kcalloc(num_ports, sizeof(*dev->port),
+                            GFP_KERNEL);
+       if (!dev->port) {
+               ib_dealloc_device((struct ib_device *)dev);
+               return NULL;
+       }
 
        dev->mdev = mdev;
-       dev->num_ports = max(MLX5_CAP_GEN(mdev, num_ports),
-                            MLX5_CAP_GEN(mdev, num_vhca_ports));
+       dev->num_ports = num_ports;
 
        return __mlx5_ib_add(dev, &pf_profile);
 }
@@ -6629,8 +6831,6 @@ static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
 
        dev = context;
        __mlx5_ib_remove(dev, dev->profile, MLX5_IB_STAGE_MAX);
-
-       ib_dealloc_device((struct ib_device *)dev);
 }
 
 static struct mlx5_interface mlx5_ib_interface = {
index 4a617d7..40eb8be 100644 (file)
@@ -48,6 +48,7 @@
 #include <rdma/mlx5-abi.h>
 #include <rdma/uverbs_ioctl.h>
 #include <rdma/mlx5_user_ioctl_cmds.h>
+#include <rdma/mlx5_user_ioctl_verbs.h>
 
 #include "srq.h"
 
@@ -117,6 +118,10 @@ enum {
        MLX5_MEMIC_BASE_SIZE    = 1 << MLX5_MEMIC_BASE_ALIGN,
 };
 
+#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev)                                        \
+       (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
+#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
+
 struct mlx5_ib_ucontext {
        struct ib_ucontext      ibucontext;
        struct list_head        db_page_list;
@@ -194,6 +199,7 @@ struct mlx5_ib_flow_db {
        struct mlx5_ib_flow_prio        egress_prios[MLX5_IB_NUM_FLOW_FT];
        struct mlx5_ib_flow_prio        sniffer[MLX5_IB_NUM_SNIFFER_FTS];
        struct mlx5_ib_flow_prio        egress[MLX5_IB_NUM_EGRESS_FTS];
+       struct mlx5_ib_flow_prio        fdb;
        struct mlx5_flow_table          *lag_demux_ft;
        /* Protect flow steering bypass flow tables
         * when add/del flow rules.
@@ -553,15 +559,28 @@ enum mlx5_ib_mtt_access_flags {
 struct mlx5_ib_dm {
        struct ib_dm            ibdm;
        phys_addr_t             dev_addr;
+       u32                     type;
+       size_t                  size;
+       union {
+               struct {
+                       u32     obj_id;
+               } icm_dm;
+               /* parameters specific to other DM types should be added here */
+       };
 };
 
 #define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
 
-#define MLX5_IB_DM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
-                                  IB_ACCESS_REMOTE_WRITE  |\
-                                  IB_ACCESS_REMOTE_READ   |\
-                                  IB_ACCESS_REMOTE_ATOMIC |\
-                                  IB_ZERO_BASED)
+#define MLX5_IB_DM_MEMIC_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
+                                        IB_ACCESS_REMOTE_WRITE  |\
+                                        IB_ACCESS_REMOTE_READ   |\
+                                        IB_ACCESS_REMOTE_ATOMIC |\
+                                        IB_ZERO_BASED)
+
+#define MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS (IB_ACCESS_LOCAL_WRITE   |\
+                                         IB_ACCESS_REMOTE_WRITE  |\
+                                         IB_ACCESS_REMOTE_READ   |\
+                                         IB_ZERO_BASED)
 
 struct mlx5_ib_mr {
        struct ib_mr            ibmr;
@@ -702,12 +721,6 @@ struct mlx5_ib_multiport {
        spinlock_t mpi_lock;
 };
 
-struct mlx5_ib_port {
-       struct mlx5_ib_counters cnts;
-       struct mlx5_ib_multiport mp;
-       struct mlx5_ib_dbg_cc_params    *dbg_cc_params;
-};
-
 struct mlx5_roce {
        /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL
         * netdev pointer
@@ -721,6 +734,14 @@ struct mlx5_roce {
        u8                      native_port_num;
 };
 
+struct mlx5_ib_port {
+       struct mlx5_ib_counters cnts;
+       struct mlx5_ib_multiport mp;
+       struct mlx5_ib_dbg_cc_params *dbg_cc_params;
+       struct mlx5_roce roce;
+       struct mlx5_eswitch_rep         *rep;
+};
+
 struct mlx5_ib_dbg_param {
        int                     offset;
        struct mlx5_ib_dev      *dev;
@@ -840,10 +861,16 @@ struct mlx5_ib_flow_action {
        };
 };
 
-struct mlx5_memic {
+struct mlx5_dm {
        struct mlx5_core_dev *dev;
-       spinlock_t              memic_lock;
+       /* This lock is used to protect the access to the shared
+        * allocation map when concurrent requests by different
+        * processes are handled.
+        */
+       spinlock_t lock;
        DECLARE_BITMAP(memic_alloc_pages, MLX5_MAX_MEMIC_PAGES);
+       unsigned long *steering_sw_icm_alloc_blocks;
+       unsigned long *header_modify_sw_icm_alloc_blocks;
 };
 
 struct mlx5_read_counters_attr {
@@ -905,7 +932,6 @@ struct mlx5_ib_dev {
        struct ib_device                ib_dev;
        struct mlx5_core_dev            *mdev;
        struct notifier_block           mdev_events;
-       struct mlx5_roce                roce[MLX5_MAX_PORTS];
        int                             num_ports;
        /* serialize update of capability mask
         */
@@ -940,17 +966,18 @@ struct mlx5_ib_dev {
        struct mlx5_sq_bfreg    fp_bfreg;
        struct mlx5_ib_delay_drop       delay_drop;
        const struct mlx5_ib_profile    *profile;
-       struct mlx5_eswitch_rep         *rep;
+       bool                    is_rep;
        int                             lag_active;
 
        struct mlx5_ib_lb_state         lb;
        u8                      umr_fence;
        struct list_head        ib_dev_list;
        u64                     sys_image_guid;
-       struct mlx5_memic       memic;
+       struct mlx5_dm          dm;
        u16                     devx_whitelist_uid;
        struct mlx5_srq_table   srq_table;
        struct mlx5_async_ctx   async_ctx;
+       int                     free_port;
 };
 
 static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
@@ -968,6 +995,14 @@ static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev)
        return container_of(ibdev, struct mlx5_ib_dev, ib_dev);
 }
 
+static inline struct mlx5_ib_dev *mlx5_udata_to_mdev(struct ib_udata *udata)
+{
+       struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mlx5_ib_ucontext, ibucontext);
+
+       return to_mdev(context->ibucontext.device);
+}
+
 static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
 {
        return container_of(ibcq, struct mlx5_ib_cq, ibcq);
@@ -1046,17 +1081,16 @@ void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db)
 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
-struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                               u32 flags, struct ib_udata *udata);
+int mlx5_ib_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+                     struct ib_udata *udata);
 int mlx5_ib_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
-int mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags);
-struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
-                                 struct ib_srq_init_attr *init_attr,
-                                 struct ib_udata *udata);
+void mlx5_ib_destroy_ah(struct ib_ah *ah, u32 flags);
+int mlx5_ib_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+                      struct ib_udata *udata);
 int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                       enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
-int mlx5_ib_destroy_srq(struct ib_srq *srq);
+void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                          const struct ib_recv_wr **bad_wr);
 int mlx5_ib_enable_lb(struct mlx5_ib_dev *dev, bool td, bool qp);
@@ -1068,7 +1102,7 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                      int attr_mask, struct ib_udata *udata);
 int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
                     struct ib_qp_init_attr *qp_init_attr);
-int mlx5_ib_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 void mlx5_ib_drain_sq(struct ib_qp *qp);
 void mlx5_ib_drain_rq(struct ib_qp *qp);
 int mlx5_ib_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
@@ -1083,9 +1117,8 @@ int mlx5_ib_read_user_wqe_srq(struct mlx5_ib_srq *srq, int wqe_index,
                              void *buffer, int buflen, size_t *bc);
 struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev,
                                const struct ib_cq_init_attr *attr,
-                               struct ib_ucontext *context,
                                struct ib_udata *udata);
-int mlx5_ib_destroy_cq(struct ib_cq *cq);
+int mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
@@ -1112,10 +1145,9 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *mr);
 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
                          u64 length, u64 virt_addr, int access_flags,
                          struct ib_pd *pd, struct ib_udata *udata);
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
-                              enum ib_mr_type mr_type,
-                              u32 max_num_sg);
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata);
 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                      unsigned int *sg_offset);
 int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
@@ -1124,9 +1156,8 @@ int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
                        struct ib_mad_hdr *out, size_t *out_mad_size,
                        u16 *out_mad_pkey_index);
 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-                                         struct ib_ucontext *context,
-                                         struct ib_udata *udata);
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+                                  struct ib_udata *udata);
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
 int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
 int mlx5_query_mad_ifc_smp_attr_node_info(struct ib_device *ibdev,
@@ -1170,7 +1201,7 @@ int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
 struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
                                struct ib_wq_init_attr *init_attr,
                                struct ib_udata *udata);
-int mlx5_ib_destroy_wq(struct ib_wq *wq);
+int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int mlx5_ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *wq_attr,
                      u32 wq_attr_mask, struct ib_udata *udata);
 struct ib_rwq_ind_table *mlx5_ib_create_rwq_ind_table(struct ib_device *device,
@@ -1182,7 +1213,7 @@ struct ib_dm *mlx5_ib_alloc_dm(struct ib_device *ibdev,
                               struct ib_ucontext *context,
                               struct ib_dm_alloc_attr *attr,
                               struct uverbs_attr_bundle *attrs);
-int mlx5_ib_dealloc_dm(struct ib_dm *ibdm);
+int mlx5_ib_dealloc_dm(struct ib_dm *ibdm, struct uverbs_attr_bundle *attrs);
 struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
                                struct ib_dm_mr_attr *attr,
                                struct uverbs_attr_bundle *attrs);
@@ -1230,23 +1261,6 @@ static inline void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp,
 #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
 
 /* Needed for rep profile */
-int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_init_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_flow_db_init(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_caps_init(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_non_default_cb(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_rep_roce_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_rep_roce_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_dev_res_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_dev_res_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_counters_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_counters_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_bfrag_cleanup(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_pre_ib_reg_umr_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_ib_reg_init(struct mlx5_ib_dev *dev);
-void mlx5_ib_stage_ib_reg_cleanup(struct mlx5_ib_dev *dev);
-int mlx5_ib_stage_post_ib_reg_umr_init(struct mlx5_ib_dev *dev);
 void __mlx5_ib_remove(struct mlx5_ib_dev *dev,
                      const struct mlx5_ib_profile *profile,
                      int stage);
index ca921fd..5f09699 100644 (file)
@@ -600,7 +600,7 @@ static void clean_keys(struct mlx5_ib_dev *dev, int c)
 
 static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
 {
-       if (!mlx5_debugfs_root || dev->rep)
+       if (!mlx5_debugfs_root || dev->is_rep)
                return;
 
        debugfs_remove_recursive(dev->cache.root);
@@ -614,7 +614,7 @@ static void mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
        struct dentry *dir;
        int i;
 
-       if (!mlx5_debugfs_root || dev->rep)
+       if (!mlx5_debugfs_root || dev->is_rep)
                return;
 
        cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
@@ -677,7 +677,7 @@ int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
                           MLX5_IB_UMR_OCTOWORD;
                ent->access_mode = MLX5_MKC_ACCESS_MODE_MTT;
                if ((dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE) &&
-                   !dev->rep &&
+                   !dev->is_rep &&
                    mlx5_core_is_pf(dev->mdev))
                        ent->limit = dev->mdev->profile->mr_cache[i].limit;
                else
@@ -1159,8 +1159,8 @@ static void set_mr_fields(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
        mr->access_flags = access_flags;
 }
 
-static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
-                                         u64 length, int acc)
+static struct ib_mr *mlx5_ib_get_dm_mr(struct ib_pd *pd, u64 start_addr,
+                                      u64 length, int acc, int mode)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
@@ -1182,9 +1182,8 @@ static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
 
        mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
 
-       MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MEMIC & 0x3);
-       MLX5_SET(mkc, mkc, access_mode_4_2,
-                (MLX5_MKC_ACCESS_MODE_MEMIC >> 2) & 0x7);
+       MLX5_SET(mkc, mkc, access_mode_1_0, mode & 0x3);
+       MLX5_SET(mkc, mkc, access_mode_4_2, (mode >> 2) & 0x7);
        MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
        MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
        MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
@@ -1194,7 +1193,7 @@ static struct ib_mr *mlx5_ib_get_memic_mr(struct ib_pd *pd, u64 memic_addr,
        MLX5_SET64(mkc, mkc, len, length);
        MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
        MLX5_SET(mkc, mkc, qpn, 0xffffff);
-       MLX5_SET64(mkc, mkc, start_addr, memic_addr - dev->mdev->bar_addr);
+       MLX5_SET64(mkc, mkc, start_addr, start_addr);
 
        err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
        if (err)
@@ -1236,15 +1235,31 @@ struct ib_mr *mlx5_ib_reg_dm_mr(struct ib_pd *pd, struct ib_dm *dm,
                                struct uverbs_attr_bundle *attrs)
 {
        struct mlx5_ib_dm *mdm = to_mdm(dm);
-       u64 memic_addr;
+       struct mlx5_core_dev *dev = to_mdev(dm->device)->mdev;
+       u64 start_addr = mdm->dev_addr + attr->offset;
+       int mode;
 
-       if (attr->access_flags & ~MLX5_IB_DM_ALLOWED_ACCESS)
-               return ERR_PTR(-EINVAL);
+       switch (mdm->type) {
+       case MLX5_IB_UAPI_DM_TYPE_MEMIC:
+               if (attr->access_flags & ~MLX5_IB_DM_MEMIC_ALLOWED_ACCESS)
+                       return ERR_PTR(-EINVAL);
+
+               mode = MLX5_MKC_ACCESS_MODE_MEMIC;
+               start_addr -= pci_resource_start(dev->pdev, 0);
+               break;
+       case MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM:
+       case MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM:
+               if (attr->access_flags & ~MLX5_IB_DM_SW_ICM_ALLOWED_ACCESS)
+                       return ERR_PTR(-EINVAL);
 
-       memic_addr = mdm->dev_addr + attr->offset;
+               mode = MLX5_MKC_ACCESS_MODE_SW_ICM;
+               break;
+       default:
+               return ERR_PTR(-EINVAL);
+       }
 
-       return mlx5_ib_get_memic_mr(pd, memic_addr, attr->length,
-                                   attr->access_flags);
+       return mlx5_ib_get_dm_mr(pd, start_addr, attr->length,
+                                attr->access_flags, mode);
 }
 
 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
@@ -1622,15 +1637,14 @@ static void dereg_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
                kfree(mr);
 }
 
-int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        dereg_mr(to_mdev(ibmr->device), to_mmr(ibmr));
        return 0;
 }
 
-struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
-                              enum ib_mr_type mr_type,
-                              u32 max_num_sg)
+struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(pd->device);
        int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
index 0aa10eb..91507a2 100644 (file)
@@ -288,7 +288,7 @@ void mlx5_ib_invalidate_range(struct ib_umem_odp *umem_odp, unsigned long start,
 
        ib_umem_odp_unmap_dma_pages(umem_odp, start, end);
 
-       if (unlikely(!umem->npages && mr->parent &&
+       if (unlikely(!umem_odp->npages && mr->parent &&
                     !umem_odp->dying)) {
                WRITE_ONCE(umem_odp->dying, 1);
                atomic_inc(&mr->parent->num_leaf_free);
@@ -711,6 +711,15 @@ struct pf_frame {
        int depth;
 };
 
+static bool mkey_is_eq(struct mlx5_core_mkey *mmkey, u32 key)
+{
+       if (!mmkey)
+               return false;
+       if (mmkey->type == MLX5_MKEY_MW)
+               return mlx5_base_mkey(mmkey->key) == mlx5_base_mkey(key);
+       return mmkey->key == key;
+}
+
 static int get_indirect_num_descs(struct mlx5_core_mkey *mmkey)
 {
        struct mlx5_ib_mw *mw;
@@ -760,7 +769,7 @@ static int pagefault_single_data_segment(struct mlx5_ib_dev *dev,
 
 next_mr:
        mmkey = __mlx5_mr_lookup(dev->mdev, mlx5_base_mkey(key));
-       if (!mmkey || mmkey->key != key) {
+       if (!mkey_is_eq(mmkey, key)) {
                mlx5_ib_dbg(dev, "failed to find mkey %x\n", key);
                ret = -EFAULT;
                goto srcu_unlock;
@@ -920,7 +929,7 @@ static int pagefault_data_segments(struct mlx5_ib_dev *dev,
                                   struct mlx5_pagefault *pfault,
                                   void *wqe,
                                   void *wqe_end, u32 *bytes_mapped,
-                                  u32 *total_wqe_bytes, int receive_queue)
+                                  u32 *total_wqe_bytes, bool receive_queue)
 {
        int ret = 0, npages = 0;
        u64 io_virt;
@@ -1200,17 +1209,15 @@ static inline struct mlx5_ib_srq *res_to_srq(struct mlx5_core_rsc_common *res)
 static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
                                          struct mlx5_pagefault *pfault)
 {
-       int ret;
-       void *wqe, *wqe_end;
+       bool sq = pfault->type & MLX5_PFAULT_REQUESTOR;
+       u16 wqe_index = pfault->wqe.wqe_index;
+       void *wqe = NULL, *wqe_end = NULL;
        u32 bytes_mapped, total_wqe_bytes;
-       char *buffer = NULL;
+       struct mlx5_core_rsc_common *res;
        int resume_with_error = 1;
-       u16 wqe_index = pfault->wqe.wqe_index;
-       int requestor = pfault->type & MLX5_PFAULT_REQUESTOR;
-       struct mlx5_core_rsc_common *res = NULL;
-       struct mlx5_ib_qp *qp = NULL;
-       struct mlx5_ib_srq *srq = NULL;
+       struct mlx5_ib_qp *qp;
        size_t bytes_copied;
+       int ret = 0;
 
        res = odp_get_rsc(dev, pfault->wqe.wq_num, pfault->type);
        if (!res) {
@@ -1218,87 +1225,74 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_dev *dev,
                return;
        }
 
-       switch (res->res) {
-       case MLX5_RES_QP:
-               qp = res_to_qp(res);
-               break;
-       case MLX5_RES_SRQ:
-       case MLX5_RES_XSRQ:
-               srq = res_to_srq(res);
-               break;
-       default:
-               mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n", pfault->type);
+       if (res->res != MLX5_RES_QP && res->res != MLX5_RES_SRQ &&
+           res->res != MLX5_RES_XSRQ) {
+               mlx5_ib_err(dev, "wqe page fault for unsupported type %d\n",
+                           pfault->type);
                goto resolve_page_fault;
        }
 
-       buffer = (char *)__get_free_page(GFP_KERNEL);
-       if (!buffer) {
+       wqe = (void *)__get_free_page(GFP_KERNEL);
+       if (!wqe) {
                mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
                goto resolve_page_fault;
        }
 
-       if (qp) {
-               if (requestor) {
-                       ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index,
-                                       buffer, PAGE_SIZE,
-                                       &bytes_copied);
-               } else {
-                       ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index,
-                                       buffer, PAGE_SIZE,
-                                       &bytes_copied);
-               }
-       } else {
-               ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index,
-                                               buffer, PAGE_SIZE,
+       qp = (res->res == MLX5_RES_QP) ? res_to_qp(res) : NULL;
+       if (qp && sq) {
+               ret = mlx5_ib_read_user_wqe_sq(qp, wqe_index, wqe, PAGE_SIZE,
+                                              &bytes_copied);
+               if (ret)
+                       goto read_user;
+               ret = mlx5_ib_mr_initiator_pfault_handler(
+                       dev, pfault, qp, &wqe, &wqe_end, bytes_copied);
+       } else if (qp && !sq) {
+               ret = mlx5_ib_read_user_wqe_rq(qp, wqe_index, wqe, PAGE_SIZE,
+                                              &bytes_copied);
+               if (ret)
+                       goto read_user;
+               ret = mlx5_ib_mr_responder_pfault_handler_rq(
+                       dev, qp, wqe, &wqe_end, bytes_copied);
+       } else if (!qp) {
+               struct mlx5_ib_srq *srq = res_to_srq(res);
+
+               ret = mlx5_ib_read_user_wqe_srq(srq, wqe_index, wqe, PAGE_SIZE,
                                                &bytes_copied);
+               if (ret)
+                       goto read_user;
+               ret = mlx5_ib_mr_responder_pfault_handler_srq(
+                       dev, srq, &wqe, &wqe_end, bytes_copied);
        }
 
-       if (ret) {
-               mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%d, wqe_index=%x, qpn=%x\n",
-                           ret, wqe_index, pfault->token);
+       if (ret < 0 || wqe >= wqe_end)
                goto resolve_page_fault;
-       }
 
-       wqe = buffer;
-       if (requestor)
-               ret = mlx5_ib_mr_initiator_pfault_handler(dev, pfault, qp,
-                                                         &wqe,  &wqe_end,
-                                                         bytes_copied);
-       else if (qp)
-               ret = mlx5_ib_mr_responder_pfault_handler_rq(dev, qp,
-                                                            wqe, &wqe_end,
-                                                            bytes_copied);
-       else
-               ret = mlx5_ib_mr_responder_pfault_handler_srq(dev, srq,
-                                                             &wqe, &wqe_end,
-                                                             bytes_copied);
+       ret = pagefault_data_segments(dev, pfault, wqe, wqe_end, &bytes_mapped,
+                                     &total_wqe_bytes, !sq);
+       if (ret == -EAGAIN)
+               goto out;
 
-       if (ret < 0)
+       if (ret < 0 || total_wqe_bytes > bytes_mapped)
                goto resolve_page_fault;
 
-       if (wqe >= wqe_end) {
-               mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
-               goto resolve_page_fault;
-       }
+out:
+       ret = 0;
+       resume_with_error = 0;
 
-       ret = pagefault_data_segments(dev, pfault, wqe, wqe_end,
-                                     &bytes_mapped, &total_wqe_bytes,
-                                     !requestor);
-       if (ret == -EAGAIN) {
-               resume_with_error = 0;
-               goto resolve_page_fault;
-       } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
-               goto resolve_page_fault;
-       }
+read_user:
+       if (ret)
+               mlx5_ib_err(
+                       dev,
+                       "Failed reading a WQE following page fault, error %d, wqe_index %x, qpn %x\n",
+                       ret, wqe_index, pfault->token);
 
-       resume_with_error = 0;
 resolve_page_fault:
        mlx5_ib_page_fault_resume(dev, pfault, resume_with_error);
        mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, type: 0x%x\n",
                    pfault->wqe.wq_num, resume_with_error,
                    pfault->type);
        mlx5_core_res_put(res);
-       free_page((unsigned long)buffer);
+       free_page((unsigned long)wqe);
 }
 
 static int pages_in_range(u64 address, u32 length)
index 581144e..f6623c7 100644 (file)
@@ -92,6 +92,7 @@ struct mlx5_modify_raw_qp_param {
        struct mlx5_rate_limit rl;
 
        u8 rq_q_ctr_id;
+       u16 port;
 };
 
 static void get_cqs(enum ib_qp_type qp_type,
@@ -777,14 +778,17 @@ err_umem:
 }
 
 static void destroy_user_rq(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-                           struct mlx5_ib_rwq *rwq)
+                           struct mlx5_ib_rwq *rwq, struct ib_udata *udata)
 {
-       struct mlx5_ib_ucontext *context;
+       struct mlx5_ib_ucontext *context =
+               rdma_udata_to_drv_context(
+                       udata,
+                       struct mlx5_ib_ucontext,
+                       ibucontext);
 
        if (rwq->create_flags & MLX5_IB_WQ_FLAGS_DELAY_DROP)
                atomic_dec(&dev->delay_drop.rqs_cnt);
 
-       context = to_mucontext(pd->uobject->context);
        mlx5_ib_db_unmap_user(context, &rwq->db);
        if (rwq->umem)
                ib_umem_release(rwq->umem);
@@ -983,11 +987,15 @@ err_bfreg:
 }
 
 static void destroy_qp_user(struct mlx5_ib_dev *dev, struct ib_pd *pd,
-                           struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base)
+                           struct mlx5_ib_qp *qp, struct mlx5_ib_qp_base *base,
+                           struct ib_udata *udata)
 {
-       struct mlx5_ib_ucontext *context;
+       struct mlx5_ib_ucontext *context =
+               rdma_udata_to_drv_context(
+                       udata,
+                       struct mlx5_ib_ucontext,
+                       ibucontext);
 
-       context = to_mucontext(pd->uobject->context);
        mlx5_ib_db_unmap_user(context, &qp->db);
        if (base->ubuffer.umem)
                ib_umem_release(base->ubuffer.umem);
@@ -1206,11 +1214,11 @@ static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev,
        mlx5_cmd_destroy_tis(dev->mdev, sq->tisn, to_mpd(pd)->uid);
 }
 
-static void destroy_flow_rule_vport_sq(struct mlx5_ib_dev *dev,
-                                      struct mlx5_ib_sq *sq)
+static void destroy_flow_rule_vport_sq(struct mlx5_ib_sq *sq)
 {
        if (sq->flow_rule)
                mlx5_del_flow_rules(sq->flow_rule);
+       sq->flow_rule = NULL;
 }
 
 static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
@@ -1278,15 +1286,8 @@ static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
        if (err)
                goto err_umem;
 
-       err = create_flow_rule_vport_sq(dev, sq);
-       if (err)
-               goto err_flow;
-
        return 0;
 
-err_flow:
-       mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
-
 err_umem:
        ib_umem_release(sq->ubuffer.umem);
        sq->ubuffer.umem = NULL;
@@ -1297,7 +1298,7 @@ err_umem:
 static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev,
                                     struct mlx5_ib_sq *sq)
 {
-       destroy_flow_rule_vport_sq(dev, sq);
+       destroy_flow_rule_vport_sq(sq);
        mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp);
        ib_umem_release(sq->ubuffer.umem);
 }
@@ -1402,7 +1403,8 @@ static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
 static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
                                    struct mlx5_ib_rq *rq, u32 tdn,
                                    u32 *qp_flags_en,
-                                   struct ib_pd *pd)
+                                   struct ib_pd *pd,
+                                   u32 *out, int outlen)
 {
        u8 lb_flag = 0;
        u32 *in;
@@ -1429,15 +1431,16 @@ static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev,
        if (*qp_flags_en & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_MC)
                lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_MULTICAST;
 
-       if (dev->rep) {
+       if (dev->is_rep) {
                lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
                *qp_flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
        }
 
        MLX5_SET(tirc, tirc, self_lb_block, lb_flag);
 
-       err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn);
+       err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
 
+       rq->tirn = MLX5_GET(create_tir_out, out, tirn);
        if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
                err = mlx5_ib_enable_lb(dev, false, true);
 
@@ -1463,6 +1466,7 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
        int err;
        u32 tdn = mucontext->tdn;
        u16 uid = to_mpd(pd)->uid;
+       u32 out[MLX5_ST_SZ_DW(create_tir_out)] = {};
 
        if (qp->sq.wqe_cnt) {
                err = create_raw_packet_qp_tis(dev, qp, sq, tdn, pd);
@@ -1495,7 +1499,9 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                if (err)
                        goto err_destroy_sq;
 
-               err = create_raw_packet_qp_tir(dev, rq, tdn, &qp->flags_en, pd);
+               err = create_raw_packet_qp_tir(
+                       dev, rq, tdn, &qp->flags_en, pd, out,
+                       MLX5_ST_SZ_BYTES(create_tir_out));
                if (err)
                        goto err_destroy_rq;
 
@@ -1504,6 +1510,20 @@ static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                        resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_RQN;
                        resp->tirn = rq->tirn;
                        resp->comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
+                       if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
+                               resp->tir_icm_addr = MLX5_GET(
+                                       create_tir_out, out, icm_address_31_0);
+                               resp->tir_icm_addr |=
+                                       (u64)MLX5_GET(create_tir_out, out,
+                                                     icm_address_39_32)
+                                       << 32;
+                               resp->tir_icm_addr |=
+                                       (u64)MLX5_GET(create_tir_out, out,
+                                                     icm_address_63_40)
+                                       << 40;
+                               resp->comp_mask |=
+                                       MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR;
+                       }
                }
        }
 
@@ -1577,8 +1597,10 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                udata, struct mlx5_ib_ucontext, ibucontext);
        struct mlx5_ib_create_qp_resp resp = {};
        int inlen;
+       int outlen;
        int err;
        u32 *in;
+       u32 *out;
        void *tirc;
        void *hfso;
        u32 selected_fields = 0;
@@ -1641,7 +1663,7 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                return -EOPNOTSUPP;
        }
 
-       if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->rep) {
+       if (ucmd.flags & MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC || dev->is_rep) {
                lb_flag |= MLX5_TIRC_SELF_LB_BLOCK_BLOCK_UNICAST;
                qp->flags_en |= MLX5_QP_FLAG_TIR_ALLOW_SELF_LB_UC;
        }
@@ -1658,10 +1680,12 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
        }
 
        inlen = MLX5_ST_SZ_BYTES(create_tir_in);
-       in = kvzalloc(inlen, GFP_KERNEL);
+       outlen = MLX5_ST_SZ_BYTES(create_tir_out);
+       in = kvzalloc(inlen + outlen, GFP_KERNEL);
        if (!in)
                return -ENOMEM;
 
+       out = in + MLX5_ST_SZ_DW(create_tir_in);
        MLX5_SET(create_tir_in, in, uid, to_mpd(pd)->uid);
        tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
        MLX5_SET(tirc, tirc, disp_type,
@@ -1773,8 +1797,9 @@ static int create_rss_raw_qp_tir(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
        MLX5_SET(rx_hash_field_select, hfso, selected_fields, selected_fields);
 
 create_tir:
-       err = mlx5_core_create_tir(dev->mdev, in, inlen, &qp->rss_qp.tirn);
+       err = mlx5_core_create_tir_out(dev->mdev, in, inlen, out, outlen);
 
+       qp->rss_qp.tirn = MLX5_GET(create_tir_out, out, tirn);
        if (!err && MLX5_GET(tirc, tirc, self_lb_block)) {
                err = mlx5_ib_enable_lb(dev, false, true);
 
@@ -1789,6 +1814,18 @@ create_tir:
        if (mucontext->devx_uid) {
                resp.comp_mask |= MLX5_IB_CREATE_QP_RESP_MASK_TIRN;
                resp.tirn = qp->rss_qp.tirn;
+               if (MLX5_CAP_FLOWTABLE_NIC_RX(dev->mdev, sw_owner)) {
+                       resp.tir_icm_addr =
+                               MLX5_GET(create_tir_out, out, icm_address_31_0);
+                       resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
+                                                          icm_address_39_32)
+                                            << 32;
+                       resp.tir_icm_addr |= (u64)MLX5_GET(create_tir_out, out,
+                                                          icm_address_63_40)
+                                            << 40;
+                       resp.comp_mask |=
+                               MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR;
+               }
        }
 
        err = ib_copy_to_udata(udata, &resp, min(udata->outlen, sizeof(resp)));
@@ -2287,7 +2324,7 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
 
 err_create:
        if (qp->create_type == MLX5_QP_USER)
-               destroy_qp_user(dev, pd, qp, base);
+               destroy_qp_user(dev, pd, qp, base, udata);
        else if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);
 
@@ -2398,7 +2435,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                const struct mlx5_modify_raw_qp_param *raw_qp_param,
                                u8 lag_tx_affinity);
 
-static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
+                             struct ib_udata *udata)
 {
        struct mlx5_ib_cq *send_cq, *recv_cq;
        struct mlx5_ib_qp_base *base;
@@ -2469,7 +2507,7 @@ static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
        if (qp->create_type == MLX5_QP_KERNEL)
                destroy_qp_kernel(dev, qp);
        else if (qp->create_type == MLX5_QP_USER)
-               destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base);
+               destroy_qp_user(dev, &get_pd(qp)->ibpd, qp, base, udata);
 }
 
 static const char *ib_qp_type_str(enum ib_qp_type type)
@@ -2735,7 +2773,7 @@ static int mlx5_ib_destroy_dct(struct mlx5_ib_qp *mqp)
        return 0;
 }
 
-int mlx5_ib_destroy_qp(struct ib_qp *qp)
+int mlx5_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(qp->device);
        struct mlx5_ib_qp *mqp = to_mqp(qp);
@@ -2746,7 +2784,7 @@ int mlx5_ib_destroy_qp(struct ib_qp *qp)
        if (mqp->qp_sub_type == MLX5_IB_QPT_DCT)
                return mlx5_ib_destroy_dct(mqp);
 
-       destroy_qp_common(dev, mqp);
+       destroy_qp_common(dev, mqp, udata);
 
        kfree(mqp);
 
@@ -2964,6 +3002,11 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
                                          MLX5_QP_OPTPAR_Q_KEY          |
                                          MLX5_QP_OPTPAR_PRI_PORT,
+                       [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE           |
+                                         MLX5_QP_OPTPAR_RAE            |
+                                         MLX5_QP_OPTPAR_RWE            |
+                                         MLX5_QP_OPTPAR_PKEY_INDEX     |
+                                         MLX5_QP_OPTPAR_PRI_PORT,
                },
                [MLX5_QP_STATE_RTR] = {
                        [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
@@ -2997,6 +3040,12 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q
                                          MLX5_QP_OPTPAR_RWE            |
                                          MLX5_QP_OPTPAR_PM_STATE,
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+                       [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+                                         MLX5_QP_OPTPAR_RRE            |
+                                         MLX5_QP_OPTPAR_RAE            |
+                                         MLX5_QP_OPTPAR_RWE            |
+                                         MLX5_QP_OPTPAR_PM_STATE       |
+                                         MLX5_QP_OPTPAR_RNR_TIMEOUT,
                },
        },
        [MLX5_QP_STATE_RTS] = {
@@ -3013,6 +3062,12 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q
                        [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY          |
                                          MLX5_QP_OPTPAR_SRQN           |
                                          MLX5_QP_OPTPAR_CQN_RCV,
+                       [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_RRE           |
+                                         MLX5_QP_OPTPAR_RAE            |
+                                         MLX5_QP_OPTPAR_RWE            |
+                                         MLX5_QP_OPTPAR_RNR_TIMEOUT    |
+                                         MLX5_QP_OPTPAR_PM_STATE       |
+                                         MLX5_QP_OPTPAR_ALT_ADDR_PATH,
                },
        },
        [MLX5_QP_STATE_SQER] = {
@@ -3024,6 +3079,10 @@ static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_Q
                                           MLX5_QP_OPTPAR_RWE           |
                                           MLX5_QP_OPTPAR_RAE           |
                                           MLX5_QP_OPTPAR_RRE,
+                       [MLX5_QP_ST_XRC]  = MLX5_QP_OPTPAR_RNR_TIMEOUT  |
+                                          MLX5_QP_OPTPAR_RWE           |
+                                          MLX5_QP_OPTPAR_RAE           |
+                                          MLX5_QP_OPTPAR_RRE,
                },
        },
 };
@@ -3264,6 +3323,8 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
        }
 
        if (modify_sq) {
+               struct mlx5_flow_handle *flow_rule;
+
                if (tx_affinity) {
                        err = modify_raw_packet_tx_affinity(dev->mdev, sq,
                                                            tx_affinity,
@@ -3272,8 +3333,25 @@ static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp,
                                return err;
                }
 
-               return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state,
-                                              raw_qp_param, qp->ibqp.pd);
+               flow_rule = create_flow_rule_vport_sq(dev, sq,
+                                                     raw_qp_param->port);
+               if (IS_ERR(flow_rule))
+                       return PTR_ERR(flow_rule);
+
+               err = modify_raw_packet_qp_sq(dev->mdev, sq, sq_state,
+                                             raw_qp_param, qp->ibqp.pd);
+               if (err) {
+                       if (flow_rule)
+                               mlx5_del_flow_rules(flow_rule);
+                       return err;
+               }
+
+               if (flow_rule) {
+                       destroy_flow_rule_vport_sq(sq);
+                       sq->flow_rule = flow_rule;
+               }
+
+               return err;
        }
 
        return 0;
@@ -3298,7 +3376,7 @@ static unsigned int get_tx_affinity(struct mlx5_ib_dev *dev,
        } else {
                tx_port_affinity =
                        (unsigned int)atomic_add_return(
-                               1, &dev->roce[port_num].tx_port_affinity) %
+                               1, &dev->port[port_num].roce.tx_port_affinity) %
                                MLX5_MAX_PORTS +
                        1;
                mlx5_ib_dbg(dev, "Set tx affinity 0x%x to qpn 0x%x\n",
@@ -3403,7 +3481,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                    (ibqp->qp_type == IB_QPT_XRC_INI) ||
                    (ibqp->qp_type == IB_QPT_XRC_TGT)) {
                        if (dev->lag_active) {
-                               u8 p = mlx5_core_native_port_num(dev->mdev);
+                               u8 p = mlx5_core_native_port_num(dev->mdev) - 1;
                                tx_affinity = get_tx_affinity(dev, pd, base, p,
                                                              udata);
                                context->flags |= cpu_to_be32(tx_affinity << 24);
@@ -3556,6 +3634,9 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
                        raw_qp_param.set_mask |= MLX5_RAW_QP_MOD_SET_RQ_Q_CTR_ID;
                }
 
+               if (attr_mask & IB_QP_PORT)
+                       raw_qp_param.port = attr->port_num;
+
                if (attr_mask & IB_QP_RATE_LIMIT) {
                        raw_qp_param.rl.rate = attr->rate_limit;
 
@@ -4729,16 +4810,15 @@ static void set_linv_wr(struct mlx5_ib_qp *qp, void **seg, int *size,
 static void dump_wqe(struct mlx5_ib_qp *qp, u32 idx, int size_16)
 {
        __be32 *p = NULL;
-       u32 tidx = idx;
        int i, j;
 
        pr_debug("dump WQE index %u:\n", idx);
        for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
                if ((i & 0xf) == 0) {
-                       tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
-                       p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, tidx);
+                       p = mlx5_frag_buf_get_wqe(&qp->sq.fbc, idx);
                        pr_debug("WQBB at %p:\n", (void *)p);
                        j = 0;
+                       idx = (idx + 1) & (qp->sq.wqe_cnt - 1);
                }
                pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
                         be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
@@ -5627,8 +5707,7 @@ out:
 }
 
 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
-                                         struct ib_ucontext *context,
-                                         struct ib_udata *udata)
+                                  struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(ibdev);
        struct mlx5_ib_xrcd *xrcd;
@@ -5650,7 +5729,7 @@ struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
        return &xrcd->ibxrcd;
 }
 
-int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
        u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
@@ -5962,19 +6041,19 @@ struct ib_wq *mlx5_ib_create_wq(struct ib_pd *pd,
 err_copy:
        mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
 err_user_rq:
-       destroy_user_rq(dev, pd, rwq);
+       destroy_user_rq(dev, pd, rwq, udata);
 err:
        kfree(rwq);
        return ERR_PTR(err);
 }
 
-int mlx5_ib_destroy_wq(struct ib_wq *wq)
+int mlx5_ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(wq->device);
        struct mlx5_ib_rwq *rwq = to_mrwq(wq);
 
        mlx5_core_destroy_rq_tracked(dev->mdev, &rwq->core_qp);
-       destroy_user_rq(dev, wq->pd, rwq);
+       destroy_user_rq(dev, wq->pd, rwq, udata);
        kfree(rwq);
 
        return 0;
index 1ec1beb..4e7fde8 100644 (file)
@@ -194,9 +194,15 @@ err_db:
        return err;
 }
 
-static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
+static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
+                            struct ib_udata *udata)
 {
-       mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+       mlx5_ib_db_unmap_user(
+               rdma_udata_to_drv_context(
+                       udata,
+                       struct mlx5_ib_ucontext,
+                       ibucontext),
+               &srq->db);
        ib_umem_release(srq->umem);
 }
 
@@ -208,16 +214,16 @@ static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
        mlx5_db_free(dev->mdev, &srq->db);
 }
 
-struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
-                                 struct ib_srq_init_attr *init_attr,
-                                 struct ib_udata *udata)
+int mlx5_ib_create_srq(struct ib_srq *ib_srq,
+                      struct ib_srq_init_attr *init_attr,
+                      struct ib_udata *udata)
 {
-       struct mlx5_ib_dev *dev = to_mdev(pd->device);
-       struct mlx5_ib_srq *srq;
+       struct mlx5_ib_dev *dev = to_mdev(ib_srq->device);
+       struct mlx5_ib_srq *srq = to_msrq(ib_srq);
        size_t desc_size;
        size_t buf_size;
        int err;
-       struct mlx5_srq_attr in = {0};
+       struct mlx5_srq_attr in = {};
        __u32 max_srq_wqes = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz);
 
        /* Sanity check SRQ size before proceeding */
@@ -225,13 +231,9 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
                mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
                            init_attr->attr.max_wr,
                            max_srq_wqes);
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
-       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
-
        mutex_init(&srq->mutex);
        spin_lock_init(&srq->lock);
        srq->msrq.max    = roundup_pow_of_two(init_attr->attr.max_wr + 1);
@@ -239,35 +241,32 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 
        desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
                    srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
-       if (desc_size == 0 || srq->msrq.max_gs > desc_size) {
-               err = -EINVAL;
-               goto err_srq;
-       }
+       if (desc_size == 0 || srq->msrq.max_gs > desc_size)
+               return -EINVAL;
+
        desc_size = roundup_pow_of_two(desc_size);
        desc_size = max_t(size_t, 32, desc_size);
-       if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg)) {
-               err = -EINVAL;
-               goto err_srq;
-       }
+       if (desc_size < sizeof(struct mlx5_wqe_srq_next_seg))
+               return -EINVAL;
+
        srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
                sizeof(struct mlx5_wqe_data_seg);
        srq->msrq.wqe_shift = ilog2(desc_size);
        buf_size = srq->msrq.max * desc_size;
-       if (buf_size < desc_size) {
-               err = -EINVAL;
-               goto err_srq;
-       }
+       if (buf_size < desc_size)
+               return -EINVAL;
+
        in.type = init_attr->srq_type;
 
        if (udata)
-               err = create_srq_user(pd, srq, &in, udata, buf_size);
+               err = create_srq_user(ib_srq->pd, srq, &in, udata, buf_size);
        else
                err = create_srq_kernel(dev, srq, &in, buf_size);
 
        if (err) {
                mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
                             udata ? "user" : "kernel", err);
-               goto err_srq;
+               return err;
        }
 
        in.log_size = ilog2(srq->msrq.max);
@@ -297,7 +296,7 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
        else
                in.cqn = to_mcq(dev->devr.c0)->mcq.cqn;
 
-       in.pd = to_mpd(pd)->pdn;
+       in.pd = to_mpd(ib_srq->pd)->pdn;
        in.db_record = srq->db.dma;
        err = mlx5_cmd_create_srq(dev, &srq->msrq, &in);
        kvfree(in.pas);
@@ -320,21 +319,18 @@ struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
 
        init_attr->attr.max_wr = srq->msrq.max - 1;
 
-       return &srq->ibsrq;
+       return 0;
 
 err_core:
        mlx5_cmd_destroy_srq(dev, &srq->msrq);
 
 err_usr_kern_srq:
        if (udata)
-               destroy_srq_user(pd, srq);
+               destroy_srq_user(ib_srq->pd, srq, udata);
        else
                destroy_srq_kernel(dev, srq);
 
-err_srq:
-       kfree(srq);
-
-       return ERR_PTR(err);
+       return err;
 }
 
 int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -387,7 +383,7 @@ out_box:
        return ret;
 }
 
-int mlx5_ib_destroy_srq(struct ib_srq *srq)
+void mlx5_ib_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
        struct mlx5_ib_dev *dev = to_mdev(srq->device);
        struct mlx5_ib_srq *msrq = to_msrq(srq);
@@ -395,14 +391,16 @@ int mlx5_ib_destroy_srq(struct ib_srq *srq)
        mlx5_cmd_destroy_srq(dev, &msrq->msrq);
 
        if (srq->uobject) {
-               mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+               mlx5_ib_db_unmap_user(
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mlx5_ib_ucontext,
+                               ibucontext),
+                       &msrq->db);
                ib_umem_release(msrq->umem);
        } else {
                destroy_srq_kernel(dev, msrq);
        }
-
-       kfree(srq);
-       return 0;
 }
 
 void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
index c330af3..af197c3 100644 (file)
@@ -51,15 +51,12 @@ struct mlx5_core_srq {
 
 struct mlx5_srq_table {
        struct notifier_block nb;
-       /* protect radix tree
-        */
-       spinlock_t lock;
-       struct radix_tree_root tree;
+       struct xarray array;
 };
 
 int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
                        struct mlx5_srq_attr *in);
-int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
+void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq);
 int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
                       struct mlx5_srq_attr *out);
 int mlx5_cmd_arm_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
index 63ac38b..b0d0687 100644 (file)
@@ -83,13 +83,11 @@ struct mlx5_core_srq *mlx5_cmd_get_srq(struct mlx5_ib_dev *dev, u32 srqn)
        struct mlx5_srq_table *table = &dev->srq_table;
        struct mlx5_core_srq *srq;
 
-       spin_lock(&table->lock);
-
-       srq = radix_tree_lookup(&table->tree, srqn);
+       xa_lock(&table->array);
+       srq = xa_load(&table->array, srqn);
        if (srq)
                atomic_inc(&srq->common.refcount);
-
-       spin_unlock(&table->lock);
+       xa_unlock(&table->array);
 
        return srq;
 }
@@ -597,9 +595,7 @@ int mlx5_cmd_create_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
        atomic_set(&srq->common.refcount, 1);
        init_completion(&srq->common.free);
 
-       spin_lock_irq(&table->lock);
-       err = radix_tree_insert(&table->tree, srq->srqn, srq);
-       spin_unlock_irq(&table->lock);
+       err = xa_err(xa_store_irq(&table->array, srq->srqn, srq, GFP_KERNEL));
        if (err)
                goto err_destroy_srq_split;
 
@@ -611,26 +607,22 @@ err_destroy_srq_split:
        return err;
 }
 
-int mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
+void mlx5_cmd_destroy_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq)
 {
        struct mlx5_srq_table *table = &dev->srq_table;
        struct mlx5_core_srq *tmp;
        int err;
 
-       spin_lock_irq(&table->lock);
-       tmp = radix_tree_delete(&table->tree, srq->srqn);
-       spin_unlock_irq(&table->lock);
+       tmp = xa_erase_irq(&table->array, srq->srqn);
        if (!tmp || tmp != srq)
-               return -EINVAL;
+               return;
 
        err = destroy_srq_split(dev, srq);
        if (err)
-               return err;
+               return;
 
        mlx5_core_res_put(&srq->common);
        wait_for_completion(&srq->common.free);
-
-       return 0;
 }
 
 int mlx5_cmd_query_srq(struct mlx5_ib_dev *dev, struct mlx5_core_srq *srq,
@@ -680,13 +672,11 @@ static int srq_event_notifier(struct notifier_block *nb,
        eqe = data;
        srqn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
 
-       spin_lock(&table->lock);
-
-       srq = radix_tree_lookup(&table->tree, srqn);
+       xa_lock(&table->array);
+       srq = xa_load(&table->array, srqn);
        if (srq)
                atomic_inc(&srq->common.refcount);
-
-       spin_unlock(&table->lock);
+       xa_unlock(&table->array);
 
        if (!srq)
                return NOTIFY_OK;
@@ -703,8 +693,7 @@ int mlx5_init_srq_table(struct mlx5_ib_dev *dev)
        struct mlx5_srq_table *table = &dev->srq_table;
 
        memset(table, 0, sizeof(*table));
-       spin_lock_init(&table->lock);
-       INIT_RADIX_TREE(&table->tree, GFP_ATOMIC);
+       xa_init_flags(&table->array, XA_FLAGS_LOCK_IRQ);
 
        table->nb.notifier_call = srq_event_notifier;
        mlx5_notifier_register(dev->mdev, &table->nb);
index 877a6da..c3cfea2 100644 (file)
@@ -77,7 +77,7 @@ struct mthca_cq_context {
        __be32 ci_db;           /* Arbel only */
        __be32 state_db;        /* Arbel only */
        u32    reserved;
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_CQ_STATUS_OK          ( 0 << 28)
 #define MTHCA_CQ_STATUS_OVERFLOW    ( 9 << 28)
index 30400ea..2cdf686 100644 (file)
@@ -63,7 +63,7 @@ struct mthca_eq_context {
        __be32 consumer_index;
        __be32 producer_index;
        u32    reserved3[4];
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_EQ_STATUS_OK          ( 0 << 28)
 #define MTHCA_EQ_STATUS_OVERFLOW    ( 9 << 28)
@@ -130,7 +130,7 @@ struct mthca_eqe {
                u32 raw[6];
                struct {
                        __be32 cqn;
-               } __attribute__((packed)) comp;
+               } __packed comp;
                struct {
                        u16    reserved1;
                        __be16 token;
@@ -138,27 +138,27 @@ struct mthca_eqe {
                        u8     reserved3[3];
                        u8     status;
                        __be64 out_param;
-               } __attribute__((packed)) cmd;
+               } __packed cmd;
                struct {
                        __be32 qpn;
-               } __attribute__((packed)) qp;
+               } __packed qp;
                struct {
                        __be32 srqn;
-               } __attribute__((packed)) srq;
+               } __packed srq;
                struct {
                        __be32 cqn;
                        u32    reserved1;
                        u8     reserved2[3];
                        u8     syndrome;
-               } __attribute__((packed)) cq_err;
+               } __packed cq_err;
                struct {
                        u32    reserved1[2];
                        __be32 port;
-               } __attribute__((packed)) port_change;
+               } __packed port_change;
        } event;
        u8 reserved3[3];
        u8 owner;
-} __attribute__((packed));
+} __packed;
 
 #define  MTHCA_EQ_ENTRY_OWNER_SW      (0 << 7)
 #define  MTHCA_EQ_ENTRY_OWNER_HW      (1 << 7)
index 6686042..4250b2c 100644 (file)
@@ -60,7 +60,7 @@ struct mthca_mpt_entry {
        __be64 mtt_seg;
        __be32 mtt_sz;          /* Arbel only */
        u32    reserved[2];
-} __attribute__((packed));
+} __packed;
 
 #define MTHCA_MPT_FLAG_SW_OWNS       (0xfUL << 28)
 #define MTHCA_MPT_FLAG_MIO           (1 << 17)
index d063d7a..4f40dfe 100644 (file)
@@ -363,18 +363,17 @@ static int mthca_mmap_uar(struct ib_ucontext *context,
        return 0;
 }
 
-static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                         struct ib_udata *udata)
+static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct mthca_pd *pd = to_mpd(ibpd);
        int err;
 
-       err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
+       err = mthca_pd_alloc(to_mdev(ibdev), !udata, pd);
        if (err)
                return err;
 
-       if (context) {
+       if (udata) {
                if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
                        mthca_pd_free(to_mdev(ibdev), pd);
                        return -EFAULT;
@@ -384,114 +383,86 @@ static int mthca_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        return 0;
 }
 
-static void mthca_dealloc_pd(struct ib_pd *pd)
+static void mthca_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
 }
 
-static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
-                                    struct rdma_ah_attr *ah_attr,
-                                    u32 flags,
-                                    struct ib_udata *udata)
+static int mthca_ah_create(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+                          u32 flags, struct ib_udata *udata)
 
 {
-       int err;
-       struct mthca_ah *ah;
+       struct mthca_ah *ah = to_mah(ibah);
 
-       ah = kmalloc(sizeof *ah, GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
-
-       err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
-       if (err) {
-               kfree(ah);
-               return ERR_PTR(err);
-       }
-
-       return &ah->ibah;
+       return mthca_create_ah(to_mdev(ibah->device), to_mpd(ibah->pd), ah_attr,
+                              ah);
 }
 
-static int mthca_ah_destroy(struct ib_ah *ah, u32 flags)
+static void mthca_ah_destroy(struct ib_ah *ah, u32 flags)
 {
        mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
-       kfree(ah);
-
-       return 0;
 }
 
-static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
-                                      struct ib_srq_init_attr *init_attr,
-                                      struct ib_udata *udata)
+static int mthca_create_srq(struct ib_srq *ibsrq,
+                           struct ib_srq_init_attr *init_attr,
+                           struct ib_udata *udata)
 {
        struct mthca_create_srq ucmd;
        struct mthca_ucontext *context = rdma_udata_to_drv_context(
                udata, struct mthca_ucontext, ibucontext);
-       struct mthca_srq *srq;
+       struct mthca_srq *srq = to_msrq(ibsrq);
        int err;
 
        if (init_attr->srq_type != IB_SRQT_BASIC)
-               return ERR_PTR(-EOPNOTSUPP);
-
-       srq = kmalloc(sizeof *srq, GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
+               return -EOPNOTSUPP;
 
        if (udata) {
-               if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
-                       err = -EFAULT;
-                       goto err_free;
-               }
+               if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd)))
+                       return -EFAULT;
 
-               err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+               err = mthca_map_user_db(to_mdev(ibsrq->device), &context->uar,
                                        context->db_tab, ucmd.db_index,
                                        ucmd.db_page);
 
                if (err)
-                       goto err_free;
+                       return err;
 
                srq->mr.ibmr.lkey = ucmd.lkey;
                srq->db_index     = ucmd.db_index;
        }
 
-       err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
+       err = mthca_alloc_srq(to_mdev(ibsrq->device), to_mpd(ibsrq->pd),
                              &init_attr->attr, srq, udata);
 
        if (err && udata)
-               mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+               mthca_unmap_user_db(to_mdev(ibsrq->device), &context->uar,
                                    context->db_tab, ucmd.db_index);
 
        if (err)
-               goto err_free;
+               return err;
 
-       if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
-               mthca_free_srq(to_mdev(pd->device), srq);
-               err = -EFAULT;
-               goto err_free;
+       if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof(__u32))) {
+               mthca_free_srq(to_mdev(ibsrq->device), srq);
+               return -EFAULT;
        }
 
-       return &srq->ibsrq;
-
-err_free:
-       kfree(srq);
-
-       return ERR_PTR(err);
+       return 0;
 }
 
-static int mthca_destroy_srq(struct ib_srq *srq)
+static void mthca_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
-       struct mthca_ucontext *context;
-
-       if (srq->uobject) {
-               context = to_mucontext(srq->uobject->context);
+       if (udata) {
+               struct mthca_ucontext *context =
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mthca_ucontext,
+                               ibucontext);
 
                mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
                                    context->db_tab, to_msrq(srq)->db_index);
        }
 
        mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
-       kfree(srq);
-
-       return 0;
 }
 
 static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
@@ -607,16 +578,22 @@ static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
        return &qp->ibqp;
 }
 
-static int mthca_destroy_qp(struct ib_qp *qp)
+static int mthca_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
-       if (qp->uobject) {
+       if (udata) {
+               struct mthca_ucontext *context =
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mthca_ucontext,
+                               ibucontext);
+
                mthca_unmap_user_db(to_mdev(qp->device),
-                                   &to_mucontext(qp->uobject->context)->uar,
-                                   to_mucontext(qp->uobject->context)->db_tab,
+                                   &context->uar,
+                                   context->db_tab,
                                    to_mqp(qp)->sq.db_index);
                mthca_unmap_user_db(to_mdev(qp->device),
-                                   &to_mucontext(qp->uobject->context)->uar,
-                                   to_mucontext(qp->uobject->context)->db_tab,
+                                   &context->uar,
+                                   context->db_tab,
                                    to_mqp(qp)->rq.db_index);
        }
        mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
@@ -626,7 +603,6 @@ static int mthca_destroy_qp(struct ib_qp *qp)
 
 static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
                                     const struct ib_cq_init_attr *attr,
-                                    struct ib_ucontext *context,
                                     struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -634,6 +610,8 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
        struct mthca_cq *cq;
        int nent;
        int err;
+       struct mthca_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct mthca_ucontext, ibucontext);
 
        if (attr->flags)
                return ERR_PTR(-EINVAL);
@@ -641,19 +619,19 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
        if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
                return ERR_PTR(-EINVAL);
 
-       if (context) {
+       if (udata) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
                        return ERR_PTR(-EFAULT);
 
-               err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-                                       to_mucontext(context)->db_tab,
-                                       ucmd.set_db_index, ucmd.set_db_page);
+               err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
+                                       context->db_tab, ucmd.set_db_index,
+                                       ucmd.set_db_page);
                if (err)
                        return ERR_PTR(err);
 
-               err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-                                       to_mucontext(context)->db_tab,
-                                       ucmd.arm_db_index, ucmd.arm_db_page);
+               err = mthca_map_user_db(to_mdev(ibdev), &context->uar,
+                                       context->db_tab, ucmd.arm_db_index,
+                                       ucmd.arm_db_page);
                if (err)
                        goto err_unmap_set;
        }
@@ -664,7 +642,7 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
                goto err_unmap_arm;
        }
 
-       if (context) {
+       if (udata) {
                cq->buf.mr.ibmr.lkey = ucmd.lkey;
                cq->set_ci_db_index  = ucmd.set_db_index;
                cq->arm_db_index     = ucmd.arm_db_index;
@@ -673,14 +651,13 @@ static struct ib_cq *mthca_create_cq(struct ib_device *ibdev,
        for (nent = 1; nent <= entries; nent <<= 1)
                ; /* nothing */
 
-       err = mthca_init_cq(to_mdev(ibdev), nent,
-                           context ? to_mucontext(context) : NULL,
-                           context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+       err = mthca_init_cq(to_mdev(ibdev), nent, context,
+                           udata ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
                            cq);
        if (err)
                goto err_free;
 
-       if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+       if (udata && ib_copy_to_udata(udata, &cq->cqn, sizeof(__u32))) {
                mthca_free_cq(to_mdev(ibdev), cq);
                err = -EFAULT;
                goto err_free;
@@ -694,14 +671,14 @@ err_free:
        kfree(cq);
 
 err_unmap_arm:
-       if (context)
-               mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-                                   to_mucontext(context)->db_tab, ucmd.arm_db_index);
+       if (udata)
+               mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
+                                   context->db_tab, ucmd.arm_db_index);
 
 err_unmap_set:
-       if (context)
-               mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
-                                   to_mucontext(context)->db_tab, ucmd.set_db_index);
+       if (udata)
+               mthca_unmap_user_db(to_mdev(ibdev), &context->uar,
+                                   context->db_tab, ucmd.set_db_index);
 
        return ERR_PTR(err);
 }
@@ -827,16 +804,22 @@ out:
        return ret;
 }
 
-static int mthca_destroy_cq(struct ib_cq *cq)
+static int mthca_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
-       if (cq->uobject) {
+       if (udata) {
+               struct mthca_ucontext *context =
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct mthca_ucontext,
+                               ibucontext);
+
                mthca_unmap_user_db(to_mdev(cq->device),
-                                   &to_mucontext(cq->uobject->context)->uar,
-                                   to_mucontext(cq->uobject->context)->db_tab,
+                                   &context->uar,
+                                   context->db_tab,
                                    to_mcq(cq)->arm_db_index);
                mthca_unmap_user_db(to_mdev(cq->device),
-                                   &to_mucontext(cq->uobject->context)->uar,
-                                   to_mucontext(cq->uobject->context)->db_tab,
+                                   &context->uar,
+                                   context->db_tab,
                                    to_mcq(cq)->set_ci_db_index);
        }
        mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
@@ -914,7 +897,7 @@ static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                goto err;
        }
 
-       n = mr->umem->nmap;
+       n = ib_umem_num_pages(mr->umem);
 
        mr->mtt = mthca_alloc_mtt(dev, n);
        if (IS_ERR(mr->mtt)) {
@@ -974,7 +957,7 @@ err:
        return ERR_PTR(err);
 }
 
-static int mthca_dereg_mr(struct ib_mr *mr)
+static int mthca_dereg_mr(struct ib_mr *mr, struct ib_udata *udata)
 {
        struct mthca_mr *mmr = to_mmr(mr);
 
@@ -1200,6 +1183,8 @@ static const struct ib_device_ops mthca_dev_ops = {
        .query_qp = mthca_query_qp,
        .reg_user_mr = mthca_reg_user_mr,
        .resize_cq = mthca_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, mthca_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, mthca_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, mthca_ucontext, ibucontext),
 };
@@ -1210,6 +1195,8 @@ static const struct ib_device_ops mthca_dev_arbel_srq_ops = {
        .modify_srq = mthca_modify_srq,
        .post_srq_recv = mthca_arbel_post_srq_recv,
        .query_srq = mthca_query_srq,
+
+       INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq),
 };
 
 static const struct ib_device_ops mthca_dev_tavor_srq_ops = {
@@ -1218,6 +1205,8 @@ static const struct ib_device_ops mthca_dev_tavor_srq_ops = {
        .modify_srq = mthca_modify_srq,
        .post_srq_recv = mthca_tavor_post_srq_recv,
        .query_srq = mthca_query_srq,
+
+       INIT_RDMA_OBJ_SIZE(ib_srq, mthca_srq, ibsrq),
 };
 
 static const struct ib_device_ops mthca_dev_arbel_fmr_ops = {
index d65b189..d04c245 100644 (file)
@@ -115,7 +115,7 @@ struct mthca_qp_path {
        u8     hop_limit;
        __be32 sl_tclass_flowlabel;
        u8     rgid[16];
-} __attribute__((packed));
+} __packed;
 
 struct mthca_qp_context {
        __be32 flags;
@@ -154,14 +154,14 @@ struct mthca_qp_context {
        __be16 rq_wqe_counter;  /* reserved on Tavor */
        __be16 sq_wqe_counter;  /* reserved on Tavor */
        u32    reserved3[18];
-} __attribute__((packed));
+} __packed;
 
 struct mthca_qp_param {
        __be32 opt_param_mask;
        u32    reserved1;
        struct mthca_qp_context context;
        u32    reserved2[62];
-} __attribute__((packed));
+} __packed;
 
 enum {
        MTHCA_QP_OPTPAR_ALT_ADDR_PATH     = 1 << 0,
index 0010a3e..62bf986 100644 (file)
@@ -3033,7 +3033,8 @@ static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
                /* Need to free the Last Streaming Mode Message */
                if (nesqp->ietf_frame) {
                        if (nesqp->lsmm_mr)
-                               nesibdev->ibdev.ops.dereg_mr(nesqp->lsmm_mr);
+                               nesibdev->ibdev.ops.dereg_mr(nesqp->lsmm_mr,
+                                                            NULL);
                        pci_free_consistent(nesdev->pcidev,
                                            nesqp->private_data_len + nesqp->ietf_frame_size,
                                            nesqp->ietf_frame, nesqp->ietf_frame_pbase);
index 828e4af..4902432 100644 (file)
@@ -52,7 +52,7 @@ atomic_t qps_created;
 atomic_t sw_qps_destroyed;
 
 static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
-static int nes_dereg_mr(struct ib_mr *ib_mr);
+static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 
 /**
  * nes_alloc_mw
@@ -306,9 +306,8 @@ static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
 /*
  * nes_alloc_mr
  */
-static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
-                                 enum ib_mr_type mr_type,
-                                 u32 max_num_sg)
+static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+                                 u32 max_num_sg, struct ib_udata *udata)
 {
        struct nes_pd *nespd = to_nespd(ibpd);
        struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
@@ -386,7 +385,7 @@ static struct ib_mr *nes_alloc_mr(struct ib_pd *ibpd,
        return ibmr;
 
 err:
-       nes_dereg_mr(ibmr);
+       nes_dereg_mr(ibmr, udata);
 
        return ERR_PTR(-ENOMEM);
 }
@@ -641,22 +640,24 @@ static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 /**
  * nes_alloc_pd
  */
-static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                       struct ib_udata *udata)
+static int nes_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = pd->device;
        struct nes_pd *nespd = to_nespd(pd);
        struct nes_vnic *nesvnic = to_nesvnic(ibdev);
        struct nes_device *nesdev = nesvnic->nesdev;
        struct nes_adapter *nesadapter = nesdev->nesadapter;
-       struct nes_ucontext *nesucontext;
        struct nes_alloc_pd_resp uresp;
        u32 pd_num = 0;
        int err;
+       struct nes_ucontext *nesucontext = rdma_udata_to_drv_context(
+               udata, struct nes_ucontext, ibucontext);
 
-       nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
-                       nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
-                       netdev_refcnt_read(nesvnic->netdev));
+       nes_debug(
+               NES_DBG_PD,
+               "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
+               nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev,
+               &nesucontext->ibucontext, netdev_refcnt_read(nesvnic->netdev));
 
        err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
                        nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
@@ -668,8 +669,7 @@ static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
 
        nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
 
-       if (context) {
-               nesucontext = to_nesucontext(context);
+       if (udata) {
                nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
                                NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
                nes_debug(NES_DBG_PD, "find_first_zero_biton doorbells returned %u, mapping pd_id %u.\n",
@@ -700,7 +700,7 @@ static int nes_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
 /**
  * nes_dealloc_pd
  */
-static void nes_dealloc_pd(struct ib_pd *ibpd)
+static void nes_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct nes_ucontext *nesucontext;
        struct nes_pd *nespd = to_nespd(ibpd);
@@ -708,8 +708,12 @@ static void nes_dealloc_pd(struct ib_pd *ibpd)
        struct nes_device *nesdev = nesvnic->nesdev;
        struct nes_adapter *nesadapter = nesdev->nesadapter;
 
-       if ((ibpd->uobject) && (ibpd->uobject->context)) {
-               nesucontext = to_nesucontext(ibpd->uobject->context);
+       if (udata) {
+               nesucontext =
+                       rdma_udata_to_drv_context(
+                               udata,
+                               struct nes_ucontext,
+                               ibucontext);
                nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
                                nespd->mmap_db_index);
                clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
@@ -1039,53 +1043,48 @@ static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
                                }
                                if (req.user_qp_buffer)
                                        nesqp->nesuqp_addr = req.user_qp_buffer;
-                               if (udata) {
-                                       nesqp->user_mode = 1;
-                                       if (virt_wqs) {
-                                               err = 1;
-                                               list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
-                                                       if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) {
-                                                               list_del(&nespbl->list);
-                                                               err = 0;
-                                                               nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
-                                                                         nespbl, nespbl->user_base);
-                                                               break;
-                                                       }
-                                               }
-                                               if (err) {
-                                                       nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
-                                                                 (long long unsigned int)req.user_wqe_buffers);
-                                                       nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                                       kfree(nesqp->allocated_buffer);
-                                                       return ERR_PTR(-EFAULT);
+
+                               nesqp->user_mode = 1;
+                               if (virt_wqs) {
+                                       err = 1;
+                                       list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
+                                               if (nespbl->user_base == (unsigned long )req.user_wqe_buffers) {
+                                                       list_del(&nespbl->list);
+                                                       err = 0;
+                                                       nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
+                                                                 nespbl, nespbl->user_base);
+                                                       break;
                                                }
                                        }
-
-                                       nesqp->mmap_sq_db_index =
-                                               find_next_zero_bit(nes_ucontext->allocated_wqs,
-                                                                  NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
-                                       /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
-                                                       nespd->mmap_db_index); */
-                                       if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
-                                               nes_debug(NES_DBG_QP,
-                                                         "db index > max user regions, failing create QP\n");
+                                       if (err) {
+                                               nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
+                                                         (long long unsigned int)req.user_wqe_buffers);
                                                nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
-                                               if (virt_wqs) {
-                                                       pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
-                                                                           nespbl->pbl_pbase);
-                                                       kfree(nespbl);
-                                               }
                                                kfree(nesqp->allocated_buffer);
-                                               return ERR_PTR(-ENOMEM);
+                                               return ERR_PTR(-EFAULT);
                                        }
-                                       set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
-                                       nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
-                                       nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
-                               } else {
+                               }
+
+                               nesqp->mmap_sq_db_index =
+                                       find_next_zero_bit(nes_ucontext->allocated_wqs,
+                                                          NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
+                               /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
+                                               nespd->mmap_db_index); */
+                               if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
+                                       nes_debug(NES_DBG_QP,
+                                                 "db index > max user regions, failing create QP\n");
                                        nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+                                       if (virt_wqs) {
+                                               pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+                                                                   nespbl->pbl_pbase);
+                                               kfree(nespbl);
+                                       }
                                        kfree(nesqp->allocated_buffer);
-                                       return ERR_PTR(-EFAULT);
+                                       return ERR_PTR(-ENOMEM);
                                }
+                               set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+                               nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
+                               nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
                        }
                        err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
                                        nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
@@ -1303,7 +1302,7 @@ static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq)
 /**
  * nes_destroy_qp
  */
-static int nes_destroy_qp(struct ib_qp *ibqp)
+static int nes_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct nes_qp *nesqp = to_nesqp(ibqp);
        struct nes_ucontext *nes_ucontext;
@@ -1343,8 +1342,12 @@ static int nes_destroy_qp(struct ib_qp *ibqp)
        }
 
        if (nesqp->user_mode) {
-               if ((ibqp->uobject)&&(ibqp->uobject->context)) {
-                       nes_ucontext = to_nesucontext(ibqp->uobject->context);
+               if (udata) {
+                       nes_ucontext =
+                               rdma_udata_to_drv_context(
+                                       udata,
+                                       struct nes_ucontext,
+                                       ibucontext);
                        clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
                        nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
                        if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
@@ -1373,7 +1376,6 @@ static int nes_destroy_qp(struct ib_qp *ibqp)
  */
 static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
                                   const struct ib_cq_init_attr *attr,
-                                  struct ib_ucontext *context,
                                   struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -1418,9 +1420,10 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
        nescq->hw_cq.cq_number = cq_num;
        nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
 
+       if (udata) {
+               struct nes_ucontext *nes_ucontext = rdma_udata_to_drv_context(
+                       udata, struct nes_ucontext, ibucontext);
 
-       if (context) {
-               nes_ucontext = to_nesucontext(context);
                if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
                        nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
                        kfree(nescq);
@@ -1487,7 +1490,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
        cqp_request = nes_get_cqp_request(nesdev);
        if (cqp_request == NULL) {
                nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
-               if (!context)
+               if (!udata)
                        pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
                                        nescq->hw_cq.cq_pbase);
                else {
@@ -1516,7 +1519,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
                        if (nesadapter->free_4kpbl == 0) {
                                spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
                                nes_free_cqp_request(nesdev, cqp_request);
-                               if (!context)
+                               if (!udata)
                                        pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
                                                        nescq->hw_cq.cq_pbase);
                                else {
@@ -1538,7 +1541,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
                        if (nesadapter->free_256pbl == 0) {
                                spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
                                nes_free_cqp_request(nesdev, cqp_request);
-                               if (!context)
+                               if (!udata)
                                        pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
                                                        nescq->hw_cq.cq_pbase);
                                else {
@@ -1564,7 +1567,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
        set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
                        (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
 
-       if (context) {
+       if (udata) {
                if (pbl_entries != 1)
                        u64temp = (u64)nespbl->pbl_pbase;
                else
@@ -1595,7 +1598,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
                        nescq->hw_cq.cq_number, ret);
        if ((!ret) || (cqp_request->major_code)) {
                nes_put_cqp_request(nesdev, cqp_request);
-               if (!context)
+               if (!udata)
                        pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
                                        nescq->hw_cq.cq_pbase);
                else {
@@ -1609,7 +1612,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
        }
        nes_put_cqp_request(nesdev, cqp_request);
 
-       if (context) {
+       if (udata) {
                /* free the nespbl */
                pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
                                nespbl->pbl_pbase);
@@ -1631,7 +1634,7 @@ static struct ib_cq *nes_create_cq(struct ib_device *ibdev,
 /**
  * nes_destroy_cq
  */
-static int nes_destroy_cq(struct ib_cq *ib_cq)
+static int nes_destroy_cq(struct ib_cq *ib_cq, struct ib_udata *udata)
 {
        struct nes_cq *nescq;
        struct nes_device *nesdev;
@@ -2382,7 +2385,7 @@ reg_user_mr_err:
 /**
  * nes_dereg_mr
  */
-static int nes_dereg_mr(struct ib_mr *ib_mr)
+static int nes_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct nes_mr *nesmr = to_nesmr(ib_mr);
        struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
@@ -3574,6 +3577,14 @@ static const struct ib_device_ops nes_dev_ops = {
        .get_dev_fw_str = get_dev_fw_str,
        .get_dma_mr = nes_get_dma_mr,
        .get_port_immutable = nes_port_immutable,
+       .iw_accept = nes_accept,
+       .iw_add_ref = nes_add_ref,
+       .iw_connect = nes_connect,
+       .iw_create_listen = nes_create_listen,
+       .iw_destroy_listen = nes_destroy_listen,
+       .iw_get_qp = nes_get_qp,
+       .iw_reject = nes_reject,
+       .iw_rem_ref = nes_rem_ref,
        .map_mr_sg = nes_map_mr_sg,
        .mmap = nes_mmap,
        .modify_qp = nes_modify_qp,
@@ -3638,23 +3649,9 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
        nesibdev->ibdev.num_comp_vectors = 1;
        nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
 
-       nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
-       if (nesibdev->ibdev.iwcm == NULL) {
-               ib_dealloc_device(&nesibdev->ibdev);
-               return NULL;
-       }
-       nesibdev->ibdev.iwcm->add_ref = nes_add_ref;
-       nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref;
-       nesibdev->ibdev.iwcm->get_qp = nes_get_qp;
-       nesibdev->ibdev.iwcm->connect = nes_connect;
-       nesibdev->ibdev.iwcm->accept = nes_accept;
-       nesibdev->ibdev.iwcm->reject = nes_reject;
-       nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
-       nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
-
        ib_set_device_ops(&nesibdev->ibdev, &nes_dev_ops);
-       memcpy(nesibdev->ibdev.iwcm->ifname, netdev->name,
-              sizeof(nesibdev->ibdev.iwcm->ifname));
+       memcpy(nesibdev->ibdev.iw_ifname, netdev->name,
+              sizeof(nesibdev->ibdev.iw_ifname));
 
        return nesibdev;
 }
@@ -3715,7 +3712,6 @@ void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
 
        nes_unregister_ofa_device(nesibdev);
 
-       kfree(nesibdev->ibdev.iwcm);
        ib_dealloc_device(&nesibdev->ibdev);
 }
 
index a729532..1d4ea13 100644 (file)
@@ -156,37 +156,34 @@ static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
        return status;
 }
 
-struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-                              u32 flags, struct ib_udata *udata)
+int ocrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+                    struct ib_udata *udata)
 {
        u32 *ahid_addr;
        int status;
-       struct ocrdma_ah *ah;
+       struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
        bool isvlan = false;
        u16 vlan_tag = 0xffff;
        const struct ib_gid_attr *sgid_attr;
-       struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-       struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+       struct ocrdma_pd *pd = get_ocrdma_pd(ibah->pd);
+       struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
        if ((attr->type != RDMA_AH_ATTR_TYPE_ROCE) ||
            !(rdma_ah_get_ah_flags(attr) & IB_AH_GRH))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (atomic_cmpxchg(&dev->update_sl, 1, 0))
                ocrdma_init_service_level(dev);
 
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
+       sgid_attr = attr->grh.sgid_attr;
+       status = rdma_read_gid_l2_fields(sgid_attr, &vlan_tag, NULL);
+       if (status)
+               return status;
 
        status = ocrdma_alloc_av(dev, ah);
        if (status)
                goto av_err;
 
-       sgid_attr = attr->grh.sgid_attr;
-       if (is_vlan_dev(sgid_attr->ndev))
-               vlan_tag = vlan_dev_vlan_id(sgid_attr->ndev);
-
        /* Get network header type for this GID */
        ah->hdr_type = rdma_gid_attr_network_type(sgid_attr);
 
@@ -210,23 +207,20 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
                                       OCRDMA_AH_VLAN_VALID_SHIFT);
        }
 
-       return &ah->ibah;
+       return 0;
 
 av_conf_err:
        ocrdma_free_av(dev, ah);
 av_err:
-       kfree(ah);
-       return ERR_PTR(status);
+       return status;
 }
 
-int ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
+void ocrdma_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
        struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
        struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
 
        ocrdma_free_av(dev, ah);
-       kfree(ah);
-       return 0;
 }
 
 int ocrdma_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
index eb996e1..64cb82c 100644 (file)
@@ -51,9 +51,9 @@ enum {
        OCRDMA_AH_L3_TYPE_SHIFT         = 0x1D /* 29 bits */
 };
 
-struct ib_ah *ocrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                              u32 flags, struct ib_udata *udata);
-int ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
+int ocrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+                    struct ib_udata *udata);
+void ocrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 int ocrdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
 
 int ocrdma_process_mad(struct ib_device *,
index 097e5ab..32674b2 100644 (file)
@@ -2496,7 +2496,7 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
        int status;
        struct rdma_ah_attr *ah_attr = &attrs->ah_attr;
        const struct ib_gid_attr *sgid_attr;
-       u32 vlan_id = 0xFFFF;
+       u16 vlan_id = 0xFFFF;
        u8 mac_addr[6], hdr_type;
        union {
                struct sockaddr     _sockaddr;
@@ -2526,8 +2526,9 @@ static int ocrdma_set_av_params(struct ocrdma_qp *qp,
               sizeof(cmd->params.dgid));
 
        sgid_attr = ah_attr->grh.sgid_attr;
-       vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev);
-       memcpy(mac_addr, sgid_attr->ndev->dev_addr, ETH_ALEN);
+       status = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, &mac_addr[0]);
+       if (status)
+               return status;
 
        qp->sgid_idx = grh->sgid_index;
        memcpy(&cmd->params.sgid[0], &sgid_attr->gid.raw[0],
@@ -2863,21 +2864,19 @@ int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
        return status;
 }
 
-int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
+void ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
 {
-       int status = -ENOMEM;
        struct ocrdma_destroy_srq *cmd;
        struct pci_dev *pdev = dev->nic_info.pdev;
        cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd));
        if (!cmd)
-               return status;
+               return;
        cmd->id = srq->id;
-       status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+       ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
        if (srq->rq.va)
                dma_free_coherent(&pdev->dev, srq->rq.len,
                                  srq->rq.va, srq->rq.pa);
        kfree(cmd);
-       return status;
 }
 
 static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
@@ -3067,13 +3066,12 @@ int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
        return status;
 }
 
-int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+void ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
 {
        unsigned long flags;
        spin_lock_irqsave(&dev->av_tbl.lock, flags);
        ah->av->valid = 0;
        spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
-       return 0;
 }
 
 static int ocrdma_create_eqs(struct ocrdma_dev *dev)
index ebc1f44..06ec593 100644 (file)
@@ -137,10 +137,10 @@ int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *,
                          struct ocrdma_pd *);
 int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *);
 int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *);
-int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *);
+void ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq);
 
-int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *);
-int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *);
+int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah);
+void ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah);
 
 int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,
                            enum ib_qp_state *old_ib_state);
index b9e10d5..fc6c096 100644 (file)
@@ -62,8 +62,6 @@ MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION);
 MODULE_AUTHOR("Emulex Corporation");
 MODULE_LICENSE("Dual BSD/GPL");
 
-static DEFINE_IDR(ocrdma_dev_id);
-
 void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
 {
        u8 mac_addr[6];
@@ -161,7 +159,6 @@ static const struct ib_device_ops ocrdma_dev_ops = {
        .get_dev_fw_str = get_dev_fw_str,
        .get_dma_mr = ocrdma_get_dma_mr,
        .get_link_layer = ocrdma_link_layer,
-       .get_netdev = ocrdma_get_netdev,
        .get_port_immutable = ocrdma_port_immutable,
        .map_mr_sg = ocrdma_map_mr_sg,
        .mmap = ocrdma_mmap,
@@ -179,6 +176,8 @@ static const struct ib_device_ops ocrdma_dev_ops = {
        .reg_user_mr = ocrdma_reg_user_mr,
        .req_notify_cq = ocrdma_arm_cq,
        .resize_cq = ocrdma_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, ocrdma_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, ocrdma_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, ocrdma_ucontext, ibucontext),
 };
@@ -189,10 +188,14 @@ static const struct ib_device_ops ocrdma_dev_srq_ops = {
        .modify_srq = ocrdma_modify_srq,
        .post_srq_recv = ocrdma_post_srq_recv,
        .query_srq = ocrdma_query_srq,
+
+       INIT_RDMA_OBJ_SIZE(ib_srq, ocrdma_srq, ibsrq),
 };
 
 static int ocrdma_register_device(struct ocrdma_dev *dev)
 {
+       int ret;
+
        ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid);
        BUILD_BUG_ON(sizeof(OCRDMA_NODE_DESC) > IB_DEVICE_NODE_DESC_MAX);
        memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
@@ -247,6 +250,10 @@ static int ocrdma_register_device(struct ocrdma_dev *dev)
        }
        rdma_set_device_sysfs_group(&dev->ibdev, &ocrdma_attr_group);
        dev->ibdev.driver_id = RDMA_DRIVER_OCRDMA;
+       ret = ib_device_set_netdev(&dev->ibdev, dev->nic_info.netdev, 1);
+       if (ret)
+               return ret;
+
        return ib_register_device(&dev->ibdev, "ocrdma%d");
 }
 
@@ -304,15 +311,13 @@ static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
                pr_err("Unable to allocate ib device\n");
                return NULL;
        }
+
        dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL);
        if (!dev->mbx_cmd)
-               goto idr_err;
+               goto init_err;
 
        memcpy(&dev->nic_info, dev_info, sizeof(*dev_info));
-       dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL);
-       if (dev->id < 0)
-               goto idr_err;
-
+       dev->id = PCI_FUNC(dev->nic_info.pdev->devfn);
        status = ocrdma_init_hw(dev);
        if (status)
                goto init_err;
@@ -349,8 +354,6 @@ alloc_err:
        ocrdma_free_resources(dev);
        ocrdma_cleanup_hw(dev);
 init_err:
-       idr_remove(&ocrdma_dev_id, dev->id);
-idr_err:
        kfree(dev->mbx_cmd);
        ib_dealloc_device(&dev->ibdev);
        pr_err("%s() leaving. ret=%d\n", __func__, status);
@@ -360,7 +363,6 @@ idr_err:
 static void ocrdma_remove_free(struct ocrdma_dev *dev)
 {
 
-       idr_remove(&ocrdma_dev_id, dev->id);
        kfree(dev->mbx_cmd);
        ib_dealloc_device(&dev->ibdev);
 }
@@ -465,7 +467,6 @@ static void __exit ocrdma_exit_module(void)
 {
        be_roce_unregister_driver(&ocrdma_drv);
        ocrdma_rem_debugfs();
-       idr_destroy(&ocrdma_dev_id);
 }
 
 module_init(ocrdma_init_module);
index b4e1777..35ec870 100644 (file)
@@ -47,6 +47,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "ocrdma.h"
 #include "ocrdma_hw.h"
@@ -112,24 +113,6 @@ int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr,
        return 0;
 }
 
-struct net_device *ocrdma_get_netdev(struct ib_device *ibdev, u8 port_num)
-{
-       struct ocrdma_dev *dev;
-       struct net_device *ndev = NULL;
-
-       rcu_read_lock();
-
-       dev = get_ocrdma_dev(ibdev);
-       if (dev)
-               ndev = dev->nic_info.netdev;
-       if (ndev)
-               dev_hold(ndev);
-
-       rcu_read_unlock();
-
-       return ndev;
-}
-
 static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
                                            u8 *ib_speed, u8 *ib_width)
 {
@@ -367,6 +350,16 @@ static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
        return status;
 }
 
+/*
+ * NOTE:
+ *
+ * ocrdma_ucontext must be used here because this function is also
+ * called from ocrdma_alloc_ucontext where ib_udata does not have
+ * valid ib_ucontext pointer. ib_uverbs_get_context does not call
+ * uobj_{alloc|get_xxx} helpers which are used to store the
+ * ib_ucontext in uverbs_attr_bundle wrapping the ib_udata. so
+ * ib_udata does NOT imply valid ib_ucontext here!
+ */
 static int _ocrdma_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
                            struct ocrdma_ucontext *uctx,
                            struct ib_udata *udata)
@@ -593,7 +586,6 @@ int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 }
 
 static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
-                               struct ib_ucontext *ib_ctx,
                                struct ib_udata *udata)
 {
        int status;
@@ -601,7 +593,8 @@ static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
        u64 dpp_page_addr = 0;
        u32 db_page_size;
        struct ocrdma_alloc_pd_uresp rsp;
-       struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+       struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+               udata, struct ocrdma_ucontext, ibucontext);
 
        memset(&rsp, 0, sizeof(rsp));
        rsp.id = pd->id;
@@ -639,18 +632,17 @@ dpp_map_err:
        return status;
 }
 
-int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                   struct ib_udata *udata)
+int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
        struct ocrdma_pd *pd;
-       struct ocrdma_ucontext *uctx = NULL;
        int status;
        u8 is_uctx_pd = false;
+       struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+               udata, struct ocrdma_ucontext, ibucontext);
 
-       if (udata && context) {
-               uctx = get_ocrdma_ucontext(context);
+       if (udata) {
                pd = ocrdma_get_ucontext_pd(uctx);
                if (pd) {
                        is_uctx_pd = true;
@@ -664,8 +656,8 @@ int ocrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
                goto exit;
 
 pd_mapping:
-       if (udata && context) {
-               status = ocrdma_copy_pd_uresp(dev, pd, context, udata);
+       if (udata) {
+               status = ocrdma_copy_pd_uresp(dev, pd, udata);
                if (status)
                        goto err;
        }
@@ -680,7 +672,7 @@ exit:
        return status;
 }
 
-void ocrdma_dealloc_pd(struct ib_pd *ibpd)
+void ocrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
        struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
@@ -922,7 +914,7 @@ umem_err:
        return ERR_PTR(status);
 }
 
-int ocrdma_dereg_mr(struct ib_mr *ib_mr)
+int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr);
        struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device);
@@ -946,13 +938,17 @@ int ocrdma_dereg_mr(struct ib_mr *ib_mr)
 }
 
 static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
-                               struct ib_udata *udata,
-                               struct ib_ucontext *ib_ctx)
+                               struct ib_udata *udata)
 {
        int status;
-       struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+       struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+               udata, struct ocrdma_ucontext, ibucontext);
        struct ocrdma_create_cq_uresp uresp;
 
+       /* this must be user flow! */
+       if (!udata)
+               return -EINVAL;
+
        memset(&uresp, 0, sizeof(uresp));
        uresp.cq_id = cq->id;
        uresp.page_size = PAGE_ALIGN(cq->len);
@@ -983,13 +979,13 @@ err:
 
 struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
                               const struct ib_cq_init_attr *attr,
-                              struct ib_ucontext *ib_ctx,
                               struct ib_udata *udata)
 {
        int entries = attr->cqe;
        struct ocrdma_cq *cq;
        struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
-       struct ocrdma_ucontext *uctx = NULL;
+       struct ocrdma_ucontext *uctx = rdma_udata_to_drv_context(
+               udata, struct ocrdma_ucontext, ibucontext);
        u16 pd_id = 0;
        int status;
        struct ocrdma_create_cq_ureq ureq;
@@ -1011,18 +1007,16 @@ struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
        INIT_LIST_HEAD(&cq->sq_head);
        INIT_LIST_HEAD(&cq->rq_head);
 
-       if (ib_ctx) {
-               uctx = get_ocrdma_ucontext(ib_ctx);
+       if (udata)
                pd_id = uctx->cntxt_pd->id;
-       }
 
        status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
        if (status) {
                kfree(cq);
                return ERR_PTR(status);
        }
-       if (ib_ctx) {
-               status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx);
+       if (udata) {
+               status = ocrdma_copy_cq_uresp(dev, cq, udata);
                if (status)
                        goto ctx_err;
        }
@@ -1076,7 +1070,7 @@ static void ocrdma_flush_cq(struct ocrdma_cq *cq)
        spin_unlock_irqrestore(&cq->cq_lock, flags);
 }
 
-int ocrdma_destroy_cq(struct ib_cq *ibcq)
+int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
        struct ocrdma_eq *eq = NULL;
@@ -1697,7 +1691,7 @@ void ocrdma_del_flush_qp(struct ocrdma_qp *qp)
        spin_unlock_irqrestore(&dev->flush_q_lock, flags);
 }
 
-int ocrdma_destroy_qp(struct ib_qp *ibqp)
+int ocrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct ocrdma_pd *pd;
        struct ocrdma_qp *qp;
@@ -1793,45 +1787,43 @@ static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
        return status;
 }
 
-struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd,
-                                struct ib_srq_init_attr *init_attr,
-                                struct ib_udata *udata)
+int ocrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+                     struct ib_udata *udata)
 {
-       int status = -ENOMEM;
-       struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
-       struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
-       struct ocrdma_srq *srq;
+       int status;
+       struct ocrdma_pd *pd = get_ocrdma_pd(ibsrq->pd);
+       struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
+       struct ocrdma_srq *srq = get_ocrdma_srq(ibsrq);
 
        if (init_attr->attr.max_sge > dev->attr.max_recv_sge)
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        if (init_attr->attr.max_wr > dev->attr.max_rqe)
-               return ERR_PTR(-EINVAL);
-
-       srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(status);
+               return -EINVAL;
 
        spin_lock_init(&srq->q_lock);
        srq->pd = pd;
        srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size);
        status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd);
        if (status)
-               goto err;
+               return status;
 
-       if (udata == NULL) {
-               status = -ENOMEM;
+       if (!udata) {
                srq->rqe_wr_id_tbl = kcalloc(srq->rq.max_cnt, sizeof(u64),
                                             GFP_KERNEL);
-               if (srq->rqe_wr_id_tbl == NULL)
+               if (!srq->rqe_wr_id_tbl) {
+                       status = -ENOMEM;
                        goto arm_err;
+               }
 
                srq->bit_fields_len = (srq->rq.max_cnt / 32) +
                    (srq->rq.max_cnt % 32 ? 1 : 0);
                srq->idx_bit_fields =
                    kmalloc_array(srq->bit_fields_len, sizeof(u32),
                                  GFP_KERNEL);
-               if (srq->idx_bit_fields == NULL)
+               if (!srq->idx_bit_fields) {
+                       status = -ENOMEM;
                        goto arm_err;
+               }
                memset(srq->idx_bit_fields, 0xff,
                       srq->bit_fields_len * sizeof(u32));
        }
@@ -1848,15 +1840,13 @@ struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd,
                        goto arm_err;
        }
 
-       return &srq->ibsrq;
+       return 0;
 
 arm_err:
        ocrdma_mbx_destroy_srq(dev, srq);
-err:
        kfree(srq->rqe_wr_id_tbl);
        kfree(srq->idx_bit_fields);
-       kfree(srq);
-       return ERR_PTR(status);
+       return status;
 }
 
 int ocrdma_modify_srq(struct ib_srq *ibsrq,
@@ -1885,15 +1875,14 @@ int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
        return status;
 }
 
-int ocrdma_destroy_srq(struct ib_srq *ibsrq)
+void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
-       int status;
        struct ocrdma_srq *srq;
        struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
 
        srq = get_ocrdma_srq(ibsrq);
 
-       status = ocrdma_mbx_destroy_srq(dev, srq);
+       ocrdma_mbx_destroy_srq(dev, srq);
 
        if (srq->pd->uctx)
                ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
@@ -1901,8 +1890,6 @@ int ocrdma_destroy_srq(struct ib_srq *ibsrq)
 
        kfree(srq->idx_bit_fields);
        kfree(srq->rqe_wr_id_tbl);
-       kfree(srq);
-       return status;
 }
 
 /* unprivileged verbs and their support functions. */
@@ -2931,9 +2918,8 @@ int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
        return 0;
 }
 
-struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd,
-                             enum ib_mr_type mr_type,
-                             u32 max_num_sg)
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+                             u32 max_num_sg, struct ib_udata *udata)
 {
        int status;
        struct ocrdma_mr *mr;
index 4c04ab4..d76aae7 100644 (file)
@@ -61,7 +61,6 @@ enum rdma_protocol_type
 ocrdma_query_protocol(struct ib_device *device, u8 port_num);
 
 void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
-struct net_device *ocrdma_get_netdev(struct ib_device *device, u8 port_num);
 int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
 
 int ocrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
@@ -69,16 +68,14 @@ void ocrdma_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
 
-int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
-                   struct ib_udata *udata);
-void ocrdma_dealloc_pd(struct ib_pd *pd);
+int ocrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void ocrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev,
                               const struct ib_cq_init_attr *attr,
-                              struct ib_ucontext *ib_ctx,
                               struct ib_udata *udata);
 int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int ocrdma_destroy_cq(struct ib_cq *);
+int ocrdma_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 
 struct ib_qp *ocrdma_create_qp(struct ib_pd *,
                               struct ib_qp_init_attr *attrs,
@@ -90,25 +87,24 @@ int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
 int ocrdma_query_qp(struct ib_qp *,
                    struct ib_qp_attr *qp_attr,
                    int qp_attr_mask, struct ib_qp_init_attr *);
-int ocrdma_destroy_qp(struct ib_qp *);
+int ocrdma_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 void ocrdma_del_flush_qp(struct ocrdma_qp *qp);
 
-struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
-                                struct ib_udata *);
+int ocrdma_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *attr,
+                     struct ib_udata *udata);
 int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *,
                      enum ib_srq_attr_mask, struct ib_udata *);
 int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *);
-int ocrdma_destroy_srq(struct ib_srq *);
+void ocrdma_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int ocrdma_post_srq_recv(struct ib_srq *, const struct ib_recv_wr *,
                         const struct ib_recv_wr **bad_recv_wr);
 
-int ocrdma_dereg_mr(struct ib_mr *);
+int ocrdma_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
 struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
                                 u64 virt, int acc, struct ib_udata *);
-struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd,
-                             enum ib_mr_type mr_type,
-                             u32 max_num_sg);
+struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                             u32 max_num_sg, struct ib_udata *udata);
 int ocrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
                     unsigned int *sg_offset);
 
index 996d9ec..083c2c0 100644 (file)
@@ -39,7 +39,6 @@
 #include <linux/iommu.h>
 #include <linux/pci.h>
 #include <net/addrconf.h>
-#include <linux/idr.h>
 
 #include <linux/qed/qed_chain.h>
 #include <linux/qed/qed_if.h>
@@ -82,20 +81,6 @@ static void qedr_get_dev_fw_str(struct ib_device *ibdev, char *str)
                 (fw_ver >> 8) & 0xFF, fw_ver & 0xFF);
 }
 
-static struct net_device *qedr_get_netdev(struct ib_device *dev, u8 port_num)
-{
-       struct qedr_dev *qdev;
-
-       qdev = get_qedr_dev(dev);
-       dev_hold(qdev->ndev);
-
-       /* The HW vendor's device driver must guarantee
-        * that this function returns NULL before the net device has finished
-        * NETDEV_UNREGISTER state.
-        */
-       return qdev->ndev;
-}
-
 static int qedr_roce_port_immutable(struct ib_device *ibdev, u8 port_num,
                                    struct ib_port_immutable *immutable)
 {
@@ -163,6 +148,14 @@ static const struct attribute_group qedr_attr_group = {
 
 static const struct ib_device_ops qedr_iw_dev_ops = {
        .get_port_immutable = qedr_iw_port_immutable,
+       .iw_accept = qedr_iw_accept,
+       .iw_add_ref = qedr_iw_qp_add_ref,
+       .iw_connect = qedr_iw_connect,
+       .iw_create_listen = qedr_iw_create_listen,
+       .iw_destroy_listen = qedr_iw_destroy_listen,
+       .iw_get_qp = qedr_iw_get_qp,
+       .iw_reject = qedr_iw_reject,
+       .iw_rem_ref = qedr_iw_qp_rem_ref,
        .query_gid = qedr_iw_query_gid,
 };
 
@@ -172,21 +165,8 @@ static int qedr_iw_register_device(struct qedr_dev *dev)
 
        ib_set_device_ops(&dev->ibdev, &qedr_iw_dev_ops);
 
-       dev->ibdev.iwcm = kzalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
-       if (!dev->ibdev.iwcm)
-               return -ENOMEM;
-
-       dev->ibdev.iwcm->connect = qedr_iw_connect;
-       dev->ibdev.iwcm->accept = qedr_iw_accept;
-       dev->ibdev.iwcm->reject = qedr_iw_reject;
-       dev->ibdev.iwcm->create_listen = qedr_iw_create_listen;
-       dev->ibdev.iwcm->destroy_listen = qedr_iw_destroy_listen;
-       dev->ibdev.iwcm->add_ref = qedr_iw_qp_add_ref;
-       dev->ibdev.iwcm->rem_ref = qedr_iw_qp_rem_ref;
-       dev->ibdev.iwcm->get_qp = qedr_iw_get_qp;
-
-       memcpy(dev->ibdev.iwcm->ifname,
-              dev->ndev->name, sizeof(dev->ibdev.iwcm->ifname));
+       memcpy(dev->ibdev.iw_ifname,
+              dev->ndev->name, sizeof(dev->ibdev.iw_ifname));
 
        return 0;
 }
@@ -220,7 +200,6 @@ static const struct ib_device_ops qedr_dev_ops = {
        .get_dev_fw_str = qedr_get_dev_fw_str,
        .get_dma_mr = qedr_get_dma_mr,
        .get_link_layer = qedr_link_layer,
-       .get_netdev = qedr_get_netdev,
        .map_mr_sg = qedr_map_mr_sg,
        .mmap = qedr_mmap,
        .modify_port = qedr_modify_port,
@@ -239,7 +218,10 @@ static const struct ib_device_ops qedr_dev_ops = {
        .reg_user_mr = qedr_reg_user_mr,
        .req_notify_cq = qedr_arm_cq,
        .resize_cq = qedr_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, qedr_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, qedr_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, qedr_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, qedr_ucontext, ibucontext),
 };
 
@@ -293,6 +275,10 @@ static int qedr_register_device(struct qedr_dev *dev)
        ib_set_device_ops(&dev->ibdev, &qedr_dev_ops);
 
        dev->ibdev.driver_id = RDMA_DRIVER_QEDR;
+       rc = ib_device_set_netdev(&dev->ibdev, dev->ndev, 1);
+       if (rc)
+               return rc;
+
        return ib_register_device(&dev->ibdev, "qedr%d");
 }
 
@@ -364,8 +350,7 @@ static int qedr_alloc_resources(struct qedr_dev *dev)
        spin_lock_init(&dev->sgid_lock);
 
        if (IS_IWARP(dev)) {
-               spin_lock_init(&dev->qpidr.idr_lock);
-               idr_init(&dev->qpidr.idr);
+               xa_init_flags(&dev->qps, XA_FLAGS_LOCK_IRQ);
                dev->iwarp_wq = create_singlethread_workqueue("qedr_iwarpq");
        }
 
@@ -760,8 +745,8 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle)
                break;
        case EVENT_TYPE_SRQ:
                srq_id = (u16)roce_handle64;
-               spin_lock_irqsave(&dev->srqidr.idr_lock, flags);
-               srq = idr_find(&dev->srqidr.idr, srq_id);
+               xa_lock_irqsave(&dev->srqs, flags);
+               srq = xa_load(&dev->srqs, srq_id);
                if (srq) {
                        ibsrq = &srq->ibsrq;
                        if (ibsrq->event_handler) {
@@ -775,7 +760,7 @@ static void qedr_affiliated_event(void *context, u8 e_code, void *fw_handle)
                                  "SRQ event with NULL pointer ibsrq. Handle=%llx\n",
                                  roce_handle64);
                }
-               spin_unlock_irqrestore(&dev->srqidr.idr_lock, flags);
+               xa_unlock_irqrestore(&dev->srqs, flags);
                DP_NOTICE(dev, "SRQ event %d on handle %p\n", e_code, srq);
        default:
                break;
index 53bbe6b..6175d1e 100644 (file)
@@ -33,7 +33,7 @@
 #define __QEDR_H__
 
 #include <linux/pci.h>
-#include <linux/idr.h>
+#include <linux/xarray.h>
 #include <rdma/ib_addr.h>
 #include <linux/qed/qed_if.h>
 #include <linux/qed/qed_chain.h>
@@ -123,11 +123,6 @@ struct qedr_device_attr {
 
 #define QEDR_ENET_STATE_BIT    (0)
 
-struct qedr_idr {
-       spinlock_t idr_lock; /* Protect idr data-structure */
-       struct idr idr;
-};
-
 struct qedr_dev {
        struct ib_device        ibdev;
        struct qed_dev          *cdev;
@@ -171,8 +166,8 @@ struct qedr_dev {
        struct qedr_cq          *gsi_rqcq;
        struct qedr_qp          *gsi_qp;
        enum qed_rdma_type      rdma_type;
-       struct qedr_idr         qpidr;
-       struct qedr_idr         srqidr;
+       struct xarray           qps;
+       struct xarray           srqs;
        struct workqueue_struct *iwarp_wq;
        u16                     iwarp_max_mtu;
 
index 0555e5a..22881d4 100644 (file)
@@ -491,7 +491,7 @@ int qedr_iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
        int rc = 0;
        int i;
 
-       qp = idr_find(&dev->qpidr.idr, conn_param->qpn);
+       qp = xa_load(&dev->qps, conn_param->qpn);
        if (unlikely(!qp))
                return -EINVAL;
 
@@ -681,7 +681,7 @@ int qedr_iw_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
 
        DP_DEBUG(dev, QEDR_MSG_IWARP, "Accept on qpid=%d\n", conn_param->qpn);
 
-       qp = idr_find(&dev->qpidr.idr, conn_param->qpn);
+       qp = xa_load(&dev->qps, conn_param->qpn);
        if (!qp) {
                DP_ERR(dev, "Invalid QP number %d\n", conn_param->qpn);
                return -EINVAL;
@@ -739,9 +739,7 @@ void qedr_iw_qp_rem_ref(struct ib_qp *ibqp)
        struct qedr_qp *qp = get_qedr_qp(ibqp);
 
        if (atomic_dec_and_test(&qp->refcnt)) {
-               spin_lock_irq(&qp->dev->qpidr.idr_lock);
-               idr_remove(&qp->dev->qpidr.idr, qp->qp_id);
-               spin_unlock_irq(&qp->dev->qpidr.idr_lock);
+               xa_erase_irq(&qp->dev->qps, qp->qp_id);
                kfree(qp);
        }
 }
@@ -750,5 +748,5 @@ struct ib_qp *qedr_iw_get_qp(struct ib_device *ibdev, int qpn)
 {
        struct qedr_dev *dev = get_qedr_dev(ibdev);
 
-       return idr_find(&dev->qpidr.idr, qpn);
+       return xa_load(&dev->qps, qpn);
 }
index e1ac2fd..f5542d7 100644 (file)
@@ -397,14 +397,17 @@ static inline int qedr_gsi_build_header(struct qedr_dev *dev,
        bool has_udp = false;
        int i;
 
-       send_size = 0;
-       for (i = 0; i < swr->num_sge; ++i)
-               send_size += swr->sg_list[i].length;
+       rc = rdma_read_gid_l2_fields(sgid_attr, &vlan_id, NULL);
+       if (rc)
+               return rc;
 
-       vlan_id = rdma_vlan_dev_vlan_id(sgid_attr->ndev);
        if (vlan_id < VLAN_CFI_MASK)
                has_vlan = true;
 
+       send_size = 0;
+       for (i = 0; i < swr->num_sge; ++i)
+               send_size += swr->sg_list[i].length;
+
        has_udp = (sgid_attr->gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP);
        if (!has_udp) {
                /* RoCE v1 */
index 8686a98..3d7bde1 100644 (file)
@@ -42,6 +42,7 @@
 #include <rdma/ib_umem.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_cache.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include <linux/qed/common_hsi.h>
 #include "qedr_hsi_rdma.h"
@@ -436,8 +437,7 @@ int qedr_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
                                  vma->vm_page_prot);
 }
 
-int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                 struct ib_udata *udata)
+int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct qedr_dev *dev = get_qedr_dev(ibdev);
@@ -446,7 +446,7 @@ int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        int rc;
 
        DP_DEBUG(dev, QEDR_MSG_INIT, "Function called from: %s\n",
-                (udata && context) ? "User Lib" : "Kernel");
+                udata ? "User Lib" : "Kernel");
 
        if (!dev->rdma_ctx) {
                DP_ERR(dev, "invalid RDMA context\n");
@@ -459,10 +459,12 @@ int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
 
        pd->pd_id = pd_id;
 
-       if (udata && context) {
+       if (udata) {
                struct qedr_alloc_pd_uresp uresp = {
                        .pd_id = pd_id,
                };
+               struct qedr_ucontext *context = rdma_udata_to_drv_context(
+                       udata, struct qedr_ucontext, ibucontext);
 
                rc = qedr_ib_copy_to_udata(udata, &uresp, sizeof(uresp));
                if (rc) {
@@ -471,14 +473,14 @@ int qedr_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
                        return rc;
                }
 
-               pd->uctx = get_qedr_ucontext(context);
+               pd->uctx = context;
                pd->uctx->pd = pd;
        }
 
        return 0;
 }
 
-void qedr_dealloc_pd(struct ib_pd *ibpd)
+void qedr_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct qedr_dev *dev = get_qedr_dev(ibpd->device);
        struct qedr_pd *pd = get_qedr_pd(ibpd);
@@ -813,9 +815,10 @@ int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
 
 struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
                             const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *ib_ctx, struct ib_udata *udata)
+                            struct ib_udata *udata)
 {
-       struct qedr_ucontext *ctx = get_qedr_ucontext(ib_ctx);
+       struct qedr_ucontext *ctx = rdma_udata_to_drv_context(
+               udata, struct qedr_ucontext, ibucontext);
        struct qed_rdma_destroy_cq_out_params destroy_oparams;
        struct qed_rdma_destroy_cq_in_params destroy_iparams;
        struct qedr_dev *dev = get_qedr_dev(ibdev);
@@ -903,7 +906,7 @@ struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
        cq->sig = QEDR_CQ_MAGIC_NUMBER;
        spin_lock_init(&cq->cq_lock);
 
-       if (ib_ctx) {
+       if (udata) {
                rc = qedr_copy_cq_uresp(dev, cq, udata);
                if (rc)
                        goto err3;
@@ -959,7 +962,7 @@ int qedr_resize_cq(struct ib_cq *ibcq, int new_cnt, struct ib_udata *udata)
 #define QEDR_DESTROY_CQ_MAX_ITERATIONS         (10)
 #define QEDR_DESTROY_CQ_ITER_DURATION          (10)
 
-int qedr_destroy_cq(struct ib_cq *ibcq)
+int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct qedr_dev *dev = get_qedr_dev(ibcq->device);
        struct qed_rdma_destroy_cq_out_params oparams;
@@ -983,7 +986,7 @@ int qedr_destroy_cq(struct ib_cq *ibcq)
 
        dev->ops->common->chain_free(dev->cdev, &cq->pbl);
 
-       if (ibcq->uobject && ibcq->uobject->context) {
+       if (udata) {
                qedr_free_pbl(dev, &cq->q.pbl_info, cq->q.pbl_tbl);
                ib_umem_release(cq->q.umem);
        }
@@ -1044,10 +1047,13 @@ static inline int get_gid_info_from_table(struct ib_qp *ibqp,
        enum rdma_network_type nw_type;
        const struct ib_global_route *grh = rdma_ah_read_grh(&attr->ah_attr);
        u32 ipv4_addr;
+       int ret;
        int i;
 
        gid_attr = grh->sgid_attr;
-       qp_params->vlan_id = rdma_vlan_dev_vlan_id(gid_attr->ndev);
+       ret = rdma_read_gid_l2_fields(gid_attr, &qp_params->vlan_id, NULL);
+       if (ret)
+               return ret;
 
        nw_type = rdma_gid_attr_network_type(gid_attr);
        switch (nw_type) {
@@ -1261,7 +1267,7 @@ static void qedr_set_roce_db_info(struct qedr_dev *dev, struct qedr_qp *qp)
        }
 }
 
-static int qedr_check_srq_params(struct ib_pd *ibpd, struct qedr_dev *dev,
+static int qedr_check_srq_params(struct qedr_dev *dev,
                                 struct ib_srq_init_attr *attrs,
                                 struct ib_udata *udata)
 {
@@ -1377,38 +1383,28 @@ err0:
        return rc;
 }
 
-static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr,
-                       void *ptr, u32 id);
-static void qedr_idr_remove(struct qedr_dev *dev,
-                           struct qedr_idr *qidr, u32 id);
-
-struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
-                              struct ib_srq_init_attr *init_attr,
-                              struct ib_udata *udata)
+int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+                   struct ib_udata *udata)
 {
        struct qed_rdma_destroy_srq_in_params destroy_in_params;
        struct qed_rdma_create_srq_in_params in_params = {};
-       struct qedr_dev *dev = get_qedr_dev(ibpd->device);
+       struct qedr_dev *dev = get_qedr_dev(ibsrq->device);
        struct qed_rdma_create_srq_out_params out_params;
-       struct qedr_pd *pd = get_qedr_pd(ibpd);
+       struct qedr_pd *pd = get_qedr_pd(ibsrq->pd);
        struct qedr_create_srq_ureq ureq = {};
        u64 pbl_base_addr, phy_prod_pair_addr;
        struct qedr_srq_hwq_info *hw_srq;
        u32 page_cnt, page_size;
-       struct qedr_srq *srq;
+       struct qedr_srq *srq = get_qedr_srq(ibsrq);
        int rc = 0;
 
        DP_DEBUG(dev, QEDR_MSG_QP,
                 "create SRQ called from %s (pd %p)\n",
                 (udata) ? "User lib" : "kernel", pd);
 
-       rc = qedr_check_srq_params(ibpd, dev, init_attr, udata);
+       rc = qedr_check_srq_params(dev, init_attr, udata);
        if (rc)
-               return ERR_PTR(-EINVAL);
-
-       srq = kzalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        srq->dev = dev;
        hw_srq = &srq->hw_srq;
@@ -1464,13 +1460,13 @@ struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
                        goto err2;
        }
 
-       rc = qedr_idr_add(dev, &dev->srqidr, srq, srq->srq_id);
+       rc = xa_insert_irq(&dev->srqs, srq->srq_id, srq, GFP_KERNEL);
        if (rc)
                goto err2;
 
        DP_DEBUG(dev, QEDR_MSG_SRQ,
                 "create srq: created srq with srq_id=0x%0x\n", srq->srq_id);
-       return &srq->ibsrq;
+       return 0;
 
 err2:
        destroy_in_params.srq_id = srq->srq_id;
@@ -1482,18 +1478,16 @@ err1:
        else
                qedr_free_srq_kernel_params(srq);
 err0:
-       kfree(srq);
-
-       return ERR_PTR(-EFAULT);
+       return -EFAULT;
 }
 
-int qedr_destroy_srq(struct ib_srq *ibsrq)
+void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
        struct qed_rdma_destroy_srq_in_params in_params = {};
        struct qedr_dev *dev = get_qedr_dev(ibsrq->device);
        struct qedr_srq *srq = get_qedr_srq(ibsrq);
 
-       qedr_idr_remove(dev, &dev->srqidr, srq->srq_id);
+       xa_erase_irq(&dev->srqs, srq->srq_id);
        in_params.srq_id = srq->srq_id;
        dev->ops->rdma_destroy_srq(dev->rdma_ctx, &in_params);
 
@@ -1505,9 +1499,6 @@ int qedr_destroy_srq(struct ib_srq *ibsrq)
        DP_DEBUG(dev, QEDR_MSG_SRQ,
                 "destroy srq: destroyed srq with srq_id=0x%0x\n",
                 srq->srq_id);
-       kfree(srq);
-
-       return 0;
 }
 
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -1593,29 +1584,6 @@ static inline void qedr_qp_user_print(struct qedr_dev *dev, struct qedr_qp *qp)
                 qp->usq.buf_len, qp->urq.buf_addr, qp->urq.buf_len);
 }
 
-static int qedr_idr_add(struct qedr_dev *dev, struct qedr_idr *qidr,
-                       void *ptr, u32 id)
-{
-       int rc;
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irq(&qidr->idr_lock);
-
-       rc = idr_alloc(&qidr->idr, ptr, id, id + 1, GFP_ATOMIC);
-
-       spin_unlock_irq(&qidr->idr_lock);
-       idr_preload_end();
-
-       return rc < 0 ? rc : 0;
-}
-
-static void qedr_idr_remove(struct qedr_dev *dev, struct qedr_idr *qidr, u32 id)
-{
-       spin_lock_irq(&qidr->idr_lock);
-       idr_remove(&qidr->idr, id);
-       spin_unlock_irq(&qidr->idr_lock);
-}
-
 static inline void
 qedr_iwarp_populate_user_qp(struct qedr_dev *dev,
                            struct qedr_qp *qp,
@@ -1985,7 +1953,7 @@ struct ib_qp *qedr_create_qp(struct ib_pd *ibpd,
        qp->ibqp.qp_num = qp->qp_id;
 
        if (rdma_protocol_iwarp(&dev->ibdev, 1)) {
-               rc = qedr_idr_add(dev, &dev->qpidr, qp, qp->qp_id);
+               rc = xa_insert_irq(&dev->qps, qp->qp_id, qp, GFP_KERNEL);
                if (rc)
                        goto err;
        }
@@ -2493,7 +2461,8 @@ err:
        return rc;
 }
 
-static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp)
+static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp,
+                                 struct ib_udata *udata)
 {
        int rc = 0;
 
@@ -2503,7 +2472,7 @@ static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp)
                        return rc;
        }
 
-       if (qp->ibqp.uobject && qp->ibqp.uobject->context)
+       if (udata)
                qedr_cleanup_user(dev, qp);
        else
                qedr_cleanup_kernel(dev, qp);
@@ -2511,7 +2480,7 @@ static int qedr_free_qp_resources(struct qedr_dev *dev, struct qedr_qp *qp)
        return 0;
 }
 
-int qedr_destroy_qp(struct ib_qp *ibqp)
+int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct qedr_qp *qp = get_qedr_qp(ibqp);
        struct qedr_dev *dev = qp->dev;
@@ -2555,37 +2524,31 @@ int qedr_destroy_qp(struct ib_qp *ibqp)
        if (qp->qp_type == IB_QPT_GSI)
                qedr_destroy_gsi_qp(dev);
 
-       qedr_free_qp_resources(dev, qp);
+       qedr_free_qp_resources(dev, qp, udata);
 
        if (atomic_dec_and_test(&qp->refcnt) &&
            rdma_protocol_iwarp(&dev->ibdev, 1)) {
-               qedr_idr_remove(dev, &dev->qpidr, qp->qp_id);
+               xa_erase_irq(&dev->qps, qp->qp_id);
                kfree(qp);
        }
        return rc;
 }
 
-struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-                            u32 flags, struct ib_udata *udata)
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+                  struct ib_udata *udata)
 {
-       struct qedr_ah *ah;
-
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
+       struct qedr_ah *ah = get_qedr_ah(ibah);
 
        rdma_copy_ah_attr(&ah->attr, attr);
 
-       return &ah->ibah;
+       return 0;
 }
 
-int qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
+void qedr_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
        struct qedr_ah *ah = get_qedr_ah(ibah);
 
        rdma_destroy_ah_attr(&ah->attr);
-       kfree(ah);
-       return 0;
 }
 
 static void free_mr_info(struct qedr_dev *dev, struct mr_info *info)
@@ -2734,7 +2697,7 @@ err0:
        return ERR_PTR(rc);
 }
 
-int qedr_dereg_mr(struct ib_mr *ib_mr)
+int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata)
 {
        struct qedr_mr *mr = get_qedr_mr(ib_mr);
        struct qedr_dev *dev = get_qedr_dev(ib_mr->device);
@@ -2826,8 +2789,8 @@ err0:
        return ERR_PTR(rc);
 }
 
-struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd,
-                           enum ib_mr_type mr_type, u32 max_num_sg)
+struct ib_mr *qedr_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+                           u32 max_num_sg, struct ib_udata *udata)
 {
        struct qedr_mr *mr;
 
index f0c05f4..9328c80 100644 (file)
@@ -47,16 +47,14 @@ int qedr_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void qedr_dealloc_ucontext(struct ib_ucontext *uctx);
 
 int qedr_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
-int qedr_alloc_pd(struct ib_pd *pd, struct ib_ucontext *uctx,
-                 struct ib_udata *udata);
-void qedr_dealloc_pd(struct ib_pd *pd);
+int qedr_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void qedr_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 
 struct ib_cq *qedr_create_cq(struct ib_device *ibdev,
                             const struct ib_cq_init_attr *attr,
-                            struct ib_ucontext *ib_ctx,
                             struct ib_udata *udata);
 int qedr_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
-int qedr_destroy_cq(struct ib_cq *);
+int qedr_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int qedr_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
 struct ib_qp *qedr_create_qp(struct ib_pd *, struct ib_qp_init_attr *attrs,
                             struct ib_udata *);
@@ -64,22 +62,21 @@ int qedr_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
                   int attr_mask, struct ib_udata *udata);
 int qedr_query_qp(struct ib_qp *, struct ib_qp_attr *qp_attr,
                  int qp_attr_mask, struct ib_qp_init_attr *);
-int qedr_destroy_qp(struct ib_qp *ibqp);
+int qedr_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 
-struct ib_srq *qedr_create_srq(struct ib_pd *ibpd,
-                              struct ib_srq_init_attr *attr,
-                              struct ib_udata *udata);
+int qedr_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *attr,
+                   struct ib_udata *udata);
 int qedr_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                    enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int qedr_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-int qedr_destroy_srq(struct ib_srq *ibsrq);
+void qedr_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 int qedr_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
                       const struct ib_recv_wr **bad_recv_wr);
-struct ib_ah *qedr_create_ah(struct ib_pd *ibpd, struct rdma_ah_attr *attr,
-                            u32 flags, struct ib_udata *udata);
-int qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
+int qedr_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr, u32 flags,
+                  struct ib_udata *udata);
+void qedr_destroy_ah(struct ib_ah *ibah, u32 flags);
 
-int qedr_dereg_mr(struct ib_mr *);
+int qedr_dereg_mr(struct ib_mr *ib_mr, struct ib_udata *udata);
 struct ib_mr *qedr_get_dma_mr(struct ib_pd *, int acc);
 
 struct ib_mr *qedr_reg_user_mr(struct ib_pd *, u64 start, u64 length,
@@ -89,7 +86,7 @@ int qedr_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                   int sg_nents, unsigned int *sg_offset);
 
 struct ib_mr *qedr_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                           u32 max_num_sg);
+                           u32 max_num_sg, struct ib_udata *udata);
 int qedr_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
 int qedr_post_send(struct ib_qp *, const struct ib_send_wr *,
                   const struct ib_send_wr **bad_wr);
index 83d2349..432d6d0 100644 (file)
@@ -52,6 +52,7 @@
 #include <linux/kref.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
+#include <linux/xarray.h>
 #include <rdma/ib_hdrs.h>
 #include <rdma/rdma_vt.h>
 
@@ -1105,8 +1106,7 @@ struct qib_filedata {
        int rec_cpu_num; /* for cpu affinity; -1 if none */
 };
 
-extern struct list_head qib_dev_list;
-extern spinlock_t qib_devs_lock;
+extern struct xarray qib_dev_table;
 extern struct qib_devdata *qib_lookup(int unit);
 extern u32 qib_cpulist_count;
 extern unsigned long *qib_cpulist;
index a4a1f56..f91f23e 100644 (file)
@@ -57,7 +57,7 @@
  * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in
  * fastpath code
  * QIB_TRACE_REGWRITES define as 1 if you want register writes to be
- * traced in faspath code
+ * traced in fastpath code
  * _QIB_TRACING define as 0 if you want to remove all tracing in a
  * compilation unit
  */
index 3117cc5..92eeea5 100644 (file)
@@ -49,8 +49,6 @@
  */
 const char ib_qib_version[] = QIB_DRIVER_VERSION "\n";
 
-DEFINE_SPINLOCK(qib_devs_lock);
-LIST_HEAD(qib_dev_list);
 DEFINE_MUTEX(qib_mutex);       /* general driver use */
 
 unsigned qib_ibmtu;
@@ -96,11 +94,11 @@ int qib_count_active_units(void)
 {
        struct qib_devdata *dd;
        struct qib_pportdata *ppd;
-       unsigned long flags;
+       unsigned long index, flags;
        int pidx, nunits_active = 0;
 
-       spin_lock_irqsave(&qib_devs_lock, flags);
-       list_for_each_entry(dd, &qib_dev_list, list) {
+       xa_lock_irqsave(&qib_dev_table, flags);
+       xa_for_each(&qib_dev_table, index, dd) {
                if (!(dd->flags & QIB_PRESENT) || !dd->kregbase)
                        continue;
                for (pidx = 0; pidx < dd->num_pports; ++pidx) {
@@ -112,7 +110,7 @@ int qib_count_active_units(void)
                        }
                }
        }
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
+       xa_unlock_irqrestore(&qib_dev_table, flags);
        return nunits_active;
 }
 
@@ -125,13 +123,12 @@ int qib_count_units(int *npresentp, int *nupp)
 {
        int nunits = 0, npresent = 0, nup = 0;
        struct qib_devdata *dd;
-       unsigned long flags;
+       unsigned long index, flags;
        int pidx;
        struct qib_pportdata *ppd;
 
-       spin_lock_irqsave(&qib_devs_lock, flags);
-
-       list_for_each_entry(dd, &qib_dev_list, list) {
+       xa_lock_irqsave(&qib_dev_table, flags);
+       xa_for_each(&qib_dev_table, index, dd) {
                nunits++;
                if ((dd->flags & QIB_PRESENT) && dd->kregbase)
                        npresent++;
@@ -142,8 +139,7 @@ int qib_count_units(int *npresentp, int *nupp)
                                nup++;
                }
        }
-
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
+       xa_unlock_irqrestore(&qib_dev_table, flags);
 
        if (npresentp)
                *npresentp = npresent;
index 1d940a2..ceb42d9 100644 (file)
@@ -508,8 +508,8 @@ bail:
  */
 static int qibfs_fill_super(struct super_block *sb, void *data, int silent)
 {
-       struct qib_devdata *dd, *tmp;
-       unsigned long flags;
+       struct qib_devdata *dd;
+       unsigned long index;
        int ret;
 
        static const struct tree_descr files[] = {
@@ -524,18 +524,12 @@ static int qibfs_fill_super(struct super_block *sb, void *data, int silent)
                goto bail;
        }
 
-       spin_lock_irqsave(&qib_devs_lock, flags);
-
-       list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) {
-               spin_unlock_irqrestore(&qib_devs_lock, flags);
+       xa_for_each(&qib_dev_table, index, dd) {
                ret = add_cntr_files(sb, dd);
                if (ret)
                        goto bail;
-               spin_lock_irqsave(&qib_devs_lock, flags);
        }
 
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
-
 bail:
        return ret;
 }
index ac6a84f..dd48433 100644 (file)
@@ -6137,7 +6137,7 @@ static void set_no_qsfp_atten(struct qib_devdata *dd, int change)
 static int setup_txselect(const char *str, const struct kernel_param *kp)
 {
        struct qib_devdata *dd;
-       unsigned long val;
+       unsigned long index, val;
        char *n;
 
        if (strlen(str) >= ARRAY_SIZE(txselect_list)) {
@@ -6153,7 +6153,7 @@ static int setup_txselect(const char *str, const struct kernel_param *kp)
        }
        strncpy(txselect_list, str, ARRAY_SIZE(txselect_list) - 1);
 
-       list_for_each_entry(dd, &qib_dev_list, list)
+       xa_for_each(&qib_dev_table, index, dd)
                if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
                        set_no_qsfp_atten(dd, 1);
        return 0;
index 9fd6990..d4fd8a6 100644 (file)
@@ -36,7 +36,6 @@
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
 #include <linux/delay.h>
-#include <linux/idr.h>
 #include <linux/module.h>
 #include <linux/printk.h>
 #ifdef CONFIG_INFINIBAND_QIB_DCA
@@ -95,7 +94,7 @@ MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disable
 
 static void verify_interrupt(struct timer_list *);
 
-static struct idr qib_unit_table;
+DEFINE_XARRAY_FLAGS(qib_dev_table, XA_FLAGS_ALLOC | XA_FLAGS_LOCK_IRQ);
 u32 qib_cpulist_count;
 unsigned long *qib_cpulist;
 
@@ -785,21 +784,9 @@ void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd)
 {
 }
 
-static inline struct qib_devdata *__qib_lookup(int unit)
-{
-       return idr_find(&qib_unit_table, unit);
-}
-
 struct qib_devdata *qib_lookup(int unit)
 {
-       struct qib_devdata *dd;
-       unsigned long flags;
-
-       spin_lock_irqsave(&qib_devs_lock, flags);
-       dd = __qib_lookup(unit);
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
-
-       return dd;
+       return xa_load(&qib_dev_table, unit);
 }
 
 /*
@@ -1046,10 +1033,9 @@ void qib_free_devdata(struct qib_devdata *dd)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&qib_devs_lock, flags);
-       idr_remove(&qib_unit_table, dd->unit);
-       list_del(&dd->list);
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
+       xa_lock_irqsave(&qib_dev_table, flags);
+       __xa_erase(&qib_dev_table, dd->unit);
+       xa_unlock_irqrestore(&qib_dev_table, flags);
 
 #ifdef CONFIG_DEBUG_FS
        qib_dbg_ibdev_exit(&dd->verbs_dev);
@@ -1070,15 +1056,15 @@ u64 qib_int_counter(struct qib_devdata *dd)
 
 u64 qib_sps_ints(void)
 {
-       unsigned long flags;
+       unsigned long index, flags;
        struct qib_devdata *dd;
        u64 sps_ints = 0;
 
-       spin_lock_irqsave(&qib_devs_lock, flags);
-       list_for_each_entry(dd, &qib_dev_list, list) {
+       xa_lock_irqsave(&qib_dev_table, flags);
+       xa_for_each(&qib_dev_table, index, dd) {
                sps_ints += qib_int_counter(dd);
        }
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
+       xa_unlock_irqrestore(&qib_dev_table, flags);
        return sps_ints;
 }
 
@@ -1087,12 +1073,9 @@ u64 qib_sps_ints(void)
  * allocator, because the verbs cleanup process both does cleanup and
  * free of the data structure.
  * "extra" is for chip-specific data.
- *
- * Use the idr mechanism to get a unit number for this unit.
  */
 struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
 {
-       unsigned long flags;
        struct qib_devdata *dd;
        int ret, nports;
 
@@ -1103,20 +1086,8 @@ struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
        if (!dd)
                return ERR_PTR(-ENOMEM);
 
-       INIT_LIST_HEAD(&dd->list);
-
-       idr_preload(GFP_KERNEL);
-       spin_lock_irqsave(&qib_devs_lock, flags);
-
-       ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT);
-       if (ret >= 0) {
-               dd->unit = ret;
-               list_add(&dd->list, &qib_dev_list);
-       }
-
-       spin_unlock_irqrestore(&qib_devs_lock, flags);
-       idr_preload_end();
-
+       ret = xa_alloc_irq(&qib_dev_table, &dd->unit, dd, xa_limit_32b,
+                       GFP_KERNEL);
        if (ret < 0) {
                qib_early_err(&pdev->dev,
                              "Could not allocate unit ID: error %d\n", -ret);
@@ -1255,8 +1226,6 @@ static int __init qib_ib_init(void)
         * These must be called before the driver is registered with
         * the PCI subsystem.
         */
-       idr_init(&qib_unit_table);
-
 #ifdef CONFIG_INFINIBAND_QIB_DCA
        dca_register_notify(&dca_notifier);
 #endif
@@ -1281,7 +1250,6 @@ bail_dev:
 #ifdef CONFIG_DEBUG_FS
        qib_dbg_exit();
 #endif
-       idr_destroy(&qib_unit_table);
        qib_dev_cleanup();
 bail:
        return ret;
@@ -1313,7 +1281,7 @@ static void __exit qib_ib_cleanup(void)
        qib_cpulist_count = 0;
        kfree(qib_cpulist);
 
-       idr_destroy(&qib_unit_table);
+       WARN_ON(!xa_empty(&qib_dev_table));
        qib_dev_cleanup();
 }
 
index 50dd981..2ac4c67 100644 (file)
@@ -933,7 +933,7 @@ void qib_rc_send_complete(struct rvt_qp *qp, struct ib_header *hdr)
                qp->s_last = s_last;
                /* see post_send() */
                barrier();
-               rvt_put_swqe(wqe);
+               rvt_put_qp_swqe(qp, wqe);
                rvt_qp_swqe_complete(qp,
                                     wqe,
                                     ib_qib_wc_opcode[wqe->wr.opcode],
@@ -975,7 +975,7 @@ static struct rvt_swqe *do_rc_completion(struct rvt_qp *qp,
            qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
                u32 s_last;
 
-               rvt_put_swqe(wqe);
+               rvt_put_qp_swqe(qp, wqe);
                s_last = qp->s_last;
                if (++s_last >= qp->s_size)
                        s_last = 0;
index 31c523b..ef19d39 100644 (file)
@@ -225,8 +225,6 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
        if (sdma_rb_node) {
                sdma_rb_node->refcount++;
        } else {
-               int ret;
-
                sdma_rb_node = kmalloc(sizeof(
                        struct qib_user_sdma_rb_node), GFP_KERNEL);
                if (!sdma_rb_node)
@@ -235,8 +233,7 @@ qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
                sdma_rb_node->refcount = 1;
                sdma_rb_node->pid = current->pid;
 
-               ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root,
-                                       sdma_rb_node);
+               qib_user_sdma_rb_insert(&qib_user_sdma_rb_root, sdma_rb_node);
        }
        pq->sdma_rb_node = sdma_rb_node;
 
index a4426c2..17bdf8a 100644 (file)
@@ -46,7 +46,7 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/ib_hdrs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 #include <rdma/rdmavt_cq.h>
 
 struct qib_ctxtdata;
index bd4521b..e935275 100644 (file)
@@ -447,8 +447,7 @@ int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
        return 0;
 }
 
-int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                     struct ib_udata *udata)
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct usnic_ib_pd *pd = to_upd(ibpd);
        void *umem_pd;
@@ -461,7 +460,7 @@ int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        return 0;
 }
 
-void usnic_ib_dealloc_pd(struct ib_pd *pd)
+void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
 }
@@ -539,7 +538,7 @@ out_release_mutex:
        return ERR_PTR(err);
 }
 
-int usnic_ib_destroy_qp(struct ib_qp *qp)
+int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
        struct usnic_ib_qp_grp *qp_grp;
        struct usnic_ib_vf *vf;
@@ -590,7 +589,6 @@ out_unlock:
 
 struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
                                 const struct ib_cq_init_attr *attr,
-                                struct ib_ucontext *context,
                                 struct ib_udata *udata)
 {
        struct ib_cq *cq;
@@ -606,7 +604,7 @@ struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
        return cq;
 }
 
-int usnic_ib_destroy_cq(struct ib_cq *cq)
+int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        usnic_dbg("\n");
        kfree(cq);
@@ -642,13 +640,13 @@ err_free:
        return ERR_PTR(err);
 }
 
-int usnic_ib_dereg_mr(struct ib_mr *ibmr)
+int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct usnic_ib_mr *mr = to_umr(ibmr);
 
        usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length);
 
-       usnic_uiom_reg_release(mr->umem, ibmr->uobject->context);
+       usnic_uiom_reg_release(mr->umem);
        kfree(mr);
        return 0;
 }
@@ -731,4 +729,3 @@ int usnic_ib_mmap(struct ib_ucontext *context,
        return -EINVAL;
 }
 
-/* End of ib callbacks section */
index c40e89b..028f322 100644 (file)
@@ -50,24 +50,22 @@ int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
                                union ib_gid *gid);
 int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
                                u16 *pkey);
-int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                     struct ib_udata *udata);
-void usnic_ib_dealloc_pd(struct ib_pd *pd);
+int usnic_ib_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
+void usnic_ib_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata);
 struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
                                        struct ib_qp_init_attr *init_attr,
                                        struct ib_udata *udata);
-int usnic_ib_destroy_qp(struct ib_qp *qp);
+int usnic_ib_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                                int attr_mask, struct ib_udata *udata);
 struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev,
                                 const struct ib_cq_init_attr *attr,
-                                struct ib_ucontext *context,
                                 struct ib_udata *udata);
-int usnic_ib_destroy_cq(struct ib_cq *cq);
+int usnic_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
                                u64 virt_addr, int access_flags,
                                struct ib_udata *udata);
-int usnic_ib_dereg_mr(struct ib_mr *ibmr);
+int usnic_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
 int usnic_ib_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
 int usnic_ib_mmap(struct ib_ucontext *context,
index 06862a6..da35d6f 100644 (file)
@@ -432,8 +432,7 @@ static inline size_t usnic_uiom_num_pages(struct usnic_uiom_reg *uiomr)
        return PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
 }
 
-void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
-                           struct ib_ucontext *context)
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr)
 {
        __usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
 
index c88cfa0..70be49b 100644 (file)
@@ -90,7 +90,6 @@ void usnic_uiom_free_dev_list(struct device **devs);
 struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
                                                unsigned long addr, size_t size,
                                                int access, int dmasync);
-void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr,
-                           struct ib_ucontext *ucontext);
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr);
 int usnic_uiom_init(char *drv_name);
 #endif /* USNIC_UIOM_H_ */
index 104c7db..d7deb19 100644 (file)
@@ -49,6 +49,7 @@
 #include <rdma/ib_addr.h>
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "pvrdma.h"
 
@@ -93,7 +94,6 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq,
  * pvrdma_create_cq - create completion queue
  * @ibdev: the device
  * @attr: completion queue attributes
- * @context: user context
  * @udata: user data
  *
  * @return: ib_cq completion queue pointer on success,
@@ -101,7 +101,6 @@ int pvrdma_req_notify_cq(struct ib_cq *ibcq,
  */
 struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
                               const struct ib_cq_init_attr *attr,
-                              struct ib_ucontext *context,
                               struct ib_udata *udata)
 {
        int entries = attr->cqe;
@@ -116,6 +115,8 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
        struct pvrdma_cmd_create_cq_resp *resp = &rsp.create_cq_resp;
        struct pvrdma_create_cq_resp cq_resp = {0};
        struct pvrdma_create_cq ucmd;
+       struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct pvrdma_ucontext, ibucontext);
 
        BUILD_BUG_ON(sizeof(struct pvrdma_cqe) != 64);
 
@@ -133,7 +134,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
        }
 
        cq->ibcq.cqe = entries;
-       cq->is_kernel = !context;
+       cq->is_kernel = !udata;
 
        if (!cq->is_kernel) {
                if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
@@ -185,8 +186,7 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
        memset(cmd, 0, sizeof(*cmd));
        cmd->hdr.cmd = PVRDMA_CMD_CREATE_CQ;
        cmd->nchunks = npages;
-       cmd->ctx_handle = (context) ?
-               (u64)to_vucontext(context)->ctx_handle : 0;
+       cmd->ctx_handle = context ? context->ctx_handle : 0;
        cmd->cqe = entries;
        cmd->pdir_dma = cq->pdir.dir_dma;
        ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_CQ_RESP);
@@ -204,13 +204,13 @@ struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
        spin_unlock_irqrestore(&dev->cq_tbl_lock, flags);
 
        if (!cq->is_kernel) {
-               cq->uar = &(to_vucontext(context)->uar);
+               cq->uar = &context->uar;
 
                /* Copy udata back. */
                if (ib_copy_to_udata(udata, &cq_resp, sizeof(cq_resp))) {
                        dev_warn(&dev->pdev->dev,
                                 "failed to copy back udata\n");
-                       pvrdma_destroy_cq(&cq->ibcq);
+                       pvrdma_destroy_cq(&cq->ibcq, udata);
                        return ERR_PTR(-EINVAL);
                }
        }
@@ -245,10 +245,11 @@ static void pvrdma_free_cq(struct pvrdma_dev *dev, struct pvrdma_cq *cq)
 /**
  * pvrdma_destroy_cq - destroy completion queue
  * @cq: the completion queue to destroy.
+ * @udata: user data or null for kernel object
  *
  * @return: 0 for success.
  */
-int pvrdma_destroy_cq(struct ib_cq *cq)
+int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
 {
        struct pvrdma_cq *vcq = to_vcq(cq);
        union pvrdma_cmd_req req;
index ec41400..4018229 100644 (file)
@@ -143,24 +143,6 @@ static int pvrdma_port_immutable(struct ib_device *ibdev, u8 port_num,
        return 0;
 }
 
-static struct net_device *pvrdma_get_netdev(struct ib_device *ibdev,
-                                           u8 port_num)
-{
-       struct net_device *netdev;
-       struct pvrdma_dev *dev = to_vdev(ibdev);
-
-       if (port_num != 1)
-               return NULL;
-
-       rcu_read_lock();
-       netdev = dev->netdev;
-       if (netdev)
-               dev_hold(netdev);
-       rcu_read_unlock();
-
-       return netdev;
-}
-
 static const struct ib_device_ops pvrdma_dev_ops = {
        .add_gid = pvrdma_add_gid,
        .alloc_mr = pvrdma_alloc_mr,
@@ -179,7 +161,6 @@ static const struct ib_device_ops pvrdma_dev_ops = {
        .get_dev_fw_str = pvrdma_get_fw_ver_str,
        .get_dma_mr = pvrdma_get_dma_mr,
        .get_link_layer = pvrdma_port_link_layer,
-       .get_netdev = pvrdma_get_netdev,
        .get_port_immutable = pvrdma_port_immutable,
        .map_mr_sg = pvrdma_map_mr_sg,
        .mmap = pvrdma_mmap,
@@ -195,6 +176,8 @@ static const struct ib_device_ops pvrdma_dev_ops = {
        .query_qp = pvrdma_query_qp,
        .reg_user_mr = pvrdma_reg_user_mr,
        .req_notify_cq = pvrdma_req_notify_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, pvrdma_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, pvrdma_pd, ibpd),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, pvrdma_ucontext, ibucontext),
 };
@@ -204,6 +187,8 @@ static const struct ib_device_ops pvrdma_dev_srq_ops = {
        .destroy_srq = pvrdma_destroy_srq,
        .modify_srq = pvrdma_modify_srq,
        .query_srq = pvrdma_query_srq,
+
+       INIT_RDMA_OBJ_SIZE(ib_srq, pvrdma_srq, ibsrq),
 };
 
 static int pvrdma_register_device(struct pvrdma_dev *dev)
@@ -277,6 +262,9 @@ static int pvrdma_register_device(struct pvrdma_dev *dev)
                        goto err_qp_free;
        }
        dev->ib_dev.driver_id = RDMA_DRIVER_VMW_PVRDMA;
+       ret = ib_device_set_netdev(&dev->ib_dev, dev->netdev, 1);
+       if (ret)
+               return ret;
        spin_lock_init(&dev->srq_tbl_lock);
        rdma_set_device_sysfs_group(&dev->ib_dev, &pvrdma_attr_group);
 
@@ -720,6 +708,7 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev,
                        pvrdma_dispatch_event(dev, 1, IB_EVENT_PORT_ACTIVE);
                break;
        case NETDEV_UNREGISTER:
+               ib_device_set_netdev(&dev->ib_dev, NULL, 1);
                dev_put(dev->netdev);
                dev->netdev = NULL;
                break;
@@ -731,6 +720,7 @@ static void pvrdma_netdevice_event_handle(struct pvrdma_dev *dev,
                if ((dev->netdev == NULL) &&
                    (pci_get_drvdata(pdev_net) == ndev)) {
                        /* this is our netdev */
+                       ib_device_set_netdev(&dev->ib_dev, ndev, 1);
                        dev->netdev = ndev;
                        dev_hold(ndev);
                }
index a85884e..65dc47f 100644 (file)
@@ -119,7 +119,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        union pvrdma_cmd_resp rsp;
        struct pvrdma_cmd_create_mr *cmd = &req.create_mr;
        struct pvrdma_cmd_create_mr_resp *resp = &rsp.create_mr_resp;
-       int ret;
+       int ret, npages;
 
        if (length == 0 || length > dev->dsr->caps.max_mr_size) {
                dev_warn(&dev->pdev->dev, "invalid mem region length\n");
@@ -133,9 +133,10 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                return ERR_CAST(umem);
        }
 
-       if (umem->npages < 0 || umem->npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
+       npages = ib_umem_num_pages(umem);
+       if (npages < 0 || npages > PVRDMA_PAGE_DIR_MAX_PAGES) {
                dev_warn(&dev->pdev->dev, "overflow %d pages in mem region\n",
-                        umem->npages);
+                        npages);
                ret = -EINVAL;
                goto err_umem;
        }
@@ -150,7 +151,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        mr->mmr.size = length;
        mr->umem = umem;
 
-       ret = pvrdma_page_dir_init(dev, &mr->pdir, umem->npages, false);
+       ret = pvrdma_page_dir_init(dev, &mr->pdir, npages, false);
        if (ret) {
                dev_warn(&dev->pdev->dev,
                         "could not allocate page directory\n");
@@ -167,7 +168,7 @@ struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        cmd->length = length;
        cmd->pd_handle = to_vpd(pd)->pd_handle;
        cmd->access_flags = access_flags;
-       cmd->nchunks = umem->npages;
+       cmd->nchunks = npages;
        cmd->pdir_dma = mr->pdir.dir_dma;
 
        ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_MR_RESP);
@@ -201,7 +202,7 @@ err_umem:
  * @return: ib_mr pointer on success, otherwise returns an errno.
  */
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                             u32 max_num_sg)
+                             u32 max_num_sg, struct ib_udata *udata)
 {
        struct pvrdma_dev *dev = to_vdev(pd->device);
        struct pvrdma_user_mr *mr;
@@ -272,7 +273,7 @@ freemr:
  *
  * @return: 0 on success.
  */
-int pvrdma_dereg_mr(struct ib_mr *ibmr)
+int pvrdma_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct pvrdma_user_mr *mr = to_vmr(ibmr);
        struct pvrdma_dev *dev = to_vdev(ibmr->device);
index 08f4257..0eaaead 100644 (file)
@@ -446,10 +446,11 @@ static void pvrdma_free_qp(struct pvrdma_qp *qp)
 /**
  * pvrdma_destroy_qp - destroy a queue pair
  * @qp: the queue pair to destroy
+ * @udata: user data or null for kernel object
  *
  * @return: 0 on success.
  */
-int pvrdma_destroy_qp(struct ib_qp *qp)
+int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata)
 {
        struct pvrdma_qp *vqp = to_vqp(qp);
        union pvrdma_cmd_req req;
index 951d9d6..6cac0c8 100644 (file)
@@ -94,19 +94,18 @@ int pvrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
  * @init_attr: shared receive queue attributes
  * @udata: user data
  *
- * @return: the ib_srq pointer on success, otherwise returns an errno.
+ * @return: 0 on success, otherwise returns an errno.
  */
-struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
-                                struct ib_srq_init_attr *init_attr,
-                                struct ib_udata *udata)
+int pvrdma_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init_attr,
+                     struct ib_udata *udata)
 {
-       struct pvrdma_srq *srq = NULL;
-       struct pvrdma_dev *dev = to_vdev(pd->device);
+       struct pvrdma_srq *srq = to_vsrq(ibsrq);
+       struct pvrdma_dev *dev = to_vdev(ibsrq->device);
        union pvrdma_cmd_req req;
        union pvrdma_cmd_resp rsp;
        struct pvrdma_cmd_create_srq *cmd = &req.create_srq;
        struct pvrdma_cmd_create_srq_resp *resp = &rsp.create_srq_resp;
-       struct pvrdma_create_srq_resp srq_resp = {0};
+       struct pvrdma_create_srq_resp srq_resp = {};
        struct pvrdma_create_srq ucmd;
        unsigned long flags;
        int ret;
@@ -115,31 +114,25 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
                /* No support for kernel clients. */
                dev_warn(&dev->pdev->dev,
                         "no shared receive queue support for kernel client\n");
-               return ERR_PTR(-EOPNOTSUPP);
+               return -EOPNOTSUPP;
        }
 
        if (init_attr->srq_type != IB_SRQT_BASIC) {
                dev_warn(&dev->pdev->dev,
                         "shared receive queue type %d not supported\n",
                         init_attr->srq_type);
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
        if (init_attr->attr.max_wr  > dev->dsr->caps.max_srq_wr ||
            init_attr->attr.max_sge > dev->dsr->caps.max_srq_sge) {
                dev_warn(&dev->pdev->dev,
                         "shared receive queue size invalid\n");
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
        }
 
        if (!atomic_add_unless(&dev->num_srqs, 1, dev->dsr->caps.max_srq))
-               return ERR_PTR(-ENOMEM);
-
-       srq = kmalloc(sizeof(*srq), GFP_KERNEL);
-       if (!srq) {
-               ret = -ENOMEM;
-               goto err_srq;
-       }
+               return -ENOMEM;
 
        spin_lock_init(&srq->lock);
        refcount_set(&srq->refcnt, 1);
@@ -181,7 +174,7 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
        cmd->hdr.cmd = PVRDMA_CMD_CREATE_SRQ;
        cmd->srq_type = init_attr->srq_type;
        cmd->nchunks = srq->npages;
-       cmd->pd_handle = to_vpd(pd)->pd_handle;
+       cmd->pd_handle = to_vpd(ibsrq->pd)->pd_handle;
        cmd->attrs.max_wr = init_attr->attr.max_wr;
        cmd->attrs.max_sge = init_attr->attr.max_sge;
        cmd->attrs.srq_limit = init_attr->attr.srq_limit;
@@ -204,21 +197,20 @@ struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
        /* Copy udata back. */
        if (ib_copy_to_udata(udata, &srq_resp, sizeof(srq_resp))) {
                dev_warn(&dev->pdev->dev, "failed to copy back udata\n");
-               pvrdma_destroy_srq(&srq->ibsrq);
-               return ERR_PTR(-EINVAL);
+               pvrdma_destroy_srq(&srq->ibsrq, udata);
+               return -EINVAL;
        }
 
-       return &srq->ibsrq;
+       return 0;
 
 err_page_dir:
        pvrdma_page_dir_cleanup(dev, &srq->pdir);
 err_umem:
        ib_umem_release(srq->umem);
 err_srq:
-       kfree(srq);
        atomic_dec(&dev->num_srqs);
 
-       return ERR_PTR(ret);
+       return ret;
 }
 
 static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
@@ -246,10 +238,11 @@ static void pvrdma_free_srq(struct pvrdma_dev *dev, struct pvrdma_srq *srq)
 /**
  * pvrdma_destroy_srq - destroy shared receive queue
  * @srq: the shared receive queue to destroy
+ * @udata: user data or null for kernel object
  *
  * @return: 0 for success.
  */
-int pvrdma_destroy_srq(struct ib_srq *srq)
+void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata)
 {
        struct pvrdma_srq *vsrq = to_vsrq(srq);
        union pvrdma_cmd_req req;
@@ -268,8 +261,6 @@ int pvrdma_destroy_srq(struct ib_srq *srq)
                         ret);
 
        pvrdma_free_srq(dev, vsrq);
-
-       return 0;
 }
 
 /**
index 42fe821..faf7ecd 100644 (file)
@@ -50,6 +50,7 @@
 #include <rdma/ib_smi.h>
 #include <rdma/ib_user_verbs.h>
 #include <rdma/vmw_pvrdma-abi.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "pvrdma.h"
 
@@ -70,8 +71,6 @@ int pvrdma_query_device(struct ib_device *ibdev,
        if (uhw->inlen || uhw->outlen)
                return -EINVAL;
 
-       memset(props, 0, sizeof(*props));
-
        props->fw_ver = dev->dsr->caps.fw_ver;
        props->sys_image_guid = dev->dsr->caps.sys_image_guid;
        props->max_mr_size = dev->dsr->caps.max_mr_size;
@@ -421,13 +420,11 @@ int pvrdma_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
 /**
  * pvrdma_alloc_pd - allocate protection domain
  * @ibpd: PD pointer
- * @context: user context
  * @udata: user data
  *
  * @return: the ib_pd protection domain pointer on success, otherwise errno.
  */
-int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                   struct ib_udata *udata)
+int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct pvrdma_pd *pd = to_vpd(ibpd);
@@ -438,13 +435,15 @@ int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        struct pvrdma_cmd_create_pd_resp *resp = &rsp.create_pd_resp;
        struct pvrdma_alloc_pd_resp pd_resp = {0};
        int ret;
+       struct pvrdma_ucontext *context = rdma_udata_to_drv_context(
+               udata, struct pvrdma_ucontext, ibucontext);
 
        /* Check allowed max pds */
        if (!atomic_add_unless(&dev->num_pds, 1, dev->dsr->caps.max_pd))
                return -ENOMEM;
 
        cmd->hdr.cmd = PVRDMA_CMD_CREATE_PD;
-       cmd->ctx_handle = (context) ? to_vucontext(context)->ctx_handle : 0;
+       cmd->ctx_handle = context ? context->ctx_handle : 0;
        ret = pvrdma_cmd_post(dev, &req, &rsp, PVRDMA_CMD_CREATE_PD_RESP);
        if (ret < 0) {
                dev_warn(&dev->pdev->dev,
@@ -453,16 +452,16 @@ int pvrdma_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
                goto err;
        }
 
-       pd->privileged = !context;
+       pd->privileged = !udata;
        pd->pd_handle = resp->pd_handle;
        pd->pdn = resp->pd_handle;
        pd_resp.pdn = resp->pd_handle;
 
-       if (context) {
+       if (udata) {
                if (ib_copy_to_udata(udata, &pd_resp, sizeof(pd_resp))) {
                        dev_warn(&dev->pdev->dev,
                                 "failed to copy back protection domain\n");
-                       pvrdma_dealloc_pd(&pd->ibpd);
+                       pvrdma_dealloc_pd(&pd->ibpd, udata);
                        return -EFAULT;
                }
        }
@@ -478,10 +477,11 @@ err:
 /**
  * pvrdma_dealloc_pd - deallocate protection domain
  * @pd: the protection domain to be released
+ * @udata: user data or null for kernel object
  *
  * @return: none (the function returns void).
  */
-void pvrdma_dealloc_pd(struct ib_pd *pd)
+void pvrdma_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
 {
        struct pvrdma_dev *dev = to_vdev(pd->device);
        union pvrdma_cmd_req req = {};
@@ -507,34 +507,28 @@ void pvrdma_dealloc_pd(struct ib_pd *pd)
  * @udata: user data blob
  * @flags: create address handle flags (see enum rdma_create_ah_flags)
  *
- * @return: the ib_ah pointer on success, otherwise errno.
+ * @return: 0 on success, otherwise errno.
  */
-struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                              u32 flags, struct ib_udata *udata)
+int pvrdma_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+                    u32 flags, struct ib_udata *udata)
 {
-       struct pvrdma_dev *dev = to_vdev(pd->device);
-       struct pvrdma_ah *ah;
+       struct pvrdma_dev *dev = to_vdev(ibah->device);
+       struct pvrdma_ah *ah = to_vah(ibah);
        const struct ib_global_route *grh;
        u8 port_num = rdma_ah_get_port_num(ah_attr);
 
        if (!(rdma_ah_get_ah_flags(ah_attr) & IB_AH_GRH))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        grh = rdma_ah_read_grh(ah_attr);
        if ((ah_attr->type != RDMA_AH_ATTR_TYPE_ROCE)  ||
            rdma_is_multicast_addr((struct in6_addr *)grh->dgid.raw))
-               return ERR_PTR(-EINVAL);
+               return -EINVAL;
 
        if (!atomic_add_unless(&dev->num_ahs, 1, dev->dsr->caps.max_ah))
-               return ERR_PTR(-ENOMEM);
-
-       ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah) {
-               atomic_dec(&dev->num_ahs);
-               return ERR_PTR(-ENOMEM);
-       }
+               return -ENOMEM;
 
-       ah->av.port_pd = to_vpd(pd)->pd_handle | (port_num << 24);
+       ah->av.port_pd = to_vpd(ibah->pd)->pd_handle | (port_num << 24);
        ah->av.src_path_bits = rdma_ah_get_path_bits(ah_attr);
        ah->av.src_path_bits |= 0x80;
        ah->av.gid_index = grh->sgid_index;
@@ -544,11 +538,7 @@ struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
        memcpy(ah->av.dgid, grh->dgid.raw, 16);
        memcpy(ah->av.dmac, ah_attr->roce.dmac, ETH_ALEN);
 
-       ah->ibah.device = pd->device;
-       ah->ibah.pd = pd;
-       ah->ibah.uobject = NULL;
-
-       return &ah->ibah;
+       return 0;
 }
 
 /**
@@ -556,14 +546,10 @@ struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
  * @ah: the address handle to be destroyed
  * @flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
  *
- * @return: 0 on success.
  */
-int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
+void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags)
 {
        struct pvrdma_dev *dev = to_vdev(ah->device);
 
-       kfree(to_vah(ah));
        atomic_dec(&dev->num_ahs);
-
-       return 0;
 }
index 607aa13..9d7b021 100644 (file)
@@ -398,36 +398,33 @@ int pvrdma_modify_port(struct ib_device *ibdev, u8 port,
 int pvrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 int pvrdma_alloc_ucontext(struct ib_ucontext *uctx, struct ib_udata *udata);
 void pvrdma_dealloc_ucontext(struct ib_ucontext *context);
-int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                   struct ib_udata *udata);
-void pvrdma_dealloc_pd(struct ib_pd *ibpd);
+int pvrdma_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void pvrdma_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 struct ib_mr *pvrdma_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *pvrdma_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                                 u64 virt_addr, int access_flags,
                                 struct ib_udata *udata);
-int pvrdma_dereg_mr(struct ib_mr *mr);
+int pvrdma_dereg_mr(struct ib_mr *mr, struct ib_udata *udata);
 struct ib_mr *pvrdma_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
-                             u32 max_num_sg);
+                             u32 max_num_sg, struct ib_udata *udata);
 int pvrdma_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                     int sg_nents, unsigned int *sg_offset);
 struct ib_cq *pvrdma_create_cq(struct ib_device *ibdev,
                               const struct ib_cq_init_attr *attr,
-                              struct ib_ucontext *context,
                               struct ib_udata *udata);
-int pvrdma_destroy_cq(struct ib_cq *cq);
+int pvrdma_destroy_cq(struct ib_cq *cq, struct ib_udata *udata);
 int pvrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
 int pvrdma_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
-struct ib_ah *pvrdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr,
-                              u32 flags, struct ib_udata *udata);
-int pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
+int pvrdma_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr, u32 flags,
+                    struct ib_udata *udata);
+void pvrdma_destroy_ah(struct ib_ah *ah, u32 flags);
 
-struct ib_srq *pvrdma_create_srq(struct ib_pd *pd,
-                                struct ib_srq_init_attr *init_attr,
-                                struct ib_udata *udata);
+int pvrdma_create_srq(struct ib_srq *srq, struct ib_srq_init_attr *init_attr,
+                     struct ib_udata *udata);
 int pvrdma_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                      enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
 int pvrdma_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-int pvrdma_destroy_srq(struct ib_srq *srq);
+void pvrdma_destroy_srq(struct ib_srq *srq, struct ib_udata *udata);
 
 struct ib_qp *pvrdma_create_qp(struct ib_pd *pd,
                               struct ib_qp_init_attr *init_attr,
@@ -436,7 +433,7 @@ int pvrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                     int attr_mask, struct ib_udata *udata);
 int pvrdma_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
                    int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-int pvrdma_destroy_qp(struct ib_qp *qp);
+int pvrdma_destroy_qp(struct ib_qp *qp, struct ib_udata *udata);
 int pvrdma_post_send(struct ib_qp *ibqp, const struct ib_send_wr *wr,
                     const struct ib_send_wr **bad_wr);
 int pvrdma_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
index fc10e4e..0e147b3 100644 (file)
@@ -89,36 +89,29 @@ EXPORT_SYMBOL(rvt_check_ah);
 
 /**
  * rvt_create_ah - create an address handle
- * @pd: the protection domain
+ * @ibah: the IB address handle
  * @ah_attr: the attributes of the AH
  * @create_flags: create address handle flags (see enum rdma_create_ah_flags)
  * @udata: pointer to user's input output buffer information.
  *
  * This may be called from interrupt context.
  *
- * Return: newly allocated ah
+ * Return: 0 on success
  */
-struct ib_ah *rvt_create_ah(struct ib_pd *pd,
-                           struct rdma_ah_attr *ah_attr,
-                           u32 create_flags,
-                           struct ib_udata *udata)
+int rvt_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr,
+                 u32 create_flags, struct ib_udata *udata)
 {
-       struct rvt_ah *ah;
-       struct rvt_dev_info *dev = ib_to_rvt(pd->device);
+       struct rvt_ah *ah = ibah_to_rvtah(ibah);
+       struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
        unsigned long flags;
 
-       if (rvt_check_ah(pd->device, ah_attr))
-               return ERR_PTR(-EINVAL);
-
-       ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
+       if (rvt_check_ah(ibah->device, ah_attr))
+               return -EINVAL;
 
        spin_lock_irqsave(&dev->n_ahs_lock, flags);
        if (dev->n_ahs_allocated == dev->dparms.props.max_ah) {
                spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
-               kfree(ah);
-               return ERR_PTR(-ENOMEM);
+               return -ENOMEM;
        }
 
        dev->n_ahs_allocated++;
@@ -129,35 +122,32 @@ struct ib_ah *rvt_create_ah(struct ib_pd *pd,
        atomic_set(&ah->refcount, 0);
 
        if (dev->driver_f.notify_new_ah)
-               dev->driver_f.notify_new_ah(pd->device, ah_attr, ah);
+               dev->driver_f.notify_new_ah(ibah->device, ah_attr, ah);
 
-       return &ah->ibah;
+       return 0;
 }
 
 /**
  * rvt_destory_ah - Destory an address handle
  * @ibah: address handle
  * @destroy_flags: destroy address handle flags (see enum rdma_destroy_ah_flags)
+ * @udata: user data or NULL for kernel object
  *
  * Return: 0 on success
  */
-int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
+void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags)
 {
        struct rvt_dev_info *dev = ib_to_rvt(ibah->device);
        struct rvt_ah *ah = ibah_to_rvtah(ibah);
        unsigned long flags;
 
-       if (atomic_read(&ah->refcount) != 0)
-               return -EBUSY;
+       WARN_ON_ONCE(atomic_read(&ah->refcount));
 
        spin_lock_irqsave(&dev->n_ahs_lock, flags);
        dev->n_ahs_allocated--;
        spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
 
        rdma_destroy_ah_attr(&ah->attr);
-       kfree(ah);
-
-       return 0;
 }
 
 /**
index 72431a6..bbb4d3b 100644 (file)
 
 #include <rdma/rdma_vt.h>
 
-struct ib_ah *rvt_create_ah(struct ib_pd *pd,
-                           struct rdma_ah_attr *ah_attr,
-                           u32 create_flags,
-                           struct ib_udata *udata);
-int rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags);
+int rvt_create_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+                 u32 create_flags, struct ib_udata *udata);
+void rvt_destroy_ah(struct ib_ah *ibah, u32 destroy_flags);
 int rvt_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
 int rvt_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *ah_attr);
 
index 4f1544a..a06e6da 100644 (file)
@@ -168,7 +168,6 @@ static void send_complete(struct work_struct *work)
  * rvt_create_cq - create a completion queue
  * @ibdev: the device this completion queue is attached to
  * @attr: creation attributes
- * @context: unused by the QLogic_IB driver
  * @udata: user data for libibverbs.so
  *
  * Called by ib_create_cq() in the generic verbs code.
@@ -178,7 +177,6 @@ static void send_complete(struct work_struct *work)
  */
 struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
                            const struct ib_cq_init_attr *attr,
-                           struct ib_ucontext *context,
                            struct ib_udata *udata)
 {
        struct rvt_dev_info *rdi = ib_to_rvt(ibdev);
@@ -232,7 +230,7 @@ struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
        if (udata && udata->outlen >= sizeof(__u64)) {
                int err;
 
-               cq->ip = rvt_create_mmap_info(rdi, sz, context, wc);
+               cq->ip = rvt_create_mmap_info(rdi, sz, udata, wc);
                if (!cq->ip) {
                        ret = ERR_PTR(-ENOMEM);
                        goto bail_wc;
@@ -299,12 +297,13 @@ done:
 /**
  * rvt_destroy_cq - destroy a completion queue
  * @ibcq: the completion queue to destroy.
+ * @udata: user data or NULL for kernel object
  *
  * Called by ib_destroy_cq() in the generic verbs code.
  *
  * Return: always 0
  */
-int rvt_destroy_cq(struct ib_cq *ibcq)
+int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct rvt_cq *cq = ibcq_to_rvtcq(ibcq);
        struct rvt_dev_info *rdi = cq->rdi;
index 72184b1..3ad6faf 100644 (file)
@@ -53,9 +53,8 @@
 
 struct ib_cq *rvt_create_cq(struct ib_device *ibdev,
                            const struct ib_cq_init_attr *attr,
-                           struct ib_ucontext *context,
                            struct ib_udata *udata);
-int rvt_destroy_cq(struct ib_cq *ibcq);
+int rvt_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata);
 int rvt_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
 int rvt_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
 int rvt_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
index 6b712ee..652f4a7 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
 #include <asm/pgtable.h>
+#include <rdma/uverbs_ioctl.h>
 #include "mmap.h"
 
 /**
@@ -150,18 +151,19 @@ done:
  * rvt_create_mmap_info - allocate information for hfi1_mmap
  * @rdi: rvt dev struct
  * @size: size in bytes to map
- * @context: user context
+ * @udata: user data (must be valid!)
  * @obj: opaque pointer to a cq, wq etc
  *
  * Return: rvt_mmap struct on success
  */
-struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi,
-                                          u32 size,
-                                          struct ib_ucontext *context,
-                                          void *obj)
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+                                          struct ib_udata *udata, void *obj)
 {
        struct rvt_mmap_info *ip;
 
+       if (!udata)
+               return ERR_PTR(-EINVAL);
+
        ip = kmalloc_node(sizeof(*ip), GFP_KERNEL, rdi->dparms.node);
        if (!ip)
                return ip;
@@ -177,7 +179,9 @@ struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi,
 
        INIT_LIST_HEAD(&ip->pending_mmaps);
        ip->size = size;
-       ip->context = context;
+       ip->context =
+               container_of(udata, struct uverbs_attr_bundle, driver_udata)
+                       ->context;
        ip->obj = obj;
        kref_init(&ip->ref);
 
index fab0e7b..02466c4 100644 (file)
 void rvt_mmap_init(struct rvt_dev_info *rdi);
 void rvt_release_mmap_info(struct kref *ref);
 int rvt_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
-struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi,
-                                          u32 size,
-                                          struct ib_ucontext *context,
-                                          void *obj);
+struct rvt_mmap_info *rvt_create_mmap_info(struct rvt_dev_info *rdi, u32 size,
+                                          struct ib_udata *udata, void *obj);
 void rvt_update_mmap_info(struct rvt_dev_info *rdi, struct rvt_mmap_info *ip,
                          u32 size, void *obj);
 
index 0bb6e39..54f3f9c 100644 (file)
@@ -392,7 +392,7 @@ struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
        if (IS_ERR(umem))
                return (void *)umem;
 
-       n = umem->nmap;
+       n = ib_umem_num_pages(umem);
 
        mr = __rvt_alloc_mr(n, pd);
        if (IS_ERR(mr)) {
@@ -548,7 +548,7 @@ bool rvt_ss_has_lkey(struct rvt_sge_state *ss, u32 lkey)
  *
  * Returns 0 on success.
  */
-int rvt_dereg_mr(struct ib_mr *ibmr)
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct rvt_mr *mr = to_imr(ibmr);
        int ret;
@@ -575,9 +575,8 @@ out:
  *
  * Return: the memory region on success, otherwise return an errno.
  */
-struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
-                          enum ib_mr_type mr_type,
-                          u32 max_num_sg)
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                          u32 max_num_sg, struct ib_udata *udata)
 {
        struct rvt_mr *mr;
 
index 132800e..2c8d075 100644 (file)
@@ -78,10 +78,9 @@ struct ib_mr *rvt_get_dma_mr(struct ib_pd *pd, int acc);
 struct ib_mr *rvt_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
                              u64 virt_addr, int mr_access_flags,
                              struct ib_udata *udata);
-int rvt_dereg_mr(struct ib_mr *ibmr);
-struct ib_mr *rvt_alloc_mr(struct ib_pd *pd,
-                          enum ib_mr_type mr_type,
-                          u32 max_num_sg);
+int rvt_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata);
+struct ib_mr *rvt_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
+                          u32 max_num_sg, struct ib_udata *udata);
 int rvt_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg,
                  int sg_nents, unsigned int *sg_offset);
 struct ib_fmr *rvt_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
index 6033054..a403718 100644 (file)
 /**
  * rvt_alloc_pd - allocate a protection domain
  * @ibpd: PD
- * @context: optional user context
  * @udata: optional user data
  *
  * Allocate and keep track of a PD.
  *
  * Return: 0 on success
  */
-int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                struct ib_udata *udata)
+int rvt_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct ib_device *ibdev = ibpd->device;
        struct rvt_dev_info *dev = ib_to_rvt(ibdev);
@@ -93,10 +91,11 @@ bail:
 /**
  * rvt_dealloc_pd - Free PD
  * @ibpd: Free up PD
+ * @udata: Valid user data or NULL for kernel object
  *
  * Return: always 0
  */
-void rvt_dealloc_pd(struct ib_pd *ibpd)
+void rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
 
index 7a887e4..71ba76d 100644 (file)
@@ -50,8 +50,7 @@
 
 #include <rdma/rdma_vt.h>
 
-int rvt_alloc_pd(struct ib_pd *pd, struct ib_ucontext *context,
-                struct ib_udata *udata);
-void rvt_dealloc_pd(struct ib_pd *ibpd);
+int rvt_alloc_pd(struct ib_pd *pd, struct ib_udata *udata);
+void rvt_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata);
 
 #endif          /* DEF_RDMAVTPD_H */
index a34b9a2..31a2e65 100644 (file)
@@ -623,13 +623,7 @@ static void rvt_clear_mr_refs(struct rvt_qp *qp, int clr_sends)
                while (qp->s_last != qp->s_head) {
                        struct rvt_swqe *wqe = rvt_get_swqe_ptr(qp, qp->s_last);
 
-                       rvt_put_swqe(wqe);
-
-                       if (qp->ibqp.qp_type == IB_QPT_UD ||
-                           qp->ibqp.qp_type == IB_QPT_SMI ||
-                           qp->ibqp.qp_type == IB_QPT_GSI)
-                               atomic_dec(&ibah_to_rvtah(
-                                               wqe->ud_wr.ah)->refcount);
+                       rvt_put_qp_swqe(qp, wqe);
                        if (++qp->s_last >= qp->s_size)
                                qp->s_last = 0;
                        smp_wmb(); /* see qp_set_savail */
@@ -957,8 +951,6 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
        size_t sg_list_sz;
        struct ib_qp *ret = ERR_PTR(-ENOMEM);
        struct rvt_dev_info *rdi = ib_to_rvt(ibpd->device);
-       struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
-               udata, struct rvt_ucontext, ibucontext);
        void *priv = NULL;
        size_t sqsize;
 
@@ -1131,8 +1123,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                } else {
                        u32 s = sizeof(struct rvt_rwq) + qp->r_rq.size * sz;
 
-                       qp->ip = rvt_create_mmap_info(rdi, s,
-                                                     &ucontext->ibucontext,
+                       qp->ip = rvt_create_mmap_info(rdi, s, udata,
                                                      qp->r_rq.wq);
                        if (!qp->ip) {
                                ret = ERR_PTR(-ENOMEM);
@@ -1617,7 +1608,7 @@ inval:
  *
  * Return: 0 on success.
  */
-int rvt_destroy_qp(struct ib_qp *ibqp)
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct rvt_qp *qp = ibqp_to_rvtqp(ibqp);
        struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
@@ -2018,8 +2009,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
         * opportunity to adjust PSN values based on internal checks.
         */
        log_pmtu = qp->log_pmtu;
-       if (qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_RC) {
+       if (qp->allowed_ops == IB_OPCODE_UD) {
                struct rvt_ah *ah = ibah_to_rvtah(wqe->ud_wr.ah);
 
                log_pmtu = ah->log_pmtu;
@@ -2067,8 +2057,7 @@ static int rvt_post_one_wr(struct rvt_qp *qp,
        return 0;
 
 bail_inval_free_ref:
-       if (qp->ibqp.qp_type != IB_QPT_UC &&
-           qp->ibqp.qp_type != IB_QPT_RC)
+       if (qp->allowed_ops == IB_OPCODE_UD)
                atomic_dec(&ibah_to_rvtah(ud_wr(wr)->ah)->refcount);
 bail_inval_free:
        /* release mr holds */
@@ -2691,11 +2680,7 @@ void rvt_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
        qp->s_last = last;
        /* See post_send() */
        barrier();
-       rvt_put_swqe(wqe);
-       if (qp->ibqp.qp_type == IB_QPT_UD ||
-           qp->ibqp.qp_type == IB_QPT_SMI ||
-           qp->ibqp.qp_type == IB_QPT_GSI)
-               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+       rvt_put_qp_swqe(qp, wqe);
 
        rvt_qp_swqe_complete(qp,
                             wqe,
index 6d88397..6db1619 100644 (file)
@@ -48,7 +48,7 @@
  *
  */
 
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 int rvt_driver_qp_init(struct rvt_dev_info *rdi);
 void rvt_qp_exit(struct rvt_dev_info *rdi);
@@ -57,7 +57,7 @@ struct ib_qp *rvt_create_qp(struct ib_pd *ibpd,
                            struct ib_udata *udata);
 int rvt_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                  int attr_mask, struct ib_udata *udata);
-int rvt_destroy_qp(struct ib_qp *ibqp);
+int rvt_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata);
 int rvt_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
                 int attr_mask, struct ib_qp_init_attr *init_attr);
 int rvt_post_recv(struct ib_qp *ibqp, const struct ib_recv_wr *wr,
index 8d71647..09f0cf5 100644 (file)
@@ -45,7 +45,7 @@
  *
  */
 
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 #include <rdma/ib_hdrs.h>
 
 /*
index 895b3fa..8d6b3e7 100644 (file)
@@ -71,31 +71,24 @@ void rvt_driver_srq_init(struct rvt_dev_info *rdi)
  * @srq_init_attr: the attributes of the SRQ
  * @udata: data from libibverbs when creating a user SRQ
  *
- * Return: Allocated srq object
+ * Return: 0 on success
  */
-struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
-                             struct ib_srq_init_attr *srq_init_attr,
-                             struct ib_udata *udata)
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+                  struct ib_udata *udata)
 {
-       struct rvt_dev_info *dev = ib_to_rvt(ibpd->device);
-       struct rvt_ucontext *ucontext = rdma_udata_to_drv_context(
-               udata, struct rvt_ucontext, ibucontext);
-       struct rvt_srq *srq;
+       struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
+       struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
        u32 sz;
-       struct ib_srq *ret;
+       int ret;
 
        if (srq_init_attr->srq_type != IB_SRQT_BASIC)
-               return ERR_PTR(-EOPNOTSUPP);
+               return -EOPNOTSUPP;
 
        if (srq_init_attr->attr.max_sge == 0 ||
            srq_init_attr->attr.max_sge > dev->dparms.props.max_srq_sge ||
            srq_init_attr->attr.max_wr == 0 ||
            srq_init_attr->attr.max_wr > dev->dparms.props.max_srq_wr)
-               return ERR_PTR(-EINVAL);
-
-       srq = kzalloc_node(sizeof(*srq), GFP_KERNEL, dev->dparms.node);
-       if (!srq)
-               return ERR_PTR(-ENOMEM);
+               return -EINVAL;
 
        /*
         * Need to use vmalloc() if we want to support large #s of entries.
@@ -109,7 +102,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
                vzalloc_node(sizeof(struct rvt_rwq) + srq->rq.size * sz,
                             dev->dparms.node);
        if (!srq->rq.wq) {
-               ret = ERR_PTR(-ENOMEM);
+               ret = -ENOMEM;
                goto bail_srq;
        }
 
@@ -118,23 +111,18 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
         * See rvt_mmap() for details.
         */
        if (udata && udata->outlen >= sizeof(__u64)) {
-               int err;
                u32 s = sizeof(struct rvt_rwq) + srq->rq.size * sz;
 
-               srq->ip =
-                   rvt_create_mmap_info(dev, s, &ucontext->ibucontext,
-                                        srq->rq.wq);
+               srq->ip = rvt_create_mmap_info(dev, s, udata, srq->rq.wq);
                if (!srq->ip) {
-                       ret = ERR_PTR(-ENOMEM);
+                       ret = -ENOMEM;
                        goto bail_wq;
                }
 
-               err = ib_copy_to_udata(udata, &srq->ip->offset,
+               ret = ib_copy_to_udata(udata, &srq->ip->offset,
                                       sizeof(srq->ip->offset));
-               if (err) {
-                       ret = ERR_PTR(err);
+               if (ret)
                        goto bail_ip;
-               }
        }
 
        /*
@@ -146,7 +134,7 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
        spin_lock(&dev->n_srqs_lock);
        if (dev->n_srqs_allocated == dev->dparms.props.max_srq) {
                spin_unlock(&dev->n_srqs_lock);
-               ret = ERR_PTR(-ENOMEM);
+               ret = -ENOMEM;
                goto bail_ip;
        }
 
@@ -159,14 +147,13 @@ struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
                spin_unlock_irq(&dev->pending_lock);
        }
 
-       return &srq->ibsrq;
+       return 0;
 
 bail_ip:
        kfree(srq->ip);
 bail_wq:
        vfree(srq->rq.wq);
 bail_srq:
-       kfree(srq);
        return ret;
 }
 
@@ -338,9 +325,8 @@ int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
  * rvt_destroy_srq - destory an srq
  * @ibsrq: srq object to destroy
  *
- * Return always 0
  */
-int rvt_destroy_srq(struct ib_srq *ibsrq)
+void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
        struct rvt_srq *srq = ibsrq_to_rvtsrq(ibsrq);
        struct rvt_dev_info *dev = ib_to_rvt(ibsrq->device);
@@ -352,7 +338,4 @@ int rvt_destroy_srq(struct ib_srq *ibsrq)
                kref_put(&srq->ip->ref, rvt_release_mmap_info);
        else
                vfree(srq->rq.wq);
-       kfree(srq);
-
-       return 0;
 }
index bf0eaaf..6427d7d 100644 (file)
 
 #include <rdma/rdma_vt.h>
 void rvt_driver_srq_init(struct rvt_dev_info *rdi);
-struct ib_srq *rvt_create_srq(struct ib_pd *ibpd,
-                             struct ib_srq_init_attr *srq_init_attr,
-                             struct ib_udata *udata);
+int rvt_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *srq_init_attr,
+                  struct ib_udata *udata);
 int rvt_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
                   enum ib_srq_attr_mask attr_mask,
                   struct ib_udata *udata);
 int rvt_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
-int rvt_destroy_srq(struct ib_srq *ibsrq);
+void rvt_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata);
 
 #endif          /* DEF_RVTSRQ_H */
index efc9d81..c32d21c 100644 (file)
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_qp
index 9952769..c47357a 100644 (file)
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_rc
index d5df352..d963ca7 100644 (file)
@@ -51,7 +51,7 @@
 #include <linux/trace_seq.h>
 
 #include <rdma/ib_verbs.h>
-#include <rdma/rdma_vt.h>
+#include <rdma/rdmavt_qp.h>
 
 #undef TRACE_SYSTEM
 #define TRACE_SYSTEM rvt_tx
index 42c9d35..9546a83 100644 (file)
@@ -425,7 +425,10 @@ static const struct ib_device_ops rvt_dev_ops = {
        .req_notify_cq = rvt_req_notify_cq,
        .resize_cq = rvt_resize_cq,
        .unmap_fmr = rvt_unmap_fmr,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, rvt_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, rvt_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, rvt_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, rvt_ucontext, ibucontext),
 };
 
index a57276f..ad30901 100644 (file)
@@ -82,7 +82,7 @@ static void rxe_send_complete(unsigned long data)
 }
 
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
-                    int comp_vector, struct ib_ucontext *context,
+                    int comp_vector, struct ib_udata *udata,
                     struct rxe_create_cq_resp __user *uresp)
 {
        int err;
@@ -94,7 +94,7 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
                return -ENOMEM;
        }
 
-       err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context,
+       err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata,
                           cq->queue->buf, cq->queue->buf_size, &cq->queue->ip);
        if (err) {
                vfree(cq->queue->buf);
@@ -115,13 +115,13 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
 }
 
 int rxe_cq_resize_queue(struct rxe_cq *cq, int cqe,
-                       struct rxe_resize_cq_resp __user *uresp)
+                       struct rxe_resize_cq_resp __user *uresp,
+                       struct ib_udata *udata)
 {
        int err;
 
        err = rxe_queue_resize(cq->queue, (unsigned int *)&cqe,
-                              sizeof(struct rxe_cqe),
-                              cq->queue->ip ? cq->queue->ip->context : NULL,
+                              sizeof(struct rxe_cqe), udata,
                               uresp ? &uresp->mi : NULL, NULL, &cq->cq_lock);
        if (!err)
                cq->ibcq.cqe = cqe;
index 6cb1840..ce00366 100644 (file)
@@ -643,7 +643,7 @@ struct rxe_atmeth {
        __be32                  rkey;
        __be64                  swap_add;
        __be64                  comp;
-} __attribute__((__packed__));
+} __packed;
 
 static inline u64 __atmeth_va(void *arg)
 {
index 3d8cef8..775c23b 100644 (file)
@@ -53,11 +53,12 @@ int rxe_cq_chk_attr(struct rxe_dev *rxe, struct rxe_cq *cq,
                    int cqe, int comp_vector);
 
 int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe,
-                    int comp_vector, struct ib_ucontext *context,
+                    int comp_vector, struct ib_udata *udata,
                     struct rxe_create_cq_resp __user *uresp);
 
 int rxe_cq_resize_queue(struct rxe_cq *cq, int new_cqe,
-                       struct rxe_resize_cq_resp __user *uresp);
+                       struct rxe_resize_cq_resp __user *uresp,
+                       struct ib_udata *udata);
 
 int rxe_cq_post(struct rxe_cq *cq, struct rxe_cqe *cqe, int solicited);
 
@@ -91,10 +92,8 @@ struct rxe_mmap_info {
 
 void rxe_mmap_release(struct kref *ref);
 
-struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev,
-                                          u32 size,
-                                          struct ib_ucontext *context,
-                                          void *obj);
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *dev, u32 size,
+                                          struct ib_udata *udata, void *obj);
 
 int rxe_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
@@ -224,13 +223,12 @@ int rxe_srq_chk_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
                     struct ib_srq_attr *attr, enum ib_srq_attr_mask mask);
 
 int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
-                     struct ib_srq_init_attr *init,
-                     struct ib_ucontext *context,
+                     struct ib_srq_init_attr *init, struct ib_udata *udata,
                      struct rxe_create_srq_resp __user *uresp);
 
 int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
                      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
-                     struct rxe_modify_srq_cmd *ucmd);
+                     struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata);
 
 void rxe_dealloc(struct ib_device *ib_dev);
 
index d22431e..48f4812 100644 (file)
@@ -36,6 +36,7 @@
 #include <linux/mm.h>
 #include <linux/errno.h>
 #include <asm/pgtable.h>
+#include <rdma/uverbs_ioctl.h>
 
 #include "rxe.h"
 #include "rxe_loc.h"
@@ -140,13 +141,14 @@ done:
 /*
  * Allocate information for rxe_mmap
  */
-struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
-                                          u32 size,
-                                          struct ib_ucontext *context,
-                                          void *obj)
+struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe, u32 size,
+                                          struct ib_udata *udata, void *obj)
 {
        struct rxe_mmap_info *ip;
 
+       if (!udata)
+               return ERR_PTR(-EINVAL);
+
        ip = kmalloc(sizeof(*ip), GFP_KERNEL);
        if (!ip)
                return NULL;
@@ -165,7 +167,9 @@ struct rxe_mmap_info *rxe_create_mmap_info(struct rxe_dev *rxe,
 
        INIT_LIST_HEAD(&ip->pending_mmaps);
        ip->info.size = size;
-       ip->context = context;
+       ip->context =
+               container_of(udata, struct uverbs_attr_bundle, driver_udata)
+                       ->context;
        ip->obj = obj;
        kref_init(&ip->ref);
 
index 42f0f25..f501f72 100644 (file)
@@ -179,7 +179,7 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
        }
 
        mem->umem = umem;
-       num_buf = umem->nmap;
+       num_buf = ib_umem_num_pages(umem);
 
        rxe_mem_init(access, mem);
 
@@ -199,6 +199,12 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
                buf = map[0]->buf;
 
                for_each_sg_page(umem->sg_head.sgl, &sg_iter, umem->nmap, 0) {
+                       if (num_buf >= RXE_BUF_PER_MAP) {
+                               map++;
+                               buf = map[0]->buf;
+                               num_buf = 0;
+                       }
+
                        vaddr = page_address(sg_page_iter_page(&sg_iter));
                        if (!vaddr) {
                                pr_warn("null vaddr\n");
@@ -211,11 +217,6 @@ int rxe_mem_init_user(struct rxe_pd *pd, u64 start,
                        num_buf++;
                        buf++;
 
-                       if (num_buf >= RXE_BUF_PER_MAP) {
-                               map++;
-                               buf = map[0]->buf;
-                               num_buf = 0;
-                       }
                }
        }
 
index 753cabc..5a3474f 100644 (file)
@@ -338,13 +338,13 @@ static void prepare_ipv6_hdr(struct dst_entry *dst, struct sk_buff *skb,
        ip6h->payload_len = htons(skb->len - sizeof(*ip6h));
 }
 
-static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
-                   struct rxe_av *av)
+static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
        struct rxe_qp *qp = pkt->qp;
        struct dst_entry *dst;
        bool xnet = false;
        __be16 df = htons(IP_DF);
+       struct rxe_av *av = rxe_get_av(pkt);
        struct in_addr *saddr = &av->sgid_addr._sockaddr_in.sin_addr;
        struct in_addr *daddr = &av->dgid_addr._sockaddr_in.sin_addr;
 
@@ -364,11 +364,11 @@ static int prepare4(struct rxe_pkt_info *pkt, struct sk_buff *skb,
        return 0;
 }
 
-static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
-                   struct rxe_av *av)
+static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
        struct rxe_qp *qp = pkt->qp;
        struct dst_entry *dst;
+       struct rxe_av *av = rxe_get_av(pkt);
        struct in6_addr *saddr = &av->sgid_addr._sockaddr_in6.sin6_addr;
        struct in6_addr *daddr = &av->dgid_addr._sockaddr_in6.sin6_addr;
 
@@ -392,16 +392,15 @@ static int prepare6(struct rxe_pkt_info *pkt, struct sk_buff *skb,
 int rxe_prepare(struct rxe_pkt_info *pkt, struct sk_buff *skb, u32 *crc)
 {
        int err = 0;
-       struct rxe_av *av = rxe_get_av(pkt);
 
-       if (av->network_type == RDMA_NETWORK_IPV4)
-               err = prepare4(pkt, skb, av);
-       else if (av->network_type == RDMA_NETWORK_IPV6)
-               err = prepare6(pkt, skb, av);
+       if (skb->protocol == htons(ETH_P_IP))
+               err = prepare4(pkt, skb);
+       else if (skb->protocol == htons(ETH_P_IPV6))
+               err = prepare6(pkt, skb);
 
        *crc = rxe_icrc_hdr(pkt, skb);
 
-       if (ether_addr_equal(skb->dev->dev_addr, av->dmac))
+       if (ether_addr_equal(skb->dev->dev_addr, rxe_get_av(pkt)->dmac))
                pkt->mask |= RXE_LOOPBACK_MASK;
 
        return err;
@@ -422,23 +421,20 @@ static void rxe_skb_tx_dtor(struct sk_buff *skb)
 
 int rxe_send(struct rxe_pkt_info *pkt, struct sk_buff *skb)
 {
-       struct rxe_av *av;
        int err;
 
-       av = rxe_get_av(pkt);
-
        skb->destructor = rxe_skb_tx_dtor;
        skb->sk = pkt->qp->sk->sk;
 
        rxe_add_ref(pkt->qp);
        atomic_inc(&pkt->qp->skb_out);
 
-       if (av->network_type == RDMA_NETWORK_IPV4) {
+       if (skb->protocol == htons(ETH_P_IP)) {
                err = ip_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
-       } else if (av->network_type == RDMA_NETWORK_IPV6) {
+       } else if (skb->protocol == htons(ETH_P_IPV6)) {
                err = ip6_local_out(dev_net(skb_dst(skb)->dev), skb->sk, skb);
        } else {
-               pr_err("Unknown layer 3 protocol: %d\n", av->network_type);
+               pr_err("Unknown layer 3 protocol: %d\n", skb->protocol);
                atomic_dec(&pkt->qp->skb_out);
                rxe_drop_ref(pkt->qp);
                kfree_skb(skb);
@@ -462,7 +458,7 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
                                int paylen, struct rxe_pkt_info *pkt)
 {
        unsigned int hdr_len;
-       struct sk_buff *skb;
+       struct sk_buff *skb = NULL;
        struct net_device *ndev;
        const struct ib_gid_attr *attr;
        const int port_num = 1;
@@ -470,7 +466,6 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
        attr = rdma_get_gid_attr(&rxe->ib_dev, port_num, av->grh.sgid_index);
        if (IS_ERR(attr))
                return NULL;
-       ndev = attr->ndev;
 
        if (av->network_type == RDMA_NETWORK_IPV4)
                hdr_len = ETH_HLEN + sizeof(struct udphdr) +
@@ -479,15 +474,26 @@ struct sk_buff *rxe_init_packet(struct rxe_dev *rxe, struct rxe_av *av,
                hdr_len = ETH_HLEN + sizeof(struct udphdr) +
                        sizeof(struct ipv6hdr);
 
+       rcu_read_lock();
+       ndev = rdma_read_gid_attr_ndev_rcu(attr);
+       if (IS_ERR(ndev)) {
+               rcu_read_unlock();
+               goto out;
+       }
        skb = alloc_skb(paylen + hdr_len + LL_RESERVED_SPACE(ndev),
                        GFP_ATOMIC);
 
-       if (unlikely(!skb))
+       if (unlikely(!skb)) {
+               rcu_read_unlock();
                goto out;
+       }
 
-       skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(rxe->ndev));
+       skb_reserve(skb, hdr_len + LL_RESERVED_SPACE(ndev));
 
+       /* FIXME: hold reference to this netdev until life of this skb. */
        skb->dev        = ndev;
+       rcu_read_unlock();
+
        if (av->network_type == RDMA_NETWORK_IPV4)
                skb->protocol = htons(ETH_P_IP);
        else
index 120fa90..56cf18a 100644 (file)
@@ -52,12 +52,12 @@ struct rxe_type_info rxe_type_info[RXE_NUM_TYPES] = {
        [RXE_TYPE_AH] = {
                .name           = "rxe-ah",
                .size           = sizeof(struct rxe_ah),
-               .flags          = RXE_POOL_ATOMIC,
+               .flags          = RXE_POOL_ATOMIC | RXE_POOL_NO_ALLOC,
        },
        [RXE_TYPE_SRQ] = {
                .name           = "rxe-srq",
                .size           = sizeof(struct rxe_srq),
-               .flags          = RXE_POOL_INDEX,
+               .flags          = RXE_POOL_INDEX | RXE_POOL_NO_ALLOC,
                .min_index      = RXE_MIN_SRQ_INDEX,
                .max_index      = RXE_MAX_SRQ_INDEX,
        },
index 09ede70..e2c6d1c 100644 (file)
@@ -217,8 +217,7 @@ static void rxe_qp_init_misc(struct rxe_dev *rxe, struct rxe_qp *qp,
 }
 
 static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
-                          struct ib_qp_init_attr *init,
-                          struct ib_ucontext *context,
+                          struct ib_qp_init_attr *init, struct ib_udata *udata,
                           struct rxe_create_qp_resp __user *uresp)
 {
        int err;
@@ -254,7 +253,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
        if (!qp->sq.queue)
                return -ENOMEM;
 
-       err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, context,
+       err = do_mmap_info(rxe, uresp ? &uresp->sq_mi : NULL, udata,
                           qp->sq.queue->buf, qp->sq.queue->buf_size,
                           &qp->sq.queue->ip);
 
@@ -287,7 +286,7 @@ static int rxe_qp_init_req(struct rxe_dev *rxe, struct rxe_qp *qp,
 
 static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
                            struct ib_qp_init_attr *init,
-                           struct ib_ucontext *context,
+                           struct ib_udata *udata,
                            struct rxe_create_qp_resp __user *uresp)
 {
        int err;
@@ -308,7 +307,7 @@ static int rxe_qp_init_resp(struct rxe_dev *rxe, struct rxe_qp *qp,
                if (!qp->rq.queue)
                        return -ENOMEM;
 
-               err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, context,
+               err = do_mmap_info(rxe, uresp ? &uresp->rq_mi : NULL, udata,
                                   qp->rq.queue->buf, qp->rq.queue->buf_size,
                                   &qp->rq.queue->ip);
                if (err) {
@@ -344,8 +343,6 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
        struct rxe_cq *rcq = to_rcq(init->recv_cq);
        struct rxe_cq *scq = to_rcq(init->send_cq);
        struct rxe_srq *srq = init->srq ? to_rsrq(init->srq) : NULL;
-       struct rxe_ucontext *ucontext =
-               rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
 
        rxe_add_ref(pd);
        rxe_add_ref(rcq);
@@ -360,11 +357,11 @@ int rxe_qp_from_init(struct rxe_dev *rxe, struct rxe_qp *qp, struct rxe_pd *pd,
 
        rxe_qp_init_misc(rxe, qp, init);
 
-       err = rxe_qp_init_req(rxe, qp, init, &ucontext->ibuc, uresp);
+       err = rxe_qp_init_req(rxe, qp, init, udata, uresp);
        if (err)
                goto err1;
 
-       err = rxe_qp_init_resp(rxe, qp, init, &ucontext->ibuc, uresp);
+       err = rxe_qp_init_resp(rxe, qp, init, udata, uresp);
        if (err)
                goto err2;
 
index f84ab44..ff92704 100644 (file)
 #include "rxe_loc.h"
 #include "rxe_queue.h"
 
-int do_mmap_info(struct rxe_dev *rxe,
-                struct mminfo __user *outbuf,
-                struct ib_ucontext *context,
-                struct rxe_queue_buf *buf,
-                size_t buf_size,
-                struct rxe_mmap_info **ip_p)
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+                struct ib_udata *udata, struct rxe_queue_buf *buf,
+                size_t buf_size, struct rxe_mmap_info **ip_p)
 {
        int err;
        struct rxe_mmap_info *ip = NULL;
 
        if (outbuf) {
-               ip = rxe_create_mmap_info(rxe, buf_size, context, buf);
+               ip = rxe_create_mmap_info(rxe, buf_size, udata, buf);
                if (!ip)
                        goto err1;
 
@@ -153,12 +150,9 @@ static int resize_finish(struct rxe_queue *q, struct rxe_queue *new_q,
        return 0;
 }
 
-int rxe_queue_resize(struct rxe_queue *q,
-                    unsigned int *num_elem_p,
-                    unsigned int elem_size,
-                    struct ib_ucontext *context,
-                    struct mminfo __user *outbuf,
-                    spinlock_t *producer_lock,
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+                    unsigned int elem_size, struct ib_udata *udata,
+                    struct mminfo __user *outbuf, spinlock_t *producer_lock,
                     spinlock_t *consumer_lock)
 {
        struct rxe_queue *new_q;
@@ -170,7 +164,7 @@ int rxe_queue_resize(struct rxe_queue *q,
        if (!new_q)
                return -ENOMEM;
 
-       err = do_mmap_info(new_q->rxe, outbuf, context, new_q->buf,
+       err = do_mmap_info(new_q->rxe, outbuf, udata, new_q->buf,
                           new_q->buf_size, &new_q->ip);
        if (err) {
                vfree(new_q->buf);
index 79ba4b3..acd0a92 100644 (file)
@@ -76,12 +76,9 @@ struct rxe_queue {
        unsigned int            index_mask;
 };
 
-int do_mmap_info(struct rxe_dev *rxe,
-                struct mminfo __user *outbuf,
-                struct ib_ucontext *context,
-                struct rxe_queue_buf *buf,
-                size_t buf_size,
-                struct rxe_mmap_info **ip_p);
+int do_mmap_info(struct rxe_dev *rxe, struct mminfo __user *outbuf,
+                struct ib_udata *udata, struct rxe_queue_buf *buf,
+                size_t buf_size, struct rxe_mmap_info **ip_p);
 
 void rxe_queue_reset(struct rxe_queue *q);
 
@@ -89,10 +86,8 @@ struct rxe_queue *rxe_queue_init(struct rxe_dev *rxe,
                                 int *num_elem,
                                 unsigned int elem_size);
 
-int rxe_queue_resize(struct rxe_queue *q,
-                    unsigned int *num_elem_p,
-                    unsigned int elem_size,
-                    struct ib_ucontext *context,
+int rxe_queue_resize(struct rxe_queue *q, unsigned int *num_elem_p,
+                    unsigned int elem_size, struct ib_udata *udata,
                     struct mminfo __user *outbuf,
                     /* Protect producers while resizing queue */
                     spinlock_t *producer_lock,
index c41a5fe..d845943 100644 (file)
@@ -99,8 +99,7 @@ err1:
 }
 
 int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
-                     struct ib_srq_init_attr *init,
-                     struct ib_ucontext *context,
+                     struct ib_srq_init_attr *init, struct ib_udata *udata,
                      struct rxe_create_srq_resp __user *uresp)
 {
        int err;
@@ -128,7 +127,7 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
 
        srq->rq.queue = q;
 
-       err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, context, q->buf,
+       err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, q->buf,
                           q->buf_size, &q->ip);
        if (err) {
                vfree(q->buf);
@@ -149,7 +148,7 @@ int rxe_srq_from_init(struct rxe_dev *rxe, struct rxe_srq *srq,
 
 int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
                      struct ib_srq_attr *attr, enum ib_srq_attr_mask mask,
-                     struct rxe_modify_srq_cmd *ucmd)
+                     struct rxe_modify_srq_cmd *ucmd, struct ib_udata *udata)
 {
        int err;
        struct rxe_queue *q = srq->rq.queue;
@@ -163,11 +162,8 @@ int rxe_srq_from_attr(struct rxe_dev *rxe, struct rxe_srq *srq,
                mi = u64_to_user_ptr(ucmd->mmap_info_addr);
 
                err = rxe_queue_resize(q, &attr->max_wr,
-                                      rcv_wqe_size(srq->rq.max_sge),
-                                      srq->rq.queue->ip ?
-                                               srq->rq.queue->ip->context :
-                                               NULL,
-                                      mi, &srq->rq.producer_lock,
+                                      rcv_wqe_size(srq->rq.max_sge), udata, mi,
+                                      &srq->rq.producer_lock,
                                       &srq->rq.consumer_lock);
                if (err)
                        goto err2;
index 6ecf285..8c3e2a1 100644 (file)
@@ -176,8 +176,7 @@ static int rxe_port_immutable(struct ib_device *dev, u8 port_num,
        return 0;
 }
 
-static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
-                       struct ib_udata *udata)
+static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct rxe_dev *rxe = to_rdev(ibpd->device);
        struct rxe_pd *pd = to_rpd(ibpd);
@@ -185,37 +184,31 @@ static int rxe_alloc_pd(struct ib_pd *ibpd, struct ib_ucontext *context,
        return rxe_add_to_pool(&rxe->pd_pool, &pd->pelem);
 }
 
-static void rxe_dealloc_pd(struct ib_pd *ibpd)
+static void rxe_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
 {
        struct rxe_pd *pd = to_rpd(ibpd);
 
        rxe_drop_ref(pd);
 }
 
-static struct ib_ah *rxe_create_ah(struct ib_pd *ibpd,
-                                  struct rdma_ah_attr *attr,
-                                  u32 flags,
-                                  struct ib_udata *udata)
+static int rxe_create_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr,
+                        u32 flags, struct ib_udata *udata)
 
 {
        int err;
-       struct rxe_dev *rxe = to_rdev(ibpd->device);
-       struct rxe_pd *pd = to_rpd(ibpd);
-       struct rxe_ah *ah;
+       struct rxe_dev *rxe = to_rdev(ibah->device);
+       struct rxe_ah *ah = to_rah(ibah);
 
        err = rxe_av_chk_attr(rxe, attr);
        if (err)
-               return ERR_PTR(err);
-
-       ah = rxe_alloc(&rxe->ah_pool);
-       if (!ah)
-               return ERR_PTR(-ENOMEM);
+               return err;
 
-       rxe_add_ref(pd);
-       ah->pd = pd;
+       err = rxe_add_to_pool(&rxe->ah_pool, &ah->pelem);
+       if (err)
+               return err;
 
        rxe_init_av(attr, &ah->av);
-       return &ah->ibah;
+       return 0;
 }
 
 static int rxe_modify_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
@@ -242,13 +235,11 @@ static int rxe_query_ah(struct ib_ah *ibah, struct rdma_ah_attr *attr)
        return 0;
 }
 
-static int rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
+static void rxe_destroy_ah(struct ib_ah *ibah, u32 flags)
 {
        struct rxe_ah *ah = to_rah(ibah);
 
-       rxe_drop_ref(ah->pd);
        rxe_drop_ref(ah);
-       return 0;
 }
 
 static int post_one_recv(struct rxe_rq *rq, const struct ib_recv_wr *ibwr)
@@ -298,21 +289,18 @@ err1:
        return err;
 }
 
-static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
-                                    struct ib_srq_init_attr *init,
-                                    struct ib_udata *udata)
+static int rxe_create_srq(struct ib_srq *ibsrq, struct ib_srq_init_attr *init,
+                         struct ib_udata *udata)
 {
        int err;
-       struct rxe_dev *rxe = to_rdev(ibpd->device);
-       struct rxe_pd *pd = to_rpd(ibpd);
-       struct rxe_ucontext *ucontext =
-               rdma_udata_to_drv_context(udata, struct rxe_ucontext, ibuc);
-       struct rxe_srq *srq;
+       struct rxe_dev *rxe = to_rdev(ibsrq->device);
+       struct rxe_pd *pd = to_rpd(ibsrq->pd);
+       struct rxe_srq *srq = to_rsrq(ibsrq);
        struct rxe_create_srq_resp __user *uresp = NULL;
 
        if (udata) {
                if (udata->outlen < sizeof(*uresp))
-                       return ERR_PTR(-EINVAL);
+                       return -EINVAL;
                uresp = udata->outbuf;
        }
 
@@ -320,28 +308,24 @@ static struct ib_srq *rxe_create_srq(struct ib_pd *ibpd,
        if (err)
                goto err1;
 
-       srq = rxe_alloc(&rxe->srq_pool);
-       if (!srq) {
-               err = -ENOMEM;
+       err = rxe_add_to_pool(&rxe->srq_pool, &srq->pelem);
+       if (err)
                goto err1;
-       }
 
-       rxe_add_index(srq);
        rxe_add_ref(pd);
        srq->pd = pd;
 
-       err = rxe_srq_from_init(rxe, srq, init, &ucontext->ibuc, uresp);
+       err = rxe_srq_from_init(rxe, srq, init, udata, uresp);
        if (err)
                goto err2;
 
-       return &srq->ibsrq;
+       return 0;
 
 err2:
        rxe_drop_ref(pd);
-       rxe_drop_index(srq);
        rxe_drop_ref(srq);
 err1:
-       return ERR_PTR(err);
+       return err;
 }
 
 static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
@@ -366,7 +350,7 @@ static int rxe_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
        if (err)
                goto err1;
 
-       err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd);
+       err = rxe_srq_from_attr(rxe, srq, attr, mask, &ucmd, udata);
        if (err)
                goto err1;
 
@@ -389,7 +373,7 @@ static int rxe_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
        return 0;
 }
 
-static int rxe_destroy_srq(struct ib_srq *ibsrq)
+static void rxe_destroy_srq(struct ib_srq *ibsrq, struct ib_udata *udata)
 {
        struct rxe_srq *srq = to_rsrq(ibsrq);
 
@@ -397,10 +381,7 @@ static int rxe_destroy_srq(struct ib_srq *ibsrq)
                rxe_queue_cleanup(srq->rq.queue);
 
        rxe_drop_ref(srq->pd);
-       rxe_drop_index(srq);
        rxe_drop_ref(srq);
-
-       return 0;
 }
 
 static int rxe_post_srq_recv(struct ib_srq *ibsrq, const struct ib_recv_wr *wr,
@@ -509,7 +490,7 @@ static int rxe_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
        return 0;
 }
 
-static int rxe_destroy_qp(struct ib_qp *ibqp)
+static int rxe_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
 {
        struct rxe_qp *qp = to_rqp(ibqp);
 
@@ -799,7 +780,6 @@ err1:
 
 static struct ib_cq *rxe_create_cq(struct ib_device *dev,
                                   const struct ib_cq_init_attr *attr,
-                                  struct ib_ucontext *context,
                                   struct ib_udata *udata)
 {
        int err;
@@ -826,8 +806,8 @@ static struct ib_cq *rxe_create_cq(struct ib_device *dev,
                goto err1;
        }
 
-       err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector,
-                              context, uresp);
+       err = rxe_cq_from_init(rxe, cq, attr->cqe, attr->comp_vector, udata,
+                              uresp);
        if (err)
                goto err2;
 
@@ -839,7 +819,7 @@ err1:
        return ERR_PTR(err);
 }
 
-static int rxe_destroy_cq(struct ib_cq *ibcq)
+static int rxe_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
 {
        struct rxe_cq *cq = to_rcq(ibcq);
 
@@ -866,7 +846,7 @@ static int rxe_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
        if (err)
                goto err1;
 
-       err = rxe_cq_resize_queue(cq, cqe, uresp);
+       err = rxe_cq_resize_queue(cq, cqe, uresp, udata);
        if (err)
                goto err1;
 
@@ -990,7 +970,7 @@ err2:
        return ERR_PTR(err);
 }
 
-static int rxe_dereg_mr(struct ib_mr *ibmr)
+static int rxe_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
 {
        struct rxe_mem *mr = to_rmr(ibmr);
 
@@ -1001,9 +981,8 @@ static int rxe_dereg_mr(struct ib_mr *ibmr)
        return 0;
 }
 
-static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd,
-                                 enum ib_mr_type mr_type,
-                                 u32 max_num_sg)
+static struct ib_mr *rxe_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
+                                 u32 max_num_sg, struct ib_udata *udata)
 {
        struct rxe_dev *rxe = to_rdev(ibpd->device);
        struct rxe_pd *pd = to_rpd(ibpd);
@@ -1176,7 +1155,10 @@ static const struct ib_device_ops rxe_dev_ops = {
        .reg_user_mr = rxe_reg_user_mr,
        .req_notify_cq = rxe_req_notify_cq,
        .resize_cq = rxe_resize_cq,
+
+       INIT_RDMA_OBJ_SIZE(ib_ah, rxe_ah, ibah),
        INIT_RDMA_OBJ_SIZE(ib_pd, rxe_pd, ibpd),
+       INIT_RDMA_OBJ_SIZE(ib_srq, rxe_srq, ibsrq),
        INIT_RDMA_OBJ_SIZE(ib_ucontext, rxe_ucontext, ibuc),
 };
 
index 157e51a..e8be7f4 100644 (file)
@@ -71,8 +71,8 @@ struct rxe_pd {
 };
 
 struct rxe_ah {
-       struct rxe_pool_entry   pelem;
        struct ib_ah            ibah;
+       struct rxe_pool_entry   pelem;
        struct rxe_pd           *pd;
        struct rxe_av           av;
 };
@@ -120,8 +120,8 @@ struct rxe_rq {
 };
 
 struct rxe_srq {
-       struct rxe_pool_entry   pelem;
        struct ib_srq           ibsrq;
+       struct rxe_pool_entry   pelem;
        struct rxe_pd           *pd;
        struct rxe_rq           rq;
        u32                     srq_num;
index 48eda16..9b5e11d 100644 (file)
@@ -2402,7 +2402,18 @@ static ssize_t dev_id_show(struct device *dev,
 {
        struct net_device *ndev = to_net_dev(dev);
 
-       if (ndev->dev_id == ndev->dev_port)
+       /*
+        * ndev->dev_port will be equal to 0 in old kernels prior to commit
+        * 9b8b2a323008 ("IB/ipoib: Use dev_port to expose network interface
+        * port numbers"). Zero was chosen as a special case for user space
+        * applications to fall back and query dev_id to check whether it
+        * has a different value.
+        *
+        * Don't print the warning in such a scenario.
+        *
+        * https://github.com/systemd/systemd/blob/master/src/udev/udev-builtin-net_id.c#L358
+        */
+       if (ndev->dev_port && ndev->dev_id == ndev->dev_port)
                netdev_info_once(ndev,
                        "\"%s\" wants to know my dev_id. Should it look at dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more info.\n",
                        current->comm);
index 1e88213..ba09068 100644 (file)
@@ -279,8 +279,7 @@ void ipoib_event(struct ib_event_handler *handler,
        ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
                  dev_name(&record->device->dev), record->element.port_num);
 
-       if (record->event == IB_EVENT_SM_CHANGE ||
-           record->event == IB_EVENT_CLIENT_REREGISTER) {
+       if (record->event == IB_EVENT_CLIENT_REREGISTER) {
                queue_work(ipoib_workqueue, &priv->flush_light);
        } else if (record->event == IB_EVENT_PORT_ERR ||
                   record->event == IB_EVENT_PORT_ACTIVE ||
index d00af71..299268f 100644 (file)
@@ -4,8 +4,8 @@ config INFINIBAND_ISER
        select SCSI_ISCSI_ATTRS
        ---help---
          Support for the iSCSI Extensions for RDMA (iSER) Protocol
-          over InfiniBand. This allows you to access storage devices
-          that speak iSCSI over iSER over InfiniBand.
+         over InfiniBand. This allows you to access storage devices
+         that speak iSCSI over iSER over InfiniBand.
 
          The iSER protocol is defined by IETF.
          See <http://www.ietf.org/rfc/rfc5046.txt>
index 8c707ac..9c185a8 100644 (file)
@@ -763,7 +763,6 @@ static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
                                   enum iscsi_param param, char *buf)
 {
        struct iser_conn *iser_conn = ep->dd_data;
-       int len;
 
        switch (param) {
        case ISCSI_PARAM_CONN_PORT:
@@ -774,12 +773,10 @@ static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
                return iscsi_conn_get_addr_param((struct sockaddr_storage *)
                                &iser_conn->ib_conn.cma_id->route.addr.dst_addr,
                                param, buf);
-               break;
        default:
-               return -ENOSYS;
+               break;
        }
-
-       return len;
+       return -ENOSYS;
 }
 
 /**
index a7aeaa0..36d5251 100644 (file)
@@ -311,7 +311,7 @@ struct iser_login_desc {
        u64                          rsp_dma;
        struct ib_sge                sge;
        struct ib_cqe                cqe;
-} __attribute__((packed));
+} __packed;
 
 struct iser_conn;
 struct ib_conn;
index 560e4f2..be5befd 100644 (file)
@@ -51,6 +51,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/xarray.h>
 #include <rdma/ib_addr.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/opa_smi.h>
@@ -97,7 +98,7 @@ const char opa_vnic_driver_version[] = DRV_VERSION;
  * @class_port_info: Class port info information.
  * @tid: Transaction id
  * @port_num: OPA port number
- * @vport_idr: vnic ports idr
+ * @vports: vnic ports
  * @event_handler: ib event handler
  * @lock: adapter interface lock
  */
@@ -107,7 +108,7 @@ struct opa_vnic_vema_port {
        struct opa_class_port_info      class_port_info;
        u64                             tid;
        u8                              port_num;
-       struct idr                      vport_idr;
+       struct xarray                   vports;
        struct ib_event_handler         event_handler;
 
        /* Lock to query/update network adapter */
@@ -148,7 +149,7 @@ vema_get_vport_adapter(struct opa_vnic_vema_mad *recvd_mad,
 {
        u8 vport_num = vema_get_vport_num(recvd_mad);
 
-       return idr_find(&port->vport_idr, vport_num);
+       return xa_load(&port->vports, vport_num);
 }
 
 /**
@@ -207,8 +208,7 @@ static struct opa_vnic_adapter *vema_add_vport(struct opa_vnic_vema_port *port,
                int rc;
 
                adapter->cport = cport;
-               rc = idr_alloc(&port->vport_idr, adapter, vport_num,
-                              vport_num + 1, GFP_NOWAIT);
+               rc = xa_insert(&port->vports, vport_num, adapter, GFP_KERNEL);
                if (rc < 0) {
                        opa_vnic_rem_netdev(adapter);
                        adapter = ERR_PTR(rc);
@@ -853,36 +853,14 @@ err_exit:
        v_err("Aborting trap\n");
 }
 
-static int vema_rem_vport(int id, void *p, void *data)
-{
-       struct opa_vnic_adapter *adapter = p;
-
-       opa_vnic_rem_netdev(adapter);
-       return 0;
-}
-
-static int vema_enable_vport(int id, void *p, void *data)
-{
-       struct opa_vnic_adapter *adapter = p;
-
-       netif_carrier_on(adapter->netdev);
-       return 0;
-}
-
-static int vema_disable_vport(int id, void *p, void *data)
-{
-       struct opa_vnic_adapter *adapter = p;
-
-       netif_carrier_off(adapter->netdev);
-       return 0;
-}
-
 static void opa_vnic_event(struct ib_event_handler *handler,
                           struct ib_event *record)
 {
        struct opa_vnic_vema_port *port =
                container_of(handler, struct opa_vnic_vema_port, event_handler);
        struct opa_vnic_ctrl_port *cport = port->cport;
+       struct opa_vnic_adapter *adapter;
+       unsigned long index;
 
        if (record->element.port_num != port->port_num)
                return;
@@ -891,10 +869,16 @@ static void opa_vnic_event(struct ib_event_handler *handler,
              record->event, dev_name(&record->device->dev),
              record->element.port_num);
 
-       if (record->event == IB_EVENT_PORT_ERR)
-               idr_for_each(&port->vport_idr, vema_disable_vport, NULL);
-       if (record->event == IB_EVENT_PORT_ACTIVE)
-               idr_for_each(&port->vport_idr, vema_enable_vport, NULL);
+       if (record->event != IB_EVENT_PORT_ERR &&
+           record->event != IB_EVENT_PORT_ACTIVE)
+               return;
+
+       xa_for_each(&port->vports, index, adapter) {
+               if (record->event == IB_EVENT_PORT_ACTIVE)
+                       netif_carrier_on(adapter->netdev);
+               else
+                       netif_carrier_off(adapter->netdev);
+       }
 }
 
 /**
@@ -905,6 +889,8 @@ static void opa_vnic_event(struct ib_event_handler *handler,
  */
 static void vema_unregister(struct opa_vnic_ctrl_port *cport)
 {
+       struct opa_vnic_adapter *adapter;
+       unsigned long index;
        int i;
 
        for (i = 1; i <= cport->num_ports; i++) {
@@ -915,13 +901,14 @@ static void vema_unregister(struct opa_vnic_ctrl_port *cport)
 
                /* Lock ensures no MAD is being processed */
                mutex_lock(&port->lock);
-               idr_for_each(&port->vport_idr, vema_rem_vport, NULL);
+               xa_for_each(&port->vports, index, adapter)
+                       opa_vnic_rem_netdev(adapter);
                mutex_unlock(&port->lock);
 
                ib_unregister_mad_agent(port->mad_agent);
                port->mad_agent = NULL;
                mutex_destroy(&port->lock);
-               idr_destroy(&port->vport_idr);
+               xa_destroy(&port->vports);
                ib_unregister_event_handler(&port->event_handler);
        }
 }
@@ -958,7 +945,7 @@ static int vema_register(struct opa_vnic_ctrl_port *cport)
                                      cport->ibdev, opa_vnic_event);
                ib_register_event_handler(&port->event_handler);
 
-               idr_init(&port->vport_idr);
+               xa_init(&port->vports);
                mutex_init(&port->lock);
                port->mad_agent = ib_register_mad_agent(cport->ibdev, i,
                                                        IB_QPT_GSI, &reg_req,
@@ -969,7 +956,6 @@ static int vema_register(struct opa_vnic_ctrl_port *cport)
                        ret = PTR_ERR(port->mad_agent);
                        port->mad_agent = NULL;
                        mutex_destroy(&port->lock);
-                       idr_destroy(&port->vport_idr);
                        vema_unregister(cport);
                        return ret;
                }
index ca0ee99..0059b29 100644 (file)
@@ -535,23 +535,16 @@ void mlx5_init_clock(struct mlx5_core_dev *mdev)
        do_div(ns, NSEC_PER_SEC / HZ);
        clock->overflow_period = ns;
 
-       mdev->clock_info_page = alloc_page(GFP_KERNEL);
-       if (mdev->clock_info_page) {
-               mdev->clock_info = kmap(mdev->clock_info_page);
-               if (!mdev->clock_info) {
-                       __free_page(mdev->clock_info_page);
-                       mlx5_core_warn(mdev, "failed to map clock page\n");
-               } else {
-                       mdev->clock_info->sign   = 0;
-                       mdev->clock_info->nsec   = clock->tc.nsec;
-                       mdev->clock_info->cycles = clock->tc.cycle_last;
-                       mdev->clock_info->mask   = clock->cycles.mask;
-                       mdev->clock_info->mult   = clock->nominal_c_mult;
-                       mdev->clock_info->shift  = clock->cycles.shift;
-                       mdev->clock_info->frac   = clock->tc.frac;
-                       mdev->clock_info->overflow_period =
-                                               clock->overflow_period;
-               }
+       mdev->clock_info =
+               (struct mlx5_ib_clock_info *)get_zeroed_page(GFP_KERNEL);
+       if (mdev->clock_info) {
+               mdev->clock_info->nsec = clock->tc.nsec;
+               mdev->clock_info->cycles = clock->tc.cycle_last;
+               mdev->clock_info->mask = clock->cycles.mask;
+               mdev->clock_info->mult = clock->nominal_c_mult;
+               mdev->clock_info->shift = clock->cycles.shift;
+               mdev->clock_info->frac = clock->tc.frac;
+               mdev->clock_info->overflow_period = clock->overflow_period;
        }
 
        INIT_WORK(&clock->pps_info.out_work, mlx5_pps_out);
@@ -599,8 +592,7 @@ void mlx5_cleanup_clock(struct mlx5_core_dev *mdev)
        cancel_delayed_work_sync(&clock->overflow_work);
 
        if (mdev->clock_info) {
-               kunmap(mdev->clock_info_page);
-               __free_page(mdev->clock_info_page);
+               free_page((unsigned long)mdev->clock_info);
                mdev->clock_info = NULL;
        }
 
index c2be029..6c80944 100644 (file)
@@ -71,6 +71,13 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
                          const struct net_device *dev,
                          const char *fmt, ...);
 
+struct ib_device;
+
+extern __printf(3, 4)
+void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
+                        const struct ib_device *ibdev,
+                        const char *fmt, ...);
+
 #define DEFINE_DYNAMIC_DEBUG_METADATA(name, fmt)               \
        static struct _ddebug  __aligned(8)                     \
        __attribute__((section("__verbose"))) name = {          \
@@ -154,6 +161,10 @@ void __dynamic_netdev_dbg(struct _ddebug *descriptor,
        _dynamic_func_call(fmt, __dynamic_netdev_dbg,           \
                           dev, fmt, ##__VA_ARGS__)
 
+#define dynamic_ibdev_dbg(dev, fmt, ...)                       \
+       _dynamic_func_call(fmt, __dynamic_ibdev_dbg,            \
+                          dev, fmt, ##__VA_ARGS__)
+
 #define dynamic_hex_dump(prefix_str, prefix_type, rowsize,             \
                         groupsize, buf, len, ascii)                    \
        _dynamic_func_call_no_desc(__builtin_constant_p(prefix_str) ? prefix_str : "hexdump", \
index 5a39b32..5a27246 100644 (file)
@@ -689,7 +689,6 @@ struct mlx5_core_dev {
 #endif
        struct mlx5_clock        clock;
        struct mlx5_ib_clock_info  *clock_info;
-       struct page             *clock_info_page;
        struct mlx5_fw_tracer   *tracer;
 };
 
index 40b48e2..15eb85d 100644 (file)
 #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T)))
 #define type_min(T) ((T)((T)-type_max(T)-(T)1))
 
+/*
+ * Avoids triggering the -Wtype-limits compilation warning
+ * when checking a < 0 on unsigned data types.
+ */
+#define is_non_negative(a) ((a) > 0 || (a) == 0)
+#define is_negative(a) (!(is_non_negative(a)))
 
 #ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW
 /*
        typeof(d) _d = d;                                               \
        u64 _a_full = _a;                                               \
        unsigned int _to_shift =                                        \
-               _s >= 0 && _s < 8 * sizeof(*d) ? _s : 0;                \
+               is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0;    \
        *_d = (_a_full << _to_shift);                                   \
-       (_to_shift != _s || *_d < 0 || _a < 0 ||                        \
-               (*_d >> _to_shift) != _a);                              \
+       (_to_shift != _s || is_negative(*_d) || is_negative(_a) ||      \
+       (*_d >> _to_shift) != _a);                                      \
 })
 
 /**
index b4be960..30a9a55 100644 (file)
@@ -340,11 +340,11 @@ int sg_alloc_table_chained(struct sg_table *table, int nents,
  * sg page iterator
  *
  * Iterates over sg entries page-by-page.  On each successful iteration, you
- * can call sg_page_iter_page(@piter) to get the current page and its dma
- * address. @piter->sg will point to the sg holding this page and
- * @piter->sg_pgoffset to the page's page offset within the sg. The iteration
- * will stop either when a maximum number of sg entries was reached or a
- * terminating sg (sg_last(sg) == true) was reached.
+ * can call sg_page_iter_page(@piter) to get the current page.
+ * @piter->sg will point to the sg holding this page and @piter->sg_pgoffset to
+ * the page's page offset within the sg. The iteration will stop either when a
+ * maximum number of sg entries was reached or a terminating sg
+ * (sg_last(sg) == true) was reached.
  */
 struct sg_page_iter {
        struct scatterlist      *sg;            /* sg holding the page */
index 62e990b..870b5e6 100644 (file)
@@ -54,6 +54,10 @@ const struct ib_gid_attr *rdma_find_gid_by_filter(
                       void *),
        void *context);
 
+int rdma_read_gid_l2_fields(const struct ib_gid_attr *attr,
+                           u16 *vlan_id, u8 *smac);
+struct net_device *rdma_read_gid_attr_ndev_rcu(const struct ib_gid_attr *attr);
+
 /**
  * ib_get_cached_pkey - Returns a cached PKey table entry
  * @device: The device to query.
index 79ba821..eea946f 100644 (file)
@@ -198,7 +198,7 @@ struct ib_sa_hdr {
        __be16                  attr_offset;
        __be16                  reserved;
        ib_sa_comp_mask         comp_mask;
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_mad {
        struct ib_mad_hdr       mad_hdr;
@@ -227,7 +227,7 @@ struct ib_sa_mad {
        struct ib_rmpp_hdr      rmpp_hdr;
        struct ib_sa_hdr        sa_hdr;
        u8                      data[IB_MGMT_SA_DATA];
-} __attribute__ ((packed));
+} __packed;
 
 struct ib_vendor_mad {
        struct ib_mad_hdr       mad_hdr;
index b439e98..7be0028 100644 (file)
@@ -61,7 +61,7 @@ struct ib_smp {
        u8      data[IB_SMP_DATA_SIZE];
        u8      initial_path[IB_SMP_MAX_PATH_HOPS];
        u8      return_path[IB_SMP_MAX_PATH_HOPS];
-} __attribute__ ((packed));
+} __packed;
 
 #define IB_SMP_DIRECTION                       cpu_to_be16(0x8000)
 
index 73af05d..040d853 100644 (file)
@@ -48,12 +48,11 @@ struct ib_umem {
        unsigned long           address;
        int                     page_shift;
        u32 writable : 1;
-       u32 hugetlb : 1;
        u32 is_odp : 1;
        struct work_struct      work;
        struct sg_table sg_head;
        int             nmap;
-       int             npages;
+       unsigned int    sg_nents;
 };
 
 /* Returns the offset of the umem start relative to the first page. */
@@ -87,6 +86,9 @@ void ib_umem_release(struct ib_umem *umem);
 int ib_umem_page_count(struct ib_umem *umem);
 int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
                      size_t length);
+unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem,
+                                    unsigned long pgsz_bitmap,
+                                    unsigned long virt);
 
 #else /* CONFIG_INFINIBAND_USER_MEM */
 
@@ -104,6 +106,12 @@ static inline int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offs
                                    size_t length) {
        return -EINVAL;
 }
+static inline int ib_umem_find_best_pgsz(struct ib_umem *umem,
+                                        unsigned long pgsz_bitmap,
+                                        unsigned long virt) {
+       return -EINVAL;
+}
+
 #endif /* CONFIG_INFINIBAND_USER_MEM */
 
 #endif /* IB_UMEM_H */
index dadc96d..eeec4e5 100644 (file)
@@ -69,6 +69,7 @@ struct ib_umem_odp {
 
        int notifiers_seq;
        int notifiers_count;
+       int npages;
 
        /* Tree tracking */
        struct umem_odp_node    interval_tree;
index 9b9e17b..0742095 100644 (file)
@@ -59,6 +59,8 @@
 #include <linux/mmu_notifier.h>
 #include <linux/uaccess.h>
 #include <linux/cgroup_rdma.h>
+#include <linux/irqflags.h>
+#include <linux/preempt.h>
 #include <uapi/rdma/ib_user_verbs.h>
 #include <rdma/restrack.h>
 #include <uapi/rdma/rdma_user_ioctl.h>
@@ -72,6 +74,36 @@ extern struct workqueue_struct *ib_wq;
 extern struct workqueue_struct *ib_comp_wq;
 extern struct workqueue_struct *ib_comp_unbound_wq;
 
+__printf(3, 4) __cold
+void ibdev_printk(const char *level, const struct ib_device *ibdev,
+                 const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_emerg(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_alert(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_crit(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_err(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_warn(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_notice(const struct ib_device *ibdev, const char *format, ...);
+__printf(2, 3) __cold
+void ibdev_info(const struct ib_device *ibdev, const char *format, ...);
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+#define ibdev_dbg(__dev, format, args...)                       \
+       dynamic_ibdev_dbg(__dev, format, ##args)
+#elif defined(DEBUG)
+#define ibdev_dbg(__dev, format, args...)                       \
+       ibdev_printk(KERN_DEBUG, __dev, format, ##args)
+#else
+__printf(2, 3) __cold
+static inline
+void ibdev_dbg(const struct ib_device *ibdev, const char *format, ...) {}
+#endif
+
 union ib_gid {
        u8      raw[16];
        struct {
@@ -92,7 +124,7 @@ enum ib_gid_type {
 
 #define ROCE_V2_UDP_DPORT      4791
 struct ib_gid_attr {
-       struct net_device       *ndev;
+       struct net_device __rcu *ndev;
        struct ib_device        *device;
        union ib_gid            gid;
        enum ib_gid_type        gid_type;
@@ -108,6 +140,7 @@ enum rdma_node_type {
        RDMA_NODE_RNIC,
        RDMA_NODE_USNIC,
        RDMA_NODE_USNIC_UDP,
+       RDMA_NODE_UNSPECIFIED,
 };
 
 enum {
@@ -119,7 +152,8 @@ enum rdma_transport_type {
        RDMA_TRANSPORT_IB,
        RDMA_TRANSPORT_IWARP,
        RDMA_TRANSPORT_USNIC,
-       RDMA_TRANSPORT_USNIC_UDP
+       RDMA_TRANSPORT_USNIC_UDP,
+       RDMA_TRANSPORT_UNSPECIFIED,
 };
 
 enum rdma_protocol_type {
@@ -2189,8 +2223,6 @@ struct ib_cache {
        struct ib_event_handler event_handler;
 };
 
-struct iw_cm_verbs;
-
 struct ib_port_immutable {
        int                           pkey_tbl_len;
        int                           gid_tbl_len;
@@ -2272,6 +2304,8 @@ struct ib_counters_read_attr {
 };
 
 struct uverbs_attr_bundle;
+struct iw_cm_id;
+struct iw_cm_conn_param;
 
 #define INIT_RDMA_OBJ_SIZE(ib_struct, drv_struct, member)                      \
        .size_##ib_struct =                                                    \
@@ -2281,8 +2315,11 @@ struct uverbs_attr_bundle;
                         !__same_type(((struct drv_struct *)NULL)->member,     \
                                      struct ib_struct)))
 
+#define rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, gfp)                         \
+       ((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, gfp))
+
 #define rdma_zalloc_drv_obj(ib_dev, ib_type)                                   \
-       ((struct ib_type *)kzalloc(ib_dev->ops.size_##ib_type, GFP_KERNEL))
+       rdma_zalloc_drv_obj_gfp(ib_dev, ib_type, GFP_KERNEL)
 
 #define DECLARE_RDMA_OBJ_SIZE(ib_struct) size_t size_##ib_struct
 
@@ -2394,23 +2431,21 @@ struct ib_device_ops {
        void (*dealloc_ucontext)(struct ib_ucontext *context);
        int (*mmap)(struct ib_ucontext *context, struct vm_area_struct *vma);
        void (*disassociate_ucontext)(struct ib_ucontext *ibcontext);
-       int (*alloc_pd)(struct ib_pd *pd, struct ib_ucontext *context,
-                       struct ib_udata *udata);
-       void (*dealloc_pd)(struct ib_pd *pd);
-       struct ib_ah *(*create_ah)(struct ib_pd *pd,
-                                  struct rdma_ah_attr *ah_attr, u32 flags,
-                                  struct ib_udata *udata);
+       int (*alloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
+       void (*dealloc_pd)(struct ib_pd *pd, struct ib_udata *udata);
+       int (*create_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr,
+                        u32 flags, struct ib_udata *udata);
        int (*modify_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
        int (*query_ah)(struct ib_ah *ah, struct rdma_ah_attr *ah_attr);
-       int (*destroy_ah)(struct ib_ah *ah, u32 flags);
-       struct ib_srq *(*create_srq)(struct ib_pd *pd,
-                                    struct ib_srq_init_attr *srq_init_attr,
-                                    struct ib_udata *udata);
+       void (*destroy_ah)(struct ib_ah *ah, u32 flags);
+       int (*create_srq)(struct ib_srq *srq,
+                         struct ib_srq_init_attr *srq_init_attr,
+                         struct ib_udata *udata);
        int (*modify_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr,
                          enum ib_srq_attr_mask srq_attr_mask,
                          struct ib_udata *udata);
        int (*query_srq)(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
-       int (*destroy_srq)(struct ib_srq *srq);
+       void (*destroy_srq)(struct ib_srq *srq, struct ib_udata *udata);
        struct ib_qp *(*create_qp)(struct ib_pd *pd,
                                   struct ib_qp_init_attr *qp_init_attr,
                                   struct ib_udata *udata);
@@ -2418,13 +2453,12 @@ struct ib_device_ops {
                         int qp_attr_mask, struct ib_udata *udata);
        int (*query_qp)(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
                        int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
-       int (*destroy_qp)(struct ib_qp *qp);
+       int (*destroy_qp)(struct ib_qp *qp, struct ib_udata *udata);
        struct ib_cq *(*create_cq)(struct ib_device *device,
                                   const struct ib_cq_init_attr *attr,
-                                  struct ib_ucontext *context,
                                   struct ib_udata *udata);
        int (*modify_cq)(struct ib_cq *cq, u16 cq_count, u16 cq_period);
-       int (*destroy_cq)(struct ib_cq *cq);
+       int (*destroy_cq)(struct ib_cq *cq, struct ib_udata *udata);
        int (*resize_cq)(struct ib_cq *cq, int cqe, struct ib_udata *udata);
        struct ib_mr *(*get_dma_mr)(struct ib_pd *pd, int mr_access_flags);
        struct ib_mr *(*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length,
@@ -2433,9 +2467,9 @@ struct ib_device_ops {
        int (*rereg_user_mr)(struct ib_mr *mr, int flags, u64 start, u64 length,
                             u64 virt_addr, int mr_access_flags,
                             struct ib_pd *pd, struct ib_udata *udata);
-       int (*dereg_mr)(struct ib_mr *mr);
+       int (*dereg_mr)(struct ib_mr *mr, struct ib_udata *udata);
        struct ib_mr *(*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type,
-                                 u32 max_num_sg);
+                                 u32 max_num_sg, struct ib_udata *udata);
        int (*advise_mr)(struct ib_pd *pd,
                         enum ib_uverbs_advise_mr_advice advice, u32 flags,
                         struct ib_sge *sg_list, u32 num_sge,
@@ -2456,9 +2490,8 @@ struct ib_device_ops {
        int (*attach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
        int (*detach_mcast)(struct ib_qp *qp, union ib_gid *gid, u16 lid);
        struct ib_xrcd *(*alloc_xrcd)(struct ib_device *device,
-                                     struct ib_ucontext *ucontext,
                                      struct ib_udata *udata);
-       int (*dealloc_xrcd)(struct ib_xrcd *xrcd);
+       int (*dealloc_xrcd)(struct ib_xrcd *xrcd, struct ib_udata *udata);
        struct ib_flow *(*create_flow)(struct ib_qp *qp,
                                       struct ib_flow_attr *flow_attr,
                                       int domain, struct ib_udata *udata);
@@ -2483,7 +2516,7 @@ struct ib_device_ops {
        struct ib_wq *(*create_wq)(struct ib_pd *pd,
                                   struct ib_wq_init_attr *init_attr,
                                   struct ib_udata *udata);
-       int (*destroy_wq)(struct ib_wq *wq);
+       int (*destroy_wq)(struct ib_wq *wq, struct ib_udata *udata);
        int (*modify_wq)(struct ib_wq *wq, struct ib_wq_attr *attr,
                         u32 wq_attr_mask, struct ib_udata *udata);
        struct ib_rwq_ind_table *(*create_rwq_ind_table)(
@@ -2495,7 +2528,7 @@ struct ib_device_ops {
                                  struct ib_ucontext *context,
                                  struct ib_dm_alloc_attr *attr,
                                  struct uverbs_attr_bundle *attrs);
-       int (*dealloc_dm)(struct ib_dm *dm);
+       int (*dealloc_dm)(struct ib_dm *dm, struct uverbs_attr_bundle *attrs);
        struct ib_mr *(*reg_dm_mr)(struct ib_pd *pd, struct ib_dm *dm,
                                   struct ib_dm_mr_attr *attr,
                                   struct uverbs_attr_bundle *attrs);
@@ -2550,12 +2583,37 @@ struct ib_device_ops {
         */
        void (*dealloc_driver)(struct ib_device *dev);
 
+       /* iWarp CM callbacks */
+       void (*iw_add_ref)(struct ib_qp *qp);
+       void (*iw_rem_ref)(struct ib_qp *qp);
+       struct ib_qp *(*iw_get_qp)(struct ib_device *device, int qpn);
+       int (*iw_connect)(struct iw_cm_id *cm_id,
+                         struct iw_cm_conn_param *conn_param);
+       int (*iw_accept)(struct iw_cm_id *cm_id,
+                        struct iw_cm_conn_param *conn_param);
+       int (*iw_reject)(struct iw_cm_id *cm_id, const void *pdata,
+                        u8 pdata_len);
+       int (*iw_create_listen)(struct iw_cm_id *cm_id, int backlog);
+       int (*iw_destroy_listen)(struct iw_cm_id *cm_id);
+
+       DECLARE_RDMA_OBJ_SIZE(ib_ah);
        DECLARE_RDMA_OBJ_SIZE(ib_pd);
+       DECLARE_RDMA_OBJ_SIZE(ib_srq);
        DECLARE_RDMA_OBJ_SIZE(ib_ucontext);
 };
 
-struct rdma_restrack_root;
+struct ib_core_device {
+       /* dev must be the first element in this structure for as long as
+        * the union of ib_core_device and device exists in ib_device.
+        */
+       struct device dev;
+       possible_net_t rdma_net;
+       struct kobject *ports_kobj;
+       struct list_head port_list;
+       struct ib_device *owner; /* reach back to owner ib_device */
+};
 
+struct rdma_restrack_root;
 struct ib_device {
        /* Do not access @dma_device directly from ULP nor from HW drivers. */
        struct device                *dma_device;
@@ -2578,19 +2636,18 @@ struct ib_device {
 
        int                           num_comp_vectors;
 
-       struct iw_cm_verbs           *iwcm;
-
        struct module               *owner;
-       struct device                dev;
+       union {
+               struct device           dev;
+               struct ib_core_device   coredev;
+       };
+
        /* First group for device attributes,
         * Second group for driver provided attributes (optional).
         * It is NULL terminated array.
         */
        const struct attribute_group    *groups[3];
 
-       struct kobject                  *ports_kobj;
-       struct list_head             port_list;
-
        int                          uverbs_abi_ver;
        u64                          uverbs_cmd_mask;
        u64                          uverbs_ex_cmd_mask;
@@ -2626,6 +2683,15 @@ struct ib_device {
        struct work_struct unregistration_work;
 
        const struct rdma_link_ops *link_ops;
+
+       /* Protects compat_devs xarray modifications */
+       struct mutex compat_devs_mutex;
+       /* Maintains compat devices for each net namespace */
+       struct xarray compat_devs;
+
+       /* Used by iWarp CM */
+       char iw_ifname[IFNAMSIZ];
+       u32 iw_driver_flags;
 };
 
 struct ib_client {
@@ -2662,6 +2728,21 @@ struct ib_client {
        u8 no_kverbs_req:1;
 };
 
+/*
+ * IB block DMA iterator
+ *
+ * Iterates the DMA-mapped SGL in contiguous memory blocks aligned
+ * to a HW supported page size.
+ */
+struct ib_block_iter {
+       /* internal states */
+       struct scatterlist *__sg;       /* sg holding the current aligned block */
+       dma_addr_t __dma_addr;          /* unaligned DMA address of this block */
+       unsigned int __sg_nents;        /* number of SG entries */
+       unsigned int __sg_advance;      /* number of bytes to advance in sg in next step */
+       unsigned int __pg_bit;          /* alignment of current block */
+};
+
 struct ib_device *_ib_alloc_device(size_t size);
 #define ib_alloc_device(drv_struct, member)                                    \
        container_of(_ib_alloc_device(sizeof(struct drv_struct) +              \
@@ -2682,6 +2763,38 @@ void ib_unregister_device_queued(struct ib_device *ib_dev);
 int ib_register_client   (struct ib_client *client);
 void ib_unregister_client(struct ib_client *client);
 
+void __rdma_block_iter_start(struct ib_block_iter *biter,
+                            struct scatterlist *sglist,
+                            unsigned int nents,
+                            unsigned long pgsz);
+bool __rdma_block_iter_next(struct ib_block_iter *biter);
+
+/**
+ * rdma_block_iter_dma_address - get the aligned dma address of the current
+ * block held by the block iterator.
+ * @biter: block iterator holding the memory block
+ */
+static inline dma_addr_t
+rdma_block_iter_dma_address(struct ib_block_iter *biter)
+{
+       return biter->__dma_addr & ~(BIT_ULL(biter->__pg_bit) - 1);
+}
+
+/**
+ * rdma_for_each_block - iterate over contiguous memory blocks of the sg list
+ * @sglist: sglist to iterate over
+ * @biter: block iterator holding the memory block
+ * @nents: maximum number of sg entries to iterate over
+ * @pgsz: best HW supported page size to use
+ *
+ * Callers may use rdma_block_iter_dma_address() to get each
+ * block's aligned DMA address.
+ */
+#define rdma_for_each_block(sglist, biter, nents, pgsz)                \
+       for (__rdma_block_iter_start(biter, sglist, nents,      \
+                                    pgsz);                     \
+            __rdma_block_iter_next(biter);)
+
 /**
  * ib_get_client_data - Get IB client context
  * @device:Device to get context for
@@ -2705,9 +2818,6 @@ void ib_set_device_ops(struct ib_device *device,
 #if IS_ENABLED(CONFIG_INFINIBAND_USER_ACCESS)
 int rdma_user_mmap_io(struct ib_ucontext *ucontext, struct vm_area_struct *vma,
                      unsigned long pfn, unsigned long size, pgprot_t prot);
-int rdma_user_mmap_page(struct ib_ucontext *ucontext,
-                       struct vm_area_struct *vma, struct page *page,
-                       unsigned long size);
 #else
 static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
                                    struct vm_area_struct *vma,
@@ -2716,12 +2826,6 @@ static inline int rdma_user_mmap_io(struct ib_ucontext *ucontext,
 {
        return -EINVAL;
 }
-static inline int rdma_user_mmap_page(struct ib_ucontext *ucontext,
-                               struct vm_area_struct *vma, struct page *page,
-                               unsigned long size)
-{
-       return -EINVAL;
-}
 #endif
 
 static inline int ib_copy_from_udata(void *dest, struct ib_udata *udata, size_t len)
@@ -2978,8 +3082,8 @@ static inline bool rdma_cap_ib_mad(const struct ib_device *device, u8 port_num)
  */
 static inline bool rdma_cap_opa_mad(struct ib_device *device, u8 port_num)
 {
-       return (device->port_data[port_num].immutable.core_cap_flags &
-               RDMA_CORE_CAP_OPA_MAD) == RDMA_CORE_CAP_OPA_MAD;
+       return device->port_data[port_num].immutable.core_cap_flags &
+               RDMA_CORE_CAP_OPA_MAD;
 }
 
 /**
@@ -3195,6 +3299,30 @@ static inline bool rdma_cap_read_inv(struct ib_device *dev, u32 port_num)
        return rdma_protocol_iwarp(dev, port_num);
 }
 
+/**
+ * rdma_find_pg_bit - Find page bit given address and HW supported page sizes
+ *
+ * @addr: address
+ * @pgsz_bitmap: bitmap of HW supported page sizes
+ */
+static inline unsigned int rdma_find_pg_bit(unsigned long addr,
+                                           unsigned long pgsz_bitmap)
+{
+       unsigned long align;
+       unsigned long pgsz;
+
+       align = addr & -addr;
+
+       /* Find page bit such that addr is aligned to the highest supported
+        * HW page size
+        */
+       pgsz = pgsz_bitmap & ~(-align << 1);
+       if (!pgsz)
+               return __ffs(pgsz_bitmap);
+
+       return __fls(pgsz);
+}
+
 int ib_set_vf_link_state(struct ib_device *device, int vf, u8 port,
                         int state);
 int ib_get_vf_config(struct ib_device *device, int vf, u8 port,
@@ -3236,9 +3364,27 @@ enum ib_pd_flags {
 
 struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags,
                const char *caller);
+
 #define ib_alloc_pd(device, flags) \
        __ib_alloc_pd((device), (flags), KBUILD_MODNAME)
-void ib_dealloc_pd(struct ib_pd *pd);
+
+/**
+ * ib_dealloc_pd_user - Deallocate kernel/user PD
+ * @pd: The protection domain
+ * @udata: Valid user data or NULL for kernel objects
+ */
+void ib_dealloc_pd_user(struct ib_pd *pd, struct ib_udata *udata);
+
+/**
+ * ib_dealloc_pd - Deallocate kernel PD
+ * @pd: The protection domain
+ *
+ * NOTE: for user PD use ib_dealloc_pd_user with valid udata!
+ */
+static inline void ib_dealloc_pd(struct ib_pd *pd)
+{
+       ib_dealloc_pd_user(pd, NULL);
+}
 
 enum rdma_create_ah_flags {
        /* In a sleepable context */
@@ -3351,11 +3497,24 @@ enum rdma_destroy_ah_flags {
 };
 
 /**
- * rdma_destroy_ah - Destroys an address handle.
+ * rdma_destroy_ah_user - Destroys an address handle.
  * @ah: The address handle to destroy.
  * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
+ * @udata: Valid user data or NULL for kernel objects
  */
-int rdma_destroy_ah(struct ib_ah *ah, u32 flags);
+int rdma_destroy_ah_user(struct ib_ah *ah, u32 flags, struct ib_udata *udata);
+
+/**
+ * rdma_destroy_ah - Destroys a kernel address handle.
+ * @ah: The address handle to destroy.
+ * @flags: Destroy address handle flags (see enum rdma_destroy_ah_flags).
+ *
+ * NOTE: for user ah use rdma_destroy_ah_user with valid udata!
+ */
+static inline int rdma_destroy_ah(struct ib_ah *ah, u32 flags)
+{
+       return rdma_destroy_ah_user(ah, flags, NULL);
+}
 
 /**
  * ib_create_srq - Creates a SRQ associated with the specified protection
@@ -3399,10 +3558,22 @@ int ib_query_srq(struct ib_srq *srq,
                 struct ib_srq_attr *srq_attr);
 
 /**
- * ib_destroy_srq - Destroys the specified SRQ.
+ * ib_destroy_srq_user - Destroys the specified SRQ.
  * @srq: The SRQ to destroy.
+ * @udata: Valid user data or NULL for kernel objects
  */
-int ib_destroy_srq(struct ib_srq *srq);
+int ib_destroy_srq_user(struct ib_srq *srq, struct ib_udata *udata);
+
+/**
+ * ib_destroy_srq - Destroys the specified kernel SRQ.
+ * @srq: The SRQ to destroy.
+ *
+ * NOTE: for user srq use ib_destroy_srq_user with valid udata!
+ */
+static inline int ib_destroy_srq(struct ib_srq *srq)
+{
+       return ib_destroy_srq_user(srq, NULL);
+}
 
 /**
  * ib_post_srq_recv - Posts a list of work requests to the specified SRQ.
@@ -3422,15 +3593,34 @@ static inline int ib_post_srq_recv(struct ib_srq *srq,
 }
 
 /**
- * ib_create_qp - Creates a QP associated with the specified protection
+ * ib_create_qp_user - Creates a QP associated with the specified protection
  *   domain.
  * @pd: The protection domain associated with the QP.
  * @qp_init_attr: A list of initial attributes required to create the
  *   QP.  If QP creation succeeds, then the attributes are updated to
  *   the actual capabilities of the created QP.
+ * @udata: Valid user data or NULL for kernel objects
  */
-struct ib_qp *ib_create_qp(struct ib_pd *pd,
-                          struct ib_qp_init_attr *qp_init_attr);
+struct ib_qp *ib_create_qp_user(struct ib_pd *pd,
+                               struct ib_qp_init_attr *qp_init_attr,
+                               struct ib_udata *udata);
+
+/**
+ * ib_create_qp - Creates a kernel QP associated with the specified protection
+ *   domain.
+ * @pd: The protection domain associated with the QP.
+ * @qp_init_attr: A list of initial attributes required to create the
+ *   QP.  If QP creation succeeds, then the attributes are updated to
+ *   the actual capabilities of the created QP.
+ *
+ * NOTE: for user qp use ib_create_qp_user with valid udata! This wrapper
+ * takes no udata argument and always passes NULL to ib_create_qp_user().
+ */
+static inline struct ib_qp *ib_create_qp(struct ib_pd *pd,
+                                        struct ib_qp_init_attr *qp_init_attr)
+{
+       return ib_create_qp_user(pd, qp_init_attr, NULL);
+}
 
 /**
  * ib_modify_qp_with_udata - Modifies the attributes for the specified QP.
@@ -3480,8 +3670,20 @@ int ib_query_qp(struct ib_qp *qp,
 /**
  * ib_destroy_qp - Destroys the specified QP.
  * @qp: The QP to destroy.
+ * @udata: Valid udata or NULL for kernel objects
  */
-int ib_destroy_qp(struct ib_qp *qp);
+int ib_destroy_qp_user(struct ib_qp *qp, struct ib_udata *udata);
+
+/**
+ * ib_destroy_qp - Destroys the specified kernel QP.
+ * @qp: The QP to destroy.
+ *
+ * NOTE: for user qp use ib_destroy_qp_user with valid udata!
+ */
+static inline int ib_destroy_qp(struct ib_qp *qp)
+{
+       return ib_destroy_qp_user(qp, NULL);
+}
 
 /**
  * ib_open_qp - Obtain a reference to an existing sharable QP.
@@ -3541,13 +3743,66 @@ static inline int ib_post_recv(struct ib_qp *qp,
        return qp->device->ops.post_recv(qp, recv_wr, bad_recv_wr ? : &dummy);
 }
 
-struct ib_cq *__ib_alloc_cq(struct ib_device *dev, void *private,
-                           int nr_cqe, int comp_vector,
-                           enum ib_poll_context poll_ctx, const char *caller);
-#define ib_alloc_cq(device, priv, nr_cqe, comp_vect, poll_ctx) \
-       __ib_alloc_cq((device), (priv), (nr_cqe), (comp_vect), (poll_ctx), KBUILD_MODNAME)
+struct ib_cq *__ib_alloc_cq_user(struct ib_device *dev, void *private,
+                                int nr_cqe, int comp_vector,
+                                enum ib_poll_context poll_ctx,
+                                const char *caller, struct ib_udata *udata);
+
+/**
+ * ib_alloc_cq_user: Allocate kernel/user CQ
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @comp_vector: Completion vector used for the IRQs
+ * @poll_ctx: Context used for polling the CQ
+ * @udata: Valid user data or NULL for kernel objects
+ */
+static inline struct ib_cq *ib_alloc_cq_user(struct ib_device *dev,
+                                            void *private, int nr_cqe,
+                                            int comp_vector,
+                                            enum ib_poll_context poll_ctx,
+                                            struct ib_udata *udata)
+{
+       return __ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+                                 KBUILD_MODNAME, udata);
+}
+
+/**
+ * ib_alloc_cq: Allocate kernel CQ
+ * @dev: The IB device
+ * @private: Private data attached to the CQE
+ * @nr_cqe: Number of CQEs in the CQ
+ * @comp_vector: Completion vector used for the IRQs
+ * @poll_ctx: Context used for polling the CQ
+ *
+ * NOTE: for user cq use ib_alloc_cq_user with valid udata!
+ */
+static inline struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private,
+                                       int nr_cqe, int comp_vector,
+                                       enum ib_poll_context poll_ctx)
+{
+       return ib_alloc_cq_user(dev, private, nr_cqe, comp_vector, poll_ctx,
+                               NULL);
+}
+
+/**
+ * ib_free_cq_user - Free kernel/user CQ
+ * @cq: The CQ to free
+ * @udata: Valid user data or NULL for kernel objects
+ */
+void ib_free_cq_user(struct ib_cq *cq, struct ib_udata *udata);
+
+/**
+ * ib_free_cq - Free kernel CQ
+ * @cq: The CQ to free
+ *
+ * NOTE: for user cq use ib_free_cq_user with valid udata!
+ */
+static inline void ib_free_cq(struct ib_cq *cq)
+{
+       ib_free_cq_user(cq, NULL);
+}
 
-void ib_free_cq(struct ib_cq *cq);
 int ib_process_cq_direct(struct ib_cq *cq, int budget);
 
 /**
@@ -3591,10 +3846,22 @@ int ib_resize_cq(struct ib_cq *cq, int cqe);
 int rdma_set_cq_moderation(struct ib_cq *cq, u16 cq_count, u16 cq_period);
 
 /**
- * ib_destroy_cq - Destroys the specified CQ.
+ * ib_destroy_cq_user - Destroys the specified CQ.
  * @cq: The CQ to destroy.
+ * @udata: Valid user data or NULL for kernel objects
  */
-int ib_destroy_cq(struct ib_cq *cq);
+int ib_destroy_cq_user(struct ib_cq *cq, struct ib_udata *udata);
+
+/**
+ * ib_destroy_cq - Destroys the specified kernel CQ.
+ * @cq: The CQ to destroy.
+ *
+ * NOTE: for user cq use ib_destroy_cq_user with valid udata!
+ */
+static inline int ib_destroy_cq(struct ib_cq *cq)
+{
+       return ib_destroy_cq_user(cq, NULL);
+}
 
 /**
  * ib_poll_cq - poll a CQ for completion(s)
@@ -3848,17 +4115,37 @@ static inline void ib_dma_free_coherent(struct ib_device *dev,
 }
 
 /**
- * ib_dereg_mr - Deregisters a memory region and removes it from the
+ * ib_dereg_mr_user - Deregisters a memory region and removes it from the
+ *   HCA translation table.
+ * @mr: The memory region to deregister.
+ * @udata: Valid user data or NULL for kernel object
+ *
+ * This function can fail, if the memory region has memory windows bound to it.
+ */
+int ib_dereg_mr_user(struct ib_mr *mr, struct ib_udata *udata);
+
+/**
+ * ib_dereg_mr - Deregisters a kernel memory region and removes it from the
  *   HCA translation table.
  * @mr: The memory region to deregister.
  *
  * This function can fail, if the memory region has memory windows bound to it.
+ *
+ * NOTE: for user mr use ib_dereg_mr_user with valid udata!
  */
-int ib_dereg_mr(struct ib_mr *mr);
+static inline int ib_dereg_mr(struct ib_mr *mr)
+{
+       return ib_dereg_mr_user(mr, NULL);
+}
+
+struct ib_mr *ib_alloc_mr_user(struct ib_pd *pd, enum ib_mr_type mr_type,
+                              u32 max_num_sg, struct ib_udata *udata);
 
-struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
-                         enum ib_mr_type mr_type,
-                         u32 max_num_sg);
+static inline struct ib_mr *ib_alloc_mr(struct ib_pd *pd,
+                                       enum ib_mr_type mr_type, u32 max_num_sg)
+{
+       return ib_alloc_mr_user(pd, mr_type, max_num_sg, NULL);
+}
 
 /**
  * ib_update_fast_reg_key - updates the key portion of the fast_reg MR
@@ -3956,8 +4243,9 @@ struct ib_xrcd *__ib_alloc_xrcd(struct ib_device *device, const char *caller);
 /**
  * ib_dealloc_xrcd - Deallocates an XRC domain.
  * @xrcd: The XRC domain to deallocate.
+ * @udata: Valid user data or NULL for kernel object
  */
-int ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd, struct ib_udata *udata);
 
 static inline int ib_check_mr_access(int flags)
 {
@@ -4033,7 +4321,7 @@ struct net_device *ib_device_netdev(struct ib_device *dev, u8 port);
 
 struct ib_wq *ib_create_wq(struct ib_pd *pd,
                           struct ib_wq_init_attr *init_attr);
-int ib_destroy_wq(struct ib_wq *wq);
+int ib_destroy_wq(struct ib_wq *wq, struct ib_udata *udata);
 int ib_modify_wq(struct ib_wq *wq, struct ib_wq_attr *attr,
                 u32 wq_attr_mask);
 struct ib_rwq_ind_table *ib_create_rwq_ind_table(struct ib_device *device,
@@ -4349,7 +4637,10 @@ rdma_set_device_sysfs_group(struct ib_device *dev,
  */
 static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
 {
-       return container_of(device, struct ib_device, dev);
+       struct ib_core_device *coredev =
+               container_of(device, struct ib_core_device, dev);
+
+       return coredev->owner;
 }
 
 /**
@@ -4362,4 +4653,7 @@ static inline struct ib_device *rdma_device_to_ibdev(struct device *device)
  */
 #define rdma_device_to_drv_device(dev, drv_dev_struct, ibdev_member)           \
        container_of(rdma_device_to_ibdev(dev), drv_dev_struct, ibdev_member)
+
+bool rdma_dev_access_netns(const struct ib_device *device,
+                          const struct net *net);
 #endif /* IB_VERBS_H */
index 0e1f028..5aa8a9c 100644 (file)
@@ -118,31 +118,6 @@ enum iw_flags {
        IW_F_NO_PORT_MAP = (1 << 0),
 };
 
-struct iw_cm_verbs {
-       void            (*add_ref)(struct ib_qp *qp);
-
-       void            (*rem_ref)(struct ib_qp *qp);
-
-       struct ib_qp *  (*get_qp)(struct ib_device *device,
-                                 int qpn);
-
-       int             (*connect)(struct iw_cm_id *cm_id,
-                                  struct iw_cm_conn_param *conn_param);
-
-       int             (*accept)(struct iw_cm_id *cm_id,
-                                 struct iw_cm_conn_param *conn_param);
-
-       int             (*reject)(struct iw_cm_id *cm_id,
-                                 const void *pdata, u8 pdata_len);
-
-       int             (*create_listen)(struct iw_cm_id *cm_id,
-                                        int backlog);
-
-       int             (*destroy_listen)(struct iw_cm_id *cm_id);
-       char            ifname[IFNAMSIZ];
-       enum iw_flags   driver_flags;
-};
-
 /**
  * iw_create_cm_id - Create an IW CM identifier.
  *
index b4f0ac0..7147a92 100644 (file)
@@ -413,6 +413,6 @@ struct opa_port_info {
        u8     local_port_num;
        u8     reserved12;
        u8     reserved13;                       /* was guid_cap */
-} __attribute__ ((packed));
+} __packed;
 
 #endif /* OPA_PORT_INFO_H */
index f789611..c7b2ef1 100644 (file)
@@ -98,7 +98,7 @@ struct opa_smp {
 
 struct opa_node_description {
        u8 data[64];
-} __attribute__ ((packed));
+} __packed;
 
 struct opa_node_info {
        u8      base_version;
@@ -114,7 +114,7 @@ struct opa_node_info {
        __be32  revision;
        u8      local_port_num;
        u8      vendor_id[3];   /* network byte order */
-} __attribute__ ((packed));
+} __packed;
 
 #define OPA_PARTITION_TABLE_BLK_SIZE 32
 
index 4c257af..b9cd06d 100644 (file)
@@ -59,7 +59,6 @@
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_mad.h>
 #include <rdma/rdmavt_mr.h>
-#include <rdma/rdmavt_qp.h>
 
 #define RVT_MAX_PKEY_VALUES 16
 
@@ -72,6 +71,8 @@ struct trap_list {
        struct list_head list;
 };
 
+struct rvt_qp;
+struct rvt_qpn_table;
 struct rvt_ibport {
        struct rvt_qp __rcu *qp[2];
        struct ib_mad_agent *send_agent;        /* agent for SMI (traps) */
@@ -206,6 +207,20 @@ struct rvt_ah {
        u8 log_pmtu;
 };
 
+/*
+ * This structure is used by rvt_mmap() to validate an offset
+ * when an mmap() request is made.  The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct rvt_mmap_info {
+       struct list_head pending_mmaps;
+       struct ib_ucontext *context;
+       void *obj;
+       __u64 offset;
+       struct kref ref;
+       u32 size;
+};
+
 /* memory working set size */
 struct rvt_wss {
        unsigned long *entries;
@@ -501,16 +516,6 @@ static inline struct rvt_dev_info *ib_to_rvt(struct ib_device *ibdev)
        return  container_of(ibdev, struct rvt_dev_info, ibdev);
 }
 
-static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq)
-{
-       return container_of(ibsrq, struct rvt_srq, ibsrq);
-}
-
-static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp)
-{
-       return container_of(ibqp, struct rvt_qp, ibqp);
-}
-
 static inline unsigned rvt_get_npkeys(struct rvt_dev_info *rdi)
 {
        /*
@@ -548,57 +553,6 @@ static inline u16 rvt_get_pkey(struct rvt_dev_info *rdi,
                return rdi->ports[port_index]->pkey_table[index];
 }
 
-/**
- * rvt_lookup_qpn - return the QP with the given QPN
- * @ibp: the ibport
- * @qpn: the QP number to look up
- *
- * The caller must hold the rcu_read_lock(), and keep the lock until
- * the returned qp is no longer in use.
- */
-/* TODO: Remove this and put in rdmavt/qp.h when no longer needed by drivers */
-static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
-                                           struct rvt_ibport *rvp,
-                                           u32 qpn) __must_hold(RCU)
-{
-       struct rvt_qp *qp = NULL;
-
-       if (unlikely(qpn <= 1)) {
-               qp = rcu_dereference(rvp->qp[qpn]);
-       } else {
-               u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits);
-
-               for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp;
-                       qp = rcu_dereference(qp->next))
-                       if (qp->ibqp.qp_num == qpn)
-                               break;
-       }
-       return qp;
-}
-
-/**
- * rvt_mod_retry_timer - mod a retry timer
- * @qp - the QP
- * @shift - timeout shift to wait for multiple packets
- * Modify a potentially already running retry timer
- */
-static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
-{
-       struct ib_qp *ibqp = &qp->ibqp;
-       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
-
-       lockdep_assert_held(&qp->s_lock);
-       qp->s_flags |= RVT_S_TIMER;
-       /* 4.096 usec. * (1 << qp->timeout) */
-       mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
-                 (qp->timeout_jiffies << shift));
-}
-
-static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
-{
-       return rvt_mod_retry_timer_ext(qp, 0);
-}
-
 struct rvt_dev_info *rvt_alloc_device(size_t size, int nports);
 void rvt_dealloc_device(struct rvt_dev_info *rdi);
 int rvt_register_device(struct rvt_dev_info *rvd, u32 driver_id);
index f0fbd40..68e38c2 100644 (file)
@@ -83,7 +83,6 @@
  * RVT_S_WAIT_DMA - waiting for send DMA queue to drain before generating
  *                  next send completion entry not via send DMA
  * RVT_S_WAIT_PIO - waiting for a send buffer to be available
- * RVT_S_WAIT_PIO_DRAIN - waiting for a qp to drain pio packets
  * RVT_S_WAIT_TX - waiting for a struct verbs_txreq to be available
  * RVT_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
  * RVT_S_WAIT_KMEM - waiting for kernel memory to be available
@@ -211,20 +210,6 @@ struct rvt_rq {
        spinlock_t lock ____cacheline_aligned_in_smp;
 };
 
-/*
- * This structure is used by rvt_mmap() to validate an offset
- * when an mmap() request is made.  The vm_area_struct then uses
- * this as its vm_private_data.
- */
-struct rvt_mmap_info {
-       struct list_head pending_mmaps;
-       struct ib_ucontext *context;
-       void *obj;
-       __u64 offset;
-       struct kref ref;
-       unsigned size;
-};
-
 /*
  * This structure holds the information that the send tasklet needs
  * to send a RDMA read response or atomic operation.
@@ -399,6 +384,16 @@ struct rvt_srq {
        u32 limit;
 };
 
+static inline struct rvt_srq *ibsrq_to_rvtsrq(struct ib_srq *ibsrq)
+{
+       return container_of(ibsrq, struct rvt_srq, ibsrq);
+}
+
+static inline struct rvt_qp *ibqp_to_rvtqp(struct ib_qp *ibqp)
+{
+       return container_of(ibqp, struct rvt_qp, ibqp);
+}
+
 #define RVT_QPN_MAX                 BIT(24)
 #define RVT_QPNMAP_ENTRIES          (RVT_QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
 #define RVT_BITS_PER_PAGE           (PAGE_SIZE * BITS_PER_BYTE)
@@ -678,6 +673,70 @@ static inline unsigned long rvt_timeout_to_jiffies(u8 timeout)
        return usecs_to_jiffies(1U << timeout) * 4096UL / 1000UL;
 }
 
+/**
+ * rvt_lookup_qpn - return the QP with the given QPN
+ * @rdi: the rvt device info
+ * @rvp: the ibport
+ * @qpn: the QP number to look up
+ *
+ * Caller must hold rcu_read_lock() until the returned qp is no longer in use.
+ */
+static inline struct rvt_qp *rvt_lookup_qpn(struct rvt_dev_info *rdi,
+                                           struct rvt_ibport *rvp,
+                                           u32 qpn) __must_hold(RCU)
+{
+       struct rvt_qp *qp = NULL;
+
+       if (unlikely(qpn <= 1)) {
+               qp = rcu_dereference(rvp->qp[qpn]);
+       } else {
+               u32 n = hash_32(qpn, rdi->qp_dev->qp_table_bits);
+
+               for (qp = rcu_dereference(rdi->qp_dev->qp_table[n]); qp;
+                       qp = rcu_dereference(qp->next))
+                       if (qp->ibqp.qp_num == qpn)
+                               break;
+       }
+       return qp;
+}
+
+/**
+ * rvt_mod_retry_timer_ext - mod a retry timer
+ * @qp: the QP
+ * @shift: timeout shift to wait for multiple packets
+ * Modify a potentially already running retry timer
+ */
+static inline void rvt_mod_retry_timer_ext(struct rvt_qp *qp, u8 shift)
+{
+       struct ib_qp *ibqp = &qp->ibqp;
+       struct rvt_dev_info *rdi = ib_to_rvt(ibqp->device);
+
+       lockdep_assert_held(&qp->s_lock);
+       qp->s_flags |= RVT_S_TIMER;
+       /* 4.096 usec. * (1 << qp->timeout) */
+       mod_timer(&qp->s_timer, jiffies + rdi->busy_jiffies +
+                 (qp->timeout_jiffies << shift));
+}
+
+static inline void rvt_mod_retry_timer(struct rvt_qp *qp)
+{
+       return rvt_mod_retry_timer_ext(qp, 0);
+}
+
+/**
+ * rvt_put_qp_swqe - drop refs held by swqe
+ * @qp: the send qp
+ * @wqe: the send wqe
+ *
+ * This drops any references held by the swqe
+ */
+static inline void rvt_put_qp_swqe(struct rvt_qp *qp, struct rvt_swqe *wqe)
+{
+       rvt_put_swqe(wqe);
+       if (qp->allowed_ops == IB_OPCODE_UD)
+               atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
+}
+
 extern const int  ib_rvt_state_ops[];
 
 struct rvt_dev_info;
index 794c475..05eabfd 100644 (file)
 #define uobj_get_type(_attrs, _object)                                         \
        uapi_get_object((_attrs)->ufile->device->uapi, _object)
 
-struct ib_uobject *_uobj_get_read(enum uverbs_default_objects type,
-                                 u32 object_id,
-                                 struct uverbs_attr_bundle *attrs);
-
 #define uobj_get_read(_type, _id, _attrs)                                      \
-       _uobj_get_read(_type, _uobj_check_id(_id), _attrs)
+       rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
+                               _uobj_check_id(_id), UVERBS_LOOKUP_READ,       \
+                               _attrs)
 
 #define ufd_get_read(_type, _fdnum, _attrs)                                    \
        rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
                                (_fdnum)*typecheck(s32, _fdnum),               \
-                               UVERBS_LOOKUP_READ)
+                               UVERBS_LOOKUP_READ, _attrs)
 
 static inline void *_uobj_get_obj_read(struct ib_uobject *uobj)
 {
@@ -70,22 +68,19 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj)
        ((struct ib_##_object *)_uobj_get_obj_read(                            \
                uobj_get_read(_type, _id, _attrs)))
 
-struct ib_uobject *_uobj_get_write(enum uverbs_default_objects type,
-                                  u32 object_id,
-                                  struct uverbs_attr_bundle *attrs);
-
 #define uobj_get_write(_type, _id, _attrs)                                     \
-       _uobj_get_write(_type, _uobj_check_id(_id), _attrs)
+       rdma_lookup_get_uobject(uobj_get_type(_attrs, _type), (_attrs)->ufile, \
+                               _uobj_check_id(_id), UVERBS_LOOKUP_WRITE,      \
+                               _attrs)
 
 int __uobj_perform_destroy(const struct uverbs_api_object *obj, u32 id,
-                          const struct uverbs_attr_bundle *attrs);
+                          struct uverbs_attr_bundle *attrs);
 #define uobj_perform_destroy(_type, _id, _attrs)                               \
        __uobj_perform_destroy(uobj_get_type(_attrs, _type),                   \
                               _uobj_check_id(_id), _attrs)
 
 struct ib_uobject *__uobj_get_destroy(const struct uverbs_api_object *obj,
-                                     u32 id,
-                                     const struct uverbs_attr_bundle *attrs);
+                                     u32 id, struct uverbs_attr_bundle *attrs);
 
 #define uobj_get_destroy(_type, _id, _attrs)                                   \
        __uobj_get_destroy(uobj_get_type(_attrs, _type), _uobj_check_id(_id),  \
@@ -109,30 +104,31 @@ static inline void uobj_put_write(struct ib_uobject *uobj)
        rdma_lookup_put_uobject(uobj, UVERBS_LOOKUP_WRITE);
 }
 
-static inline int __must_check uobj_alloc_commit(struct ib_uobject *uobj)
+static inline int __must_check
+uobj_alloc_commit(struct ib_uobject *uobj, struct uverbs_attr_bundle *attrs)
 {
-       int ret = rdma_alloc_commit_uobject(uobj);
+       int ret = rdma_alloc_commit_uobject(uobj, attrs);
 
        if (ret)
                return ret;
        return 0;
 }
 
-static inline void uobj_alloc_abort(struct ib_uobject *uobj)
+static inline void uobj_alloc_abort(struct ib_uobject *uobj,
+                                   struct uverbs_attr_bundle *attrs)
 {
-       rdma_alloc_abort_uobject(uobj);
+       rdma_alloc_abort_uobject(uobj, attrs);
 }
 
 static inline struct ib_uobject *
 __uobj_alloc(const struct uverbs_api_object *obj,
             struct uverbs_attr_bundle *attrs, struct ib_device **ib_dev)
 {
-       struct ib_uobject *uobj = rdma_alloc_begin_uobject(obj, attrs->ufile);
+       struct ib_uobject *uobj =
+               rdma_alloc_begin_uobject(obj, attrs->ufile, attrs);
 
-       if (!IS_ERR(uobj)) {
-               *ib_dev = uobj->context->device;
-               attrs->context = uobj->context;
-       }
+       if (!IS_ERR(uobj))
+               *ib_dev = attrs->context->device;
        return uobj;
 }
 
index 175d761..d57a5ba 100644 (file)
@@ -95,7 +95,8 @@ struct uverbs_obj_type_class {
        void (*lookup_put)(struct ib_uobject *uobj, enum rdma_lookup_mode mode);
        /* This does not consume the kref on uobj */
        int __must_check (*destroy_hw)(struct ib_uobject *uobj,
-                                      enum rdma_remove_reason why);
+                                      enum rdma_remove_reason why,
+                                      struct uverbs_attr_bundle *attrs);
        void (*remove_handle)(struct ib_uobject *uobj);
        u8    needs_kfree_rcu;
 };
@@ -126,18 +127,23 @@ struct uverbs_obj_idr_type {
         * completely unchanged.
         */
        int __must_check (*destroy_object)(struct ib_uobject *uobj,
-                                          enum rdma_remove_reason why);
+                                          enum rdma_remove_reason why,
+                                          struct uverbs_attr_bundle *attrs);
 };
 
 struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_api_object *obj,
                                           struct ib_uverbs_file *ufile, s64 id,
-                                          enum rdma_lookup_mode mode);
+                                          enum rdma_lookup_mode mode,
+                                          struct uverbs_attr_bundle *attrs);
 void rdma_lookup_put_uobject(struct ib_uobject *uobj,
                             enum rdma_lookup_mode mode);
 struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_api_object *obj,
-                                           struct ib_uverbs_file *ufile);
-void rdma_alloc_abort_uobject(struct ib_uobject *uobj);
-int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj);
+                                           struct ib_uverbs_file *ufile,
+                                           struct uverbs_attr_bundle *attrs);
+void rdma_alloc_abort_uobject(struct ib_uobject *uobj,
+                             struct uverbs_attr_bundle *attrs);
+int __must_check rdma_alloc_commit_uobject(struct ib_uobject *uobj,
+                                          struct uverbs_attr_bundle *attrs);
 
 struct uverbs_obj_fd_type {
        /*
diff --git a/include/trace/events/ib_mad.h b/include/trace/events/ib_mad.h
new file mode 100644 (file)
index 0000000..59363a0
--- /dev/null
@@ -0,0 +1,390 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+
+/*
+ * Copyright (c) 2018 Intel Corporation.  All rights reserved.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ib_mad
+
+#if !defined(_TRACE_IB_MAD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IB_MAD_H
+
+#include <linux/tracepoint.h>
+#include <rdma/ib_mad.h>
+
+#ifdef CONFIG_TRACEPOINTS
+struct trace_event_raw_ib_mad_send_template;
+static void create_mad_addr_info(struct ib_mad_send_wr_private *mad_send_wr,
+                         struct ib_mad_qp_info *qp_info,
+                         struct trace_event_raw_ib_mad_send_template *entry);
+#endif
+
+DECLARE_EVENT_CLASS(ib_mad_send_template,
+       TP_PROTO(struct ib_mad_send_wr_private *wr,
+                struct ib_mad_qp_info *qp_info),
+       TP_ARGS(wr, qp_info),
+
+       TP_STRUCT__entry(
+               __field(u8,             base_version)
+               __field(u8,             mgmt_class)
+               __field(u8,             class_version)
+               __field(u8,             port_num)
+               __field(u32,            qp_num)
+               __field(u8,             method)
+               __field(u8,             sl)
+               __field(u16,            attr_id)
+               __field(u32,            attr_mod)
+               __field(u64,            wrtid)
+               __field(u64,            tid)
+               __field(u16,            status)
+               __field(u16,            class_specific)
+               __field(u32,            length)
+               __field(u32,            dlid)
+               __field(u32,            rqpn)
+               __field(u32,            rqkey)
+               __field(u32,            dev_index)
+               __field(void *,         agent_priv)
+               __field(unsigned long,  timeout)
+               __field(int,            retries_left)
+               __field(int,            max_retries)
+               __field(int,            retry)
+               __field(u16,            pkey)
+       ),
+
+       TP_fast_assign(
+               __entry->dev_index = wr->mad_agent_priv->agent.device->index;
+               __entry->port_num = wr->mad_agent_priv->agent.port_num;
+               __entry->qp_num = wr->mad_agent_priv->qp_info->qp->qp_num;
+               __entry->agent_priv = wr->mad_agent_priv;
+               __entry->wrtid = wr->tid;
+               __entry->max_retries = wr->max_retries;
+               __entry->retries_left = wr->retries_left;
+               __entry->retry = wr->retry;
+               __entry->timeout = wr->timeout;
+               __entry->length = wr->send_buf.hdr_len +
+                                 wr->send_buf.data_len;
+               __entry->base_version =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->base_version;
+               __entry->mgmt_class =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->mgmt_class;
+               __entry->class_version =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->class_version;
+               __entry->method =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->method;
+               __entry->status =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->status;
+               __entry->class_specific =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->class_specific;
+               __entry->tid = ((struct ib_mad_hdr *)wr->send_buf.mad)->tid;
+               __entry->attr_id =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->attr_id;
+               __entry->attr_mod =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->attr_mod;
+               create_mad_addr_info(wr, qp_info, __entry);
+       ),
+
+       TP_printk("%d:%d QP%d agent %p: " \
+                 "wrtid 0x%llx; %d/%d retries(%d); timeout %lu length %d : " \
+                 "hdr : base_ver 0x%x class 0x%x class_ver 0x%x " \
+                 "method 0x%x status 0x%x class_specific 0x%x tid 0x%llx " \
+                 "attr_id 0x%x attr_mod 0x%x  => dlid 0x%08x sl %d "\
+                 "pkey 0x%x rqpn 0x%x rqkey 0x%x",
+               __entry->dev_index, __entry->port_num, __entry->qp_num,
+               __entry->agent_priv, be64_to_cpu(__entry->wrtid),
+               __entry->retries_left, __entry->max_retries,
+               __entry->retry, __entry->timeout, __entry->length,
+               __entry->base_version, __entry->mgmt_class,
+               __entry->class_version,
+               __entry->method, be16_to_cpu(__entry->status),
+               be16_to_cpu(__entry->class_specific),
+               be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id),
+               be32_to_cpu(__entry->attr_mod),
+               be32_to_cpu(__entry->dlid), __entry->sl, __entry->pkey,
+               __entry->rqpn, __entry->rqkey
+       )
+);
+
+DEFINE_EVENT(ib_mad_send_template, ib_mad_error_handler,
+       TP_PROTO(struct ib_mad_send_wr_private *wr,
+                struct ib_mad_qp_info *qp_info),
+       TP_ARGS(wr, qp_info));
+DEFINE_EVENT(ib_mad_send_template, ib_mad_ib_send_mad,
+       TP_PROTO(struct ib_mad_send_wr_private *wr,
+                struct ib_mad_qp_info *qp_info),
+       TP_ARGS(wr, qp_info));
+DEFINE_EVENT(ib_mad_send_template, ib_mad_send_done_resend,
+       TP_PROTO(struct ib_mad_send_wr_private *wr,
+                struct ib_mad_qp_info *qp_info),
+       TP_ARGS(wr, qp_info));
+
+TRACE_EVENT(ib_mad_send_done_handler,
+       TP_PROTO(struct ib_mad_send_wr_private *wr, struct ib_wc *wc),
+       TP_ARGS(wr, wc),
+
+       TP_STRUCT__entry(
+               __field(u8,             port_num)
+               __field(u8,             base_version)
+               __field(u8,             mgmt_class)
+               __field(u8,             class_version)
+               __field(u32,            qp_num)
+               __field(u64,            wrtid)
+               __field(u16,            status)
+               __field(u16,            wc_status)
+               __field(u32,            length)
+               __field(void *,         agent_priv)
+               __field(unsigned long,  timeout)
+               __field(u32,            dev_index)
+               __field(int,            retries_left)
+               __field(int,            max_retries)
+               __field(int,            retry)
+               __field(u8,             method)
+       ),
+
+       TP_fast_assign(
+               __entry->dev_index = wr->mad_agent_priv->agent.device->index;
+               __entry->port_num = wr->mad_agent_priv->agent.port_num;
+               __entry->qp_num = wr->mad_agent_priv->qp_info->qp->qp_num;
+               __entry->agent_priv = wr->mad_agent_priv;
+               __entry->wrtid = wr->tid;
+               __entry->max_retries = wr->max_retries;
+               __entry->retries_left = wr->retries_left;
+               __entry->retry = wr->retry;
+               __entry->timeout = wr->timeout;
+               __entry->base_version =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->base_version;
+               __entry->mgmt_class =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->mgmt_class;
+               __entry->class_version =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->class_version;
+               __entry->method =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->method;
+               __entry->status =
+                       ((struct ib_mad_hdr *)wr->send_buf.mad)->status;
+               __entry->wc_status = wc->status;
+               __entry->length = wc->byte_len;
+       ),
+
+       TP_printk("%d:%d QP%d : SEND WC Status %d : agent %p: " \
+                 "wrtid 0x%llx %d/%d retries(%d) timeout %lu length %d: " \
+                 "hdr : base_ver 0x%x class 0x%x class_ver 0x%x " \
+                 "method 0x%x status 0x%x",
+               __entry->dev_index, __entry->port_num, __entry->qp_num,
+               __entry->wc_status,
+               __entry->agent_priv, be64_to_cpu(__entry->wrtid),
+               __entry->retries_left, __entry->max_retries,
+               __entry->retry, __entry->timeout,
+               __entry->length,
+               __entry->base_version, __entry->mgmt_class,
+               __entry->class_version, __entry->method,
+               be16_to_cpu(__entry->status)
+       )
+);
+
+TRACE_EVENT(ib_mad_recv_done_handler,
+       TP_PROTO(struct ib_mad_qp_info *qp_info, struct ib_wc *wc,
+                struct ib_mad_hdr *mad_hdr),
+       TP_ARGS(qp_info, wc, mad_hdr),
+
+       TP_STRUCT__entry(
+               __field(u8,             base_version)
+               __field(u8,             mgmt_class)
+               __field(u8,             class_version)
+               __field(u8,             port_num)
+               __field(u32,            qp_num)
+               __field(u16,            status)
+               __field(u16,            class_specific)
+               __field(u32,            length)
+               __field(u64,            tid)
+               __field(u8,             method)
+               __field(u8,             sl)
+               __field(u16,            attr_id)
+               __field(u32,            attr_mod)
+               __field(u32,            src_qp) /* QPNs are 24 bits wide */
+               __field(u16,            wc_status)
+               __field(u32,            slid)
+               __field(u32,            dev_index)
+               __field(u16,            pkey)
+       ),
+
+       TP_fast_assign(
+               __entry->dev_index = qp_info->port_priv->device->index;
+               __entry->port_num = qp_info->port_priv->port_num;
+               __entry->qp_num = qp_info->qp->qp_num;
+               __entry->length = wc->byte_len;
+               __entry->base_version = mad_hdr->base_version;
+               __entry->mgmt_class = mad_hdr->mgmt_class;
+               __entry->class_version = mad_hdr->class_version;
+               __entry->method = mad_hdr->method;
+               __entry->status = mad_hdr->status;
+               __entry->class_specific = mad_hdr->class_specific;
+               __entry->tid = mad_hdr->tid;
+               __entry->attr_id = mad_hdr->attr_id;
+               __entry->attr_mod = mad_hdr->attr_mod;
+               __entry->slid = wc->slid;
+               __entry->src_qp = wc->src_qp;
+               __entry->sl = wc->sl;
+               ib_query_pkey(qp_info->port_priv->device,
+                             qp_info->port_priv->port_num,
+                             wc->pkey_index, &__entry->pkey);
+               __entry->wc_status = wc->status;
+       ),
+
+       TP_printk("%d:%d QP%d : RECV WC Status %d : length %d : hdr : " \
+                 "base_ver 0x%02x class 0x%02x class_ver 0x%02x " \
+                 "method 0x%02x status 0x%04x class_specific 0x%04x " \
+                 "tid 0x%016llx attr_id 0x%04x attr_mod 0x%08x " \
+                 "slid 0x%08x src QP%d, sl %d pkey 0x%04x",
+               __entry->dev_index, __entry->port_num, __entry->qp_num,
+               __entry->wc_status,
+               __entry->length,
+               __entry->base_version, __entry->mgmt_class,
+               __entry->class_version, __entry->method,
+               be16_to_cpu(__entry->status),
+               be16_to_cpu(__entry->class_specific),
+               be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id),
+               be32_to_cpu(__entry->attr_mod),
+               __entry->slid, __entry->src_qp, __entry->sl, __entry->pkey
+       )
+);
+
+DECLARE_EVENT_CLASS(ib_mad_agent_template,
+       TP_PROTO(struct ib_mad_agent_private *agent),
+       TP_ARGS(agent),
+
+       TP_STRUCT__entry(
+               __field(u32,            dev_index)
+               __field(u32,            hi_tid)
+               __field(u8,             port_num)
+               __field(u8,             mgmt_class)
+               __field(u8,             mgmt_class_version)
+       ),
+
+       TP_fast_assign(
+               __entry->dev_index = agent->agent.device->index;
+               __entry->port_num = agent->agent.port_num;
+               __entry->hi_tid = agent->agent.hi_tid;
+
+               if (agent->reg_req) {
+                       __entry->mgmt_class = agent->reg_req->mgmt_class;
+                       __entry->mgmt_class_version =
+                               agent->reg_req->mgmt_class_version;
+               } else {
+                       __entry->mgmt_class = 0;
+                       __entry->mgmt_class_version = 0;
+               }
+       ),
+
+       TP_printk("%d:%d mad agent : hi_tid 0x%08x class 0x%02x class_ver 0x%02x",
+               __entry->dev_index, __entry->port_num,
+               __entry->hi_tid, __entry->mgmt_class,
+               __entry->mgmt_class_version
+       )
+);
+DEFINE_EVENT(ib_mad_agent_template, ib_mad_recv_done_agent,
+       TP_PROTO(struct ib_mad_agent_private *agent),
+       TP_ARGS(agent));
+DEFINE_EVENT(ib_mad_agent_template, ib_mad_send_done_agent,
+       TP_PROTO(struct ib_mad_agent_private *agent),
+       TP_ARGS(agent));
+DEFINE_EVENT(ib_mad_agent_template, ib_mad_create_agent,
+       TP_PROTO(struct ib_mad_agent_private *agent),
+       TP_ARGS(agent));
+DEFINE_EVENT(ib_mad_agent_template, ib_mad_unregister_agent,
+       TP_PROTO(struct ib_mad_agent_private *agent),
+       TP_ARGS(agent));
+
+
+
+DECLARE_EVENT_CLASS(ib_mad_opa_smi_template,
+       TP_PROTO(struct opa_smp *smp),
+       TP_ARGS(smp),
+
+       TP_STRUCT__entry(
+               __field(u64,            mkey)
+               __field(u32,            dr_slid)
+               __field(u32,            dr_dlid)
+               __field(u8,             hop_ptr)
+               __field(u8,             hop_cnt)
+               __array(u8,             initial_path, OPA_SMP_MAX_PATH_HOPS)
+               __array(u8,             return_path, OPA_SMP_MAX_PATH_HOPS)
+       ),
+
+       TP_fast_assign(
+               __entry->hop_ptr = smp->hop_ptr;
+               __entry->hop_cnt = smp->hop_cnt;
+               __entry->mkey = smp->mkey;
+               __entry->dr_slid = smp->route.dr.dr_slid;
+               __entry->dr_dlid = smp->route.dr.dr_dlid;
+               memcpy(__entry->initial_path, smp->route.dr.initial_path,
+                       OPA_SMP_MAX_PATH_HOPS);
+               memcpy(__entry->return_path, smp->route.dr.return_path,
+                       OPA_SMP_MAX_PATH_HOPS);
+       ),
+
+       TP_printk("OPA SMP: hop_ptr %d hop_cnt %d " \
+                 "mkey 0x%016llx dr_slid 0x%08x dr_dlid 0x%08x " \
+                 "initial_path %*ph return_path %*ph ",
+               __entry->hop_ptr, __entry->hop_cnt,
+               be64_to_cpu(__entry->mkey), be32_to_cpu(__entry->dr_slid),
+               be32_to_cpu(__entry->dr_dlid),
+               OPA_SMP_MAX_PATH_HOPS, __entry->initial_path,
+               OPA_SMP_MAX_PATH_HOPS, __entry->return_path
+       )
+);
+
+DEFINE_EVENT(ib_mad_opa_smi_template, ib_mad_handle_opa_smi,
+       TP_PROTO(struct opa_smp *smp),
+       TP_ARGS(smp));
+DEFINE_EVENT(ib_mad_opa_smi_template, ib_mad_handle_out_opa_smi,
+       TP_PROTO(struct opa_smp *smp),
+       TP_ARGS(smp));
+
+
+DECLARE_EVENT_CLASS(ib_mad_opa_ib_template,
+       TP_PROTO(struct ib_smp *smp),
+       TP_ARGS(smp),
+
+       TP_STRUCT__entry(
+               __field(u64,            mkey)
+               __field(u32,            dr_slid)
+               __field(u32,            dr_dlid)
+               __field(u8,             hop_ptr)
+               __field(u8,             hop_cnt)
+               __array(u8,             initial_path, IB_SMP_MAX_PATH_HOPS)
+               __array(u8,             return_path, IB_SMP_MAX_PATH_HOPS)
+       ),
+
+       TP_fast_assign(
+               __entry->hop_ptr = smp->hop_ptr;
+               __entry->hop_cnt = smp->hop_cnt;
+               __entry->mkey = smp->mkey;
+               __entry->dr_slid = smp->dr_slid;
+               __entry->dr_dlid = smp->dr_dlid;
+               memcpy(__entry->initial_path, smp->initial_path,
+                       IB_SMP_MAX_PATH_HOPS);
+               memcpy(__entry->return_path, smp->return_path,
+                       IB_SMP_MAX_PATH_HOPS);
+       ),
+
+       TP_printk("IB SMP: hop_ptr %d hop_cnt %d " \
+                 "mkey 0x%016llx dr_slid 0x%04x dr_dlid 0x%04x " \
+                 "initial_path %*ph return_path %*ph ",
+               __entry->hop_ptr, __entry->hop_cnt,
+               be64_to_cpu(__entry->mkey), be16_to_cpu(__entry->dr_slid),
+               be16_to_cpu(__entry->dr_dlid),
+               IB_SMP_MAX_PATH_HOPS, __entry->initial_path,
+               IB_SMP_MAX_PATH_HOPS, __entry->return_path
+       )
+);
+
+DEFINE_EVENT(ib_mad_opa_ib_template, ib_mad_handle_ib_smi,
+       TP_PROTO(struct ib_smp *smp),
+       TP_ARGS(smp));
+DEFINE_EVENT(ib_mad_opa_ib_template, ib_mad_handle_out_ib_smi,
+       TP_PROTO(struct ib_smp *smp),
+       TP_ARGS(smp));
+
+#endif /* _TRACE_IB_MAD_H */
+
+#include <trace/define_trace.h>
diff --git a/include/trace/events/ib_umad.h b/include/trace/events/ib_umad.h
new file mode 100644 (file)
index 0000000..c393a19
--- /dev/null
@@ -0,0 +1,126 @@
+/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
+
+/*
+ * Copyright (c) 2018 Intel Corporation.  All rights reserved.
+ *
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ib_umad
+
+#if !defined(_TRACE_IB_UMAD_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IB_UMAD_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * Event class shared by ib_umad_write, ib_umad_read_recv and
+ * ib_umad_read_send.
+ *
+ * Records the device index and port number reached through @file, the fields
+ * of the user MAD header @umad_hdr and the fields of the MAD header
+ * @mad_hdr.
+ *
+ * Values are stored exactly as found in the headers; TP_printk applies
+ * be16/be32/be64_to_cpu() to qpn, qkey, lid, flow_label, mad_status,
+ * class_specific, tid, attr_id and attr_mod at print time.
+ *
+ * NOTE(review): the entry field is spelled "retires" while it is filled from
+ * umad_hdr->retries. The name is part of the event format and is mirrored by
+ * struct ib_umad_rw_args in samples/bpf/ibumad_kern.c, so a rename has to
+ * change both places together.
+ */
+DECLARE_EVENT_CLASS(ib_umad_template,
+       TP_PROTO(struct ib_umad_file *file, struct ib_user_mad_hdr *umad_hdr,
+                struct ib_mad_hdr *mad_hdr),
+       TP_ARGS(file, umad_hdr, mad_hdr),
+
+       TP_STRUCT__entry(
+               __field(u8, port_num)
+               __field(u8, sl)
+               __field(u8, path_bits)
+               __field(u8, grh_present)
+               __field(u32, id)
+               __field(u32, status)
+               __field(u32, timeout_ms)
+               __field(u32, retires)
+               __field(u32, length)
+               __field(u32, qpn)
+               __field(u32, qkey)
+               __field(u8, gid_index)
+               __field(u8, hop_limit)
+               __field(u16, lid)
+               __field(u16, attr_id)
+               __field(u16, pkey_index)
+               __field(u8, base_version)
+               __field(u8, mgmt_class)
+               __field(u8, class_version)
+               __field(u8, method)
+               __field(u32, flow_label)
+               __field(u16, mad_status)
+               __field(u16, class_specific)
+               __field(u32, attr_mod)
+               __field(u64, tid)
+               __array(u8, gid, 16)
+               __field(u32, dev_index)
+               __field(u8,  traffic_class)
+       ),
+
+       TP_fast_assign(
+               /* Device and port the umad file is attached to */
+               __entry->dev_index = file->port->ib_dev->index;
+               __entry->port_num = file->port->port_num;
+
+               /* User MAD header */
+               __entry->id = umad_hdr->id;
+               __entry->status = umad_hdr->status;
+               __entry->timeout_ms = umad_hdr->timeout_ms;
+               __entry->retires = umad_hdr->retries;
+               __entry->length = umad_hdr->length;
+               __entry->qpn = umad_hdr->qpn;
+               __entry->qkey = umad_hdr->qkey;
+               __entry->lid = umad_hdr->lid;
+               __entry->sl = umad_hdr->sl;
+               __entry->path_bits = umad_hdr->path_bits;
+               __entry->grh_present = umad_hdr->grh_present;
+               __entry->gid_index = umad_hdr->gid_index;
+               __entry->hop_limit = umad_hdr->hop_limit;
+               __entry->traffic_class = umad_hdr->traffic_class;
+               memcpy(__entry->gid, umad_hdr->gid, sizeof(umad_hdr->gid));
+               __entry->flow_label = umad_hdr->flow_label;
+               __entry->pkey_index = umad_hdr->pkey_index;
+
+               /* MAD header */
+               __entry->base_version = mad_hdr->base_version;
+               __entry->mgmt_class = mad_hdr->mgmt_class;
+               __entry->class_version = mad_hdr->class_version;
+               __entry->method = mad_hdr->method;
+               __entry->mad_status = mad_hdr->status;
+               __entry->class_specific = mad_hdr->class_specific;
+               __entry->tid = mad_hdr->tid;
+               __entry->attr_id = mad_hdr->attr_id;
+               __entry->attr_mod = mad_hdr->attr_mod;
+       ),
+
+       TP_printk("%d:%d umad_hdr: id 0x%08x status 0x%08x ms %u ret %u " \
+                 "len %u QP%u qkey 0x%08x lid 0x%04x sl %u path_bits 0x%x " \
+                 "grh 0x%x gidi %u hop_lim %u traf_cl %u gid %pI6c " \
+                 "flow 0x%08x pkeyi %u  MAD: base_ver 0x%x class 0x%x " \
+                 "class_ver 0x%x method 0x%x status 0x%04x " \
+                 "class_specific 0x%04x tid 0x%016llx attr_id 0x%04x " \
+                 "attr_mod 0x%08x ",
+               __entry->dev_index, __entry->port_num,
+               __entry->id, __entry->status, __entry->timeout_ms,
+               __entry->retires, __entry->length, be32_to_cpu(__entry->qpn),
+               be32_to_cpu(__entry->qkey), be16_to_cpu(__entry->lid),
+               __entry->sl, __entry->path_bits, __entry->grh_present,
+               __entry->gid_index, __entry->hop_limit,
+               __entry->traffic_class, &__entry->gid,
+               be32_to_cpu(__entry->flow_label), __entry->pkey_index,
+               __entry->base_version, __entry->mgmt_class,
+               __entry->class_version, __entry->method,
+               be16_to_cpu(__entry->mad_status),
+               be16_to_cpu(__entry->class_specific),
+               be64_to_cpu(__entry->tid), be16_to_cpu(__entry->attr_id),
+               be32_to_cpu(__entry->attr_mod)
+       )
+);
+
+/*
+ * The three events built on ib_umad_template. All take the same arguments:
+ * the umad file, the user MAD header and the MAD header.
+ * NOTE(review): the call sites are not part of this header; judging by the
+ * names they sit on the umad write path and on the read path for received
+ * and for sent MADs - confirm in user_mad.c.
+ */
+DEFINE_EVENT(ib_umad_template, ib_umad_write,
+       TP_PROTO(struct ib_umad_file *file, struct ib_user_mad_hdr *umad_hdr,
+                struct ib_mad_hdr *mad_hdr),
+       TP_ARGS(file, umad_hdr, mad_hdr));
+
+DEFINE_EVENT(ib_umad_template, ib_umad_read_recv,
+       TP_PROTO(struct ib_umad_file *file, struct ib_user_mad_hdr *umad_hdr,
+                struct ib_mad_hdr *mad_hdr),
+       TP_ARGS(file, umad_hdr, mad_hdr));
+
+DEFINE_EVENT(ib_umad_template, ib_umad_read_send,
+       TP_PROTO(struct ib_umad_file *file, struct ib_user_mad_hdr *umad_hdr,
+                struct ib_mad_hdr *mad_hdr),
+       TP_ARGS(file, umad_hdr, mad_hdr));
+
+#endif /* _TRACE_IB_UMAD_H */
+
+#include <trace/define_trace.h>
diff --git a/include/uapi/rdma/efa-abi.h b/include/uapi/rdma/efa-abi.h
new file mode 100644 (file)
index 0000000..9599a2a
--- /dev/null
@@ -0,0 +1,101 @@
+/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
+/*
+ * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ */
+
+#ifndef EFA_ABI_USER_H
+#define EFA_ABI_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define EFA_UVERBS_ABI_VERSION 1
+
+/*
+ * Keep structs aligned to 8 bytes.
+ * Keep reserved fields as arrays of __u8 named reserved_XXX where XXX is the
+ * hex bit offset of the field.
+ */
+
+/*
+ * Single-bit flags, so several can be OR-ed into one mask.
+ * NOTE(review): presumably the bits carried in
+ * efa_ibv_alloc_ucontext_resp.cmds_supp_udata_mask - confirm in the driver.
+ */
+enum efa_ibv_user_cmds_supp_udata {
+       EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0,
+       EFA_USER_CMDS_SUPP_UDATA_CREATE_AH    = 1 << 1,
+};
+
+/* 16 bytes of fields (4 + 4 + 2 + 2 + 4); no reserved padding is needed. */
+struct efa_ibv_alloc_ucontext_resp {
+       __u32 comp_mask;
+       __u32 cmds_supp_udata_mask;
+       __u16 sub_cqs_per_cq;
+       __u16 inline_buf_size;
+       __u32 max_llq_size; /* bytes */
+};
+
+/* 8 bytes: reserved_30 starts at bit 0x30 (byte 6) and fills the last two. */
+struct efa_ibv_alloc_pd_resp {
+       __u32 comp_mask;
+       __u16 pdn;
+       __u8 reserved_30[2];
+};
+
+/* 16 bytes: reserved_50 starts at bit 0x50 (byte 10) and fills the last six. */
+struct efa_ibv_create_cq {
+       __u32 comp_mask;
+       __u32 cq_entry_size;
+       __u16 num_sub_cqs;
+       __u8 reserved_50[6];
+};
+
+/*
+ * 32 bytes. reserved_20 (bit 0x20, byte 4) keeps q_mmap_key on an 8-byte
+ * boundary; reserved_d0 (bit 0xd0, byte 26) pads the tail after cq_idx.
+ */
+struct efa_ibv_create_cq_resp {
+       __u32 comp_mask;
+       __u8 reserved_20[4];
+       __aligned_u64 q_mmap_key;
+       __aligned_u64 q_mmap_size;
+       __u16 cq_idx;
+       __u8 reserved_d0[6];
+};
+
+/*
+ * NOTE(review): presumably the values accepted in
+ * efa_ibv_create_qp.driver_qp_type - confirm in the driver.
+ */
+enum {
+       EFA_QP_DRIVER_TYPE_SRD = 0,
+};
+
+/* 16 bytes: four __u32 fields, so no reserved padding is needed. */
+struct efa_ibv_create_qp {
+       __u32 comp_mask;
+       __u32 rq_ring_size; /* bytes */
+       __u32 sq_ring_size; /* bytes */
+       __u32 driver_qp_type;
+};
+
+/*
+ * 64 bytes: four __u32 (16 bytes), five __aligned_u64 (40 bytes), two __u16
+ * and reserved_1e0, which starts at bit 0x1e0 (byte 60) and fills the last
+ * four bytes.
+ */
+struct efa_ibv_create_qp_resp {
+       __u32 comp_mask;
+       /* the offset inside the page of the rq db */
+       __u32 rq_db_offset;
+       /* the offset inside the page of the sq db */
+       __u32 sq_db_offset;
+       /* the offset inside the page of descriptors buffer */
+       __u32 llq_desc_offset;
+       __aligned_u64 rq_mmap_key;
+       __aligned_u64 rq_mmap_size;
+       __aligned_u64 rq_db_mmap_key;
+       __aligned_u64 sq_db_mmap_key;
+       __aligned_u64 llq_desc_mmap_key;
+       __u16 send_sub_cq_idx;
+       __u16 recv_sub_cq_idx;
+       __u8 reserved_1e0[4];
+};
+
+/* 8 bytes: reserved_30 starts at bit 0x30 (byte 6) and fills the last two. */
+struct efa_ibv_create_ah_resp {
+       __u32 comp_mask;
+       __u16 efa_address_handle;
+       __u8 reserved_30[2];
+};
+
+/* 16 bytes of fields (3 * 4 + 2 * 2); no reserved padding is needed. */
+struct efa_ibv_ex_query_device_resp {
+       __u32 comp_mask;
+       __u32 max_sq_wr;
+       __u32 max_rq_wr;
+       __u16 max_sq_sge;
+       __u16 max_rq_sge;
+};
+
+#endif /* EFA_ABI_USER_H */
index f4d4010..624f5b5 100644 (file)
@@ -360,6 +360,7 @@ enum mlx5_ib_create_qp_resp_mask {
        MLX5_IB_CREATE_QP_RESP_MASK_TISN = 1UL << 1,
        MLX5_IB_CREATE_QP_RESP_MASK_RQN  = 1UL << 2,
        MLX5_IB_CREATE_QP_RESP_MASK_SQN  = 1UL << 3,
+       MLX5_IB_CREATE_QP_RESP_MASK_TIR_ICM_ADDR  = 1UL << 4,
 };
 
 struct mlx5_ib_create_qp_resp {
@@ -371,6 +372,7 @@ struct mlx5_ib_create_qp_resp {
        __u32   rqn;
        __u32   sqn;
        __u32   reserved1;
+       __u64   tir_icm_addr;
 };
 
 struct mlx5_ib_alloc_mw {
index 8149d22..d404c95 100644 (file)
@@ -44,6 +44,7 @@ enum mlx5_ib_create_flow_action_attrs {
 enum mlx5_ib_alloc_dm_attrs {
        MLX5_IB_ATTR_ALLOC_DM_RESP_START_OFFSET = (1U << UVERBS_ID_NS_SHIFT),
        MLX5_IB_ATTR_ALLOC_DM_RESP_PAGE_INDEX,
+       MLX5_IB_ATTR_ALLOC_DM_REQ_TYPE,
 };
 
 enum mlx5_ib_devx_methods {
@@ -144,6 +145,7 @@ enum mlx5_ib_flow_matcher_create_attrs {
        MLX5_IB_ATTR_FLOW_MATCHER_FLOW_TYPE,
        MLX5_IB_ATTR_FLOW_MATCHER_MATCH_CRITERIA,
        MLX5_IB_ATTR_FLOW_MATCHER_FLOW_FLAGS,
+       MLX5_IB_ATTR_FLOW_MATCHER_FT_TYPE,
 };
 
 enum mlx5_ib_flow_matcher_destroy_attrs {
index 4a70103..a8f34c2 100644 (file)
@@ -42,6 +42,7 @@ enum mlx5_ib_uapi_flow_action_flags {
 enum mlx5_ib_uapi_flow_table_type {
        MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_RX     = 0x0,
        MLX5_IB_UAPI_FLOW_TABLE_TYPE_NIC_TX     = 0x1,
+       MLX5_IB_UAPI_FLOW_TABLE_TYPE_FDB        = 0x2,
 };
 
 enum mlx5_ib_uapi_flow_action_packet_reformat_type {
@@ -56,5 +57,11 @@ struct mlx5_ib_uapi_devx_async_cmd_hdr {
        __u8            out_data[];
 };
 
+enum mlx5_ib_uapi_dm_type {
+       MLX5_IB_UAPI_DM_TYPE_MEMIC,
+       MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM,
+       MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM,
+};
+
 #endif
 
index 5cc5927..42a8bdc 100644 (file)
@@ -49,17 +49,6 @@ enum {
        RDMA_NL_IWPM_NUM_OPS
 };
 
-struct rdma_cm_id_stats {
-       __u32   qp_num;
-       __u32   bound_dev_if;
-       __u32   port_space;
-       __s32   pid;
-       __u8    cm_state;
-       __u8    node_type;
-       __u8    port_num;
-       __u8    qp_type;
-};
-
 enum {
        IWPM_NLA_REG_PID_UNSPEC = 0,
        IWPM_NLA_REG_PID_SEQ,
@@ -261,7 +250,10 @@ enum rdma_nldev_command {
 
        RDMA_NLDEV_CMD_PORT_GET, /* can dump */
 
-       /* 6 - 8 are free to use */
+       RDMA_NLDEV_CMD_SYS_GET, /* can dump */
+       RDMA_NLDEV_CMD_SYS_SET,
+
+       /* 8 is free to use */
 
        RDMA_NLDEV_CMD_RES_GET = 9, /* can dump */
 
@@ -472,6 +464,21 @@ enum rdma_nldev_attr {
         */
        RDMA_NLDEV_ATTR_LINK_TYPE,              /* string */
 
+       /*
+        * net namespace mode for rdma subsystem:
+        * either shared or exclusive among multiple net namespaces.
+        */
+       RDMA_NLDEV_SYS_ATTR_NETNS_MODE,         /* u8 */
+       /*
+        * Device protocol, e.g. ib, iw, usnic, roce and opa
+        */
+       RDMA_NLDEV_ATTR_DEV_PROTOCOL,           /* string */
+
+       /*
+        * File descriptor handle of the net namespace object
+        */
+       RDMA_NLDEV_NET_NS_FD,                   /* u32 */
+
        /*
         * Always the end
         */
index 06c34d9..26213f4 100644 (file)
@@ -102,6 +102,7 @@ enum rdma_driver_id {
        RDMA_DRIVER_RXE,
        RDMA_DRIVER_HFI1,
        RDMA_DRIVER_QIB,
+       RDMA_DRIVER_EFA,
 };
 
 #endif
index 7bdf98c..8a16c2d 100644 (file)
@@ -37,6 +37,8 @@
 #include <linux/device.h>
 #include <linux/netdevice.h>
 
+#include <rdma/ib_verbs.h>
+
 extern struct _ddebug __start___verbose[];
 extern struct _ddebug __stop___verbose[];
 
@@ -636,6 +638,41 @@ EXPORT_SYMBOL(__dynamic_netdev_dbg);
 
 #endif
 
+#if IS_ENABLED(CONFIG_INFINIBAND)
+
+/*
+ * Emit a dynamic-debug message on behalf of an ib_device.
+ *
+ * @descriptor: dynamic debug descriptor used to build the message prefix
+ * @ibdev: device the message refers to; may be NULL
+ * @fmt: printf-style format, followed by its arguments
+ *
+ * Three output forms, chosen by what is known about @ibdev:
+ *  - NULL device: plain KERN_DEBUG line tagged "(NULL ib_device)";
+ *  - device without a parent: KERN_DEBUG line tagged with the device name;
+ *  - device with a parent: dev_printk_emit() against the parent, carrying
+ *    the dynamic-debug prefix, the parent's driver string and name, and the
+ *    ib device name.
+ */
+void __dynamic_ibdev_dbg(struct _ddebug *descriptor,
+                        const struct ib_device *ibdev, const char *fmt, ...)
+{
+       struct va_format vaf = { .fmt = fmt };
+       va_list ap;
+
+       va_start(ap, fmt);
+       vaf.va = &ap;
+
+       if (!ibdev) {
+               printk(KERN_DEBUG "(NULL ib_device): %pV", &vaf);
+       } else if (!ibdev->dev.parent) {
+               printk(KERN_DEBUG "%s: %pV", dev_name(&ibdev->dev), &vaf);
+       } else {
+               struct device *parent = ibdev->dev.parent;
+               char prefix[PREFIX_SIZE];
+
+               dev_printk_emit(LOGLEVEL_DEBUG, parent,
+                               "%s%s %s %s: %pV",
+                               dynamic_emit_prefix(descriptor, prefix),
+                               dev_driver_string(parent),
+                               dev_name(parent),
+                               dev_name(&ibdev->dev),
+                               &vaf);
+       }
+
+       va_end(ap);
+}
+EXPORT_SYMBOL(__dynamic_ibdev_dbg);
+
+#endif
+
 #define DDEBUG_STRING_SIZE 1024
 static __initdata char ddebug_setup_string[DDEBUG_STRING_SIZE];
 
index 53f429c..d14ca4a 100644 (file)
@@ -146,18 +146,13 @@ out:
 static int smc_ib_fill_mac(struct smc_ib_device *smcibdev, u8 ibport)
 {
        const struct ib_gid_attr *attr;
-       int rc = 0;
+       int rc;
 
        attr = rdma_get_gid_attr(smcibdev->ibdev, ibport, 0);
        if (IS_ERR(attr))
                return -ENODEV;
 
-       if (attr->ndev)
-               memcpy(smcibdev->mac[ibport - 1], attr->ndev->dev_addr,
-                      ETH_ALEN);
-       else
-               rc = -ENODEV;
-
+       rc = rdma_read_gid_l2_fields(attr, NULL, smcibdev->mac[ibport - 1]);
        rdma_put_gid_attr(attr);
        return rc;
 }
@@ -185,6 +180,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
                         unsigned short vlan_id, u8 gid[], u8 *sgid_index)
 {
        const struct ib_gid_attr *attr;
+       const struct net_device *ndev;
        int i;
 
        for (i = 0; i < smcibdev->pattr[ibport - 1].gid_tbl_len; i++) {
@@ -192,11 +188,14 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
                if (IS_ERR(attr))
                        continue;
 
-               if (attr->ndev &&
-                   ((!vlan_id && !is_vlan_dev(attr->ndev)) ||
-                    (vlan_id && is_vlan_dev(attr->ndev) &&
-                     vlan_dev_vlan_id(attr->ndev) == vlan_id)) &&
+               rcu_read_lock();
+               ndev = rdma_read_gid_attr_ndev_rcu(attr);
+               if (!IS_ERR(ndev) &&
+                   ((!vlan_id && !is_vlan_dev(ndev)) ||
+                    (vlan_id && is_vlan_dev(ndev) &&
+                     vlan_dev_vlan_id(ndev) == vlan_id)) &&
                    attr->gid_type == IB_GID_TYPE_ROCE) {
+                       rcu_read_unlock();
                        if (gid)
                                memcpy(gid, &attr->gid, SMC_GID_SIZE);
                        if (sgid_index)
@@ -204,6 +203,7 @@ int smc_ib_determine_gid(struct smc_ib_device *smcibdev, u8 ibport,
                        rdma_put_gid_attr(attr);
                        return 0;
                }
+               rcu_read_unlock();
                rdma_put_gid_attr(attr);
        }
        return -ENODEV;
index 65e667b..4f0a1cd 100644 (file)
@@ -52,6 +52,7 @@ hostprogs-y += xdpsock
 hostprogs-y += xdp_fwd
 hostprogs-y += task_fd_query
 hostprogs-y += xdp_sample_pkts
+hostprogs-y += ibumad
 hostprogs-y += hbm
 
 # Libbpf dependencies
@@ -108,6 +109,7 @@ xdpsock-objs := xdpsock_user.o
 xdp_fwd-objs := xdp_fwd_user.o
 task_fd_query-objs := bpf_load.o task_fd_query_user.o $(TRACE_HELPERS)
 xdp_sample_pkts-objs := xdp_sample_pkts_user.o $(TRACE_HELPERS)
+ibumad-objs := bpf_load.o ibumad_user.o $(TRACE_HELPERS)
 hbm-objs := bpf_load.o hbm.o $(CGROUP_HELPERS)
 
 # Tell kbuild to always build the programs
@@ -166,6 +168,7 @@ always += xdp_adjust_tail_kern.o
 always += xdp_fwd_kern.o
 always += task_fd_query_kern.o
 always += xdp_sample_pkts_kern.o
+always += ibumad_kern.o
 always += hbm_out_kern.o
 
 KBUILD_HOSTCFLAGS += -I$(objtree)/usr/include
diff --git a/samples/bpf/ibumad_kern.c b/samples/bpf/ibumad_kern.c
new file mode 100644 (file)
index 0000000..38b2b3f
--- /dev/null
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+/**
+ * ibumad BPF sample kernel side
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Copyright(c) 2018 Ira Weiny, Intel Corporation
+ */
+
+#define KBUILD_MODNAME "ibumad_count_pkts_by_class"
+#include <uapi/linux/bpf.h>
+
+#include "bpf_helpers.h"
+
+
+/*
+ * Per-class read counters: an array map indexed by management class and
+ * incremented by on_ib_umad_read_recv() and on_ib_umad_read_send().
+ */
+struct bpf_map_def SEC("maps") read_count = {
+       .type        = BPF_MAP_TYPE_ARRAY,
+       .key_size    = sizeof(u32), /* class; u32 required */
+       .value_size  = sizeof(u64), /* count of mads read */
+       .max_entries = 256, /* Room for all Classes */
+};
+
+/*
+ * Per-class write counters: an array map indexed by management class and
+ * incremented by on_ib_umad_write().
+ */
+struct bpf_map_def SEC("maps") write_count = {
+       .type        = BPF_MAP_TYPE_ARRAY,
+       .key_size    = sizeof(u32), /* class; u32 required */
+       .value_size  = sizeof(u64), /* count of mads written */
+       .max_entries = 256, /* Room for all Classes */
+};
+
+/*
+ * bpf_debug(): optional trace output through bpf_trace_printk().
+ *
+ * DEBUG is force-undefined right here, so the empty definition is always the
+ * one in effect; replace the #undef with a #define to get the messages. The
+ * format is copied into a local array before the call.
+ */
+#undef DEBUG
+#ifdef DEBUG
+#define bpf_debug(fmt, ...)                         \
+({                                                  \
+       char ____fmt[] = fmt;                       \
+       bpf_trace_printk(____fmt, sizeof(____fmt),  \
+                        ##__VA_ARGS__);            \
+})
+#else
+#define bpf_debug(fmt, ...)
+#endif
+
+/* Context layout handed to the tracepoint programs below.
+ *
+ * Taken from the current format defined in
+ * include/trace/events/ib_umad.h
+ * and
+ * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read_recv/format
+ * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_read_send/format
+ * /sys/kernel/debug/tracing/events/ib_umad/ib_umad_write/format
+ *
+ * The members after 'pad' repeat the ib_umad_template entry fields in the
+ * same order and with the same types, including the "retires" spelling.
+ * NOTE(review): 'pad' presumably covers the common tracepoint header that
+ * precedes the event fields - confirm against the format files.
+ */
+struct ib_umad_rw_args {
+       u64 pad;
+       u8 port_num;
+       u8 sl;
+       u8 path_bits;
+       u8 grh_present;
+       u32 id;
+       u32 status;
+       u32 timeout_ms;
+       u32 retires;
+       u32 length;
+       u32 qpn;
+       u32 qkey;
+       u8 gid_index;
+       u8 hop_limit;
+       u16 lid;
+       u16 attr_id;
+       u16 pkey_index;
+       u8 base_version;
+       u8 mgmt_class;
+       u8 class_version;
+       u8 method;
+       u32 flow_label;
+       u16 mad_status;
+       u16 class_specific;
+       u32 attr_mod;
+       u64 tid;
+       u8 gid[16];
+       u32 dev_index;
+       u8 traffic_class;
+};
+
+/*
+ * ib_umad_read_recv tracepoint: add one to read_count[mgmt_class].
+ * Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_read_recv")
+int on_ib_umad_read_recv(struct ib_umad_rw_args *ctx)
+{
+       u64 zero = 0, *val;
+       /* The map declares key_size = sizeof(u32), so the key object handed
+        * to the map helpers must be a full u32, not the u8 field itself.
+        */
+       u32 class = ctx->mgmt_class;
+
+       bpf_debug("ib_umad read recv : class 0x%x\n", class);
+
+       val = bpf_map_lookup_elem(&read_count, &class);
+       if (!val) {
+               /* Try to create the slot, then look it up again */
+               bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
+               val = bpf_map_lookup_elem(&read_count, &class);
+               if (!val)
+                       return 0;
+       }
+
+       (*val) += 1;
+
+       return 0;
+}
+/*
+ * ib_umad_read_send tracepoint: add one to read_count[mgmt_class].
+ * Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_read_send")
+int on_ib_umad_read_send(struct ib_umad_rw_args *ctx)
+{
+       u64 zero = 0, *val;
+       /* The map declares key_size = sizeof(u32), so the key object handed
+        * to the map helpers must be a full u32, not the u8 field itself.
+        */
+       u32 class = ctx->mgmt_class;
+
+       bpf_debug("ib_umad read send : class 0x%x\n", class);
+
+       val = bpf_map_lookup_elem(&read_count, &class);
+       if (!val) {
+               /* Try to create the slot, then look it up again */
+               bpf_map_update_elem(&read_count, &class, &zero, BPF_NOEXIST);
+               val = bpf_map_lookup_elem(&read_count, &class);
+               if (!val)
+                       return 0;
+       }
+
+       (*val) += 1;
+
+       return 0;
+}
+/*
+ * ib_umad_write tracepoint: add one to write_count[mgmt_class].
+ * Always returns 0.
+ */
+SEC("tracepoint/ib_umad/ib_umad_write")
+int on_ib_umad_write(struct ib_umad_rw_args *ctx)
+{
+       u64 zero = 0, *val;
+       /* The map declares key_size = sizeof(u32), so the key object handed
+        * to the map helpers must be a full u32, not the u8 field itself.
+        */
+       u32 class = ctx->mgmt_class;
+
+       bpf_debug("ib_umad write : class 0x%x\n", class);
+
+       val = bpf_map_lookup_elem(&write_count, &class);
+       if (!val) {
+               /* Try to create the slot, then look it up again */
+               bpf_map_update_elem(&write_count, &class, &zero, BPF_NOEXIST);
+               val = bpf_map_lookup_elem(&write_count, &class);
+               if (!val)
+                       return 0;
+       }
+
+       (*val) += 1;
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL";
diff --git a/samples/bpf/ibumad_user.c b/samples/bpf/ibumad_user.c
new file mode 100644 (file)
index 0000000..097d761
--- /dev/null
@@ -0,0 +1,122 @@
+// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
+
+/**
+ * ibumad BPF sample user side
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * Copyright(c) 2018 Ira Weiny, Intel Corporation
+ */
+
+#include <linux/bpf.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <limits.h>
+
+#include <sys/resource.h>
+#include <getopt.h>
+#include <net/if.h>
+
+#include "bpf_load.h"
+#include "bpf_util.h"
+#include "bpf/libbpf.h"
+
+/*
+ * Print every non-zero counter of the map behind @fd as
+ * "0x<class> : <count>". Keys 0..255 are probed, matching the max_entries
+ * of the maps declared in ibumad_kern.c; a failed lookup is reported and
+ * skipped.
+ */
+static void dump_counts(int fd)
+{
+       __u32 key;
+       __u64 value;
+
+       for (key = 0; key < 256; key++) {
+               if (bpf_map_lookup_elem(fd, &key, &value)) {
+                       printf("failed to read key %u\n", key);
+                       continue;
+               }
+               if (value) {
+                       /* __u64 is not unsigned long long on every
+                        * architecture, so cast to match %llu.
+                        */
+                       printf("0x%02x : %llu\n", key,
+                              (unsigned long long)value);
+               }
+       }
+}
+
+/*
+ * Print both counter tables: map_fd[0] under the "Read" heading, then
+ * map_fd[1] under the "Write" heading.
+ */
+static void dump_all_counts(void)
+{
+       static const char * const headings[] = {
+               "Read 'Class : count'\n",
+               "Write 'Class : count'\n",
+       };
+       int idx;
+
+       for (idx = 0; idx < 2; idx++) {
+               fputs(headings[idx], stdout);
+               dump_counts(map_fd[idx]);
+       }
+}
+
+/*
+ * Signal handler installed by main() for SIGINT and SIGTERM: print the
+ * counters one last time and terminate with status 0. @sig is unused.
+ *
+ * NOTE(review): printf() (reached through dump_all_counts()) and exit() are
+ * not on the POSIX async-signal-safe list. Setting a flag here and doing
+ * the final dump from the main loop would avoid that.
+ */
+static void dump_exit(int sig)
+{
+       dump_all_counts();
+       exit(0);
+}
+
+/*
+ * Long options accepted by main(). getopt_long() walks this array until it
+ * reaches an all-zero element, so the terminating entry is mandatory.
+ */
+static const struct option long_options[] = {
+       {"help",      no_argument,       NULL, 'h'},
+       {"delay",     required_argument, NULL, 'd'},
+       {NULL,        0,                 NULL, 0}
+};
+
+/*
+ * Print the command line help to stdout.
+ *
+ * @cmd: program name shown in the "Usage:" line
+ */
+static void usage(char *cmd)
+{
+       /* Describe what this sample counts: ib_umad MADs keyed by class */
+       printf("eBPF test program to count ib_umad MAD reads and writes per management class\n"
+               "Usage: %s <options>\n"
+               "       --help,   -h  this menu\n"
+               "       --delay,  -d  <delay>  wait <delay> sec between prints [1 - 1000000]\n"
+               , cmd
+               );
+}
+
+/*
+ * Load "<argv[0]>_kern.o", then print the per-class MAD counters every
+ * <delay> seconds (default 5) until the process is signalled; the
+ * SIGINT/SIGTERM handler prints a final dump and exits.
+ *
+ * Returns 1 on -h, on an unknown or invalid option, or when the eBPF object
+ * cannot be loaded.
+ */
+int main(int argc, char **argv)
+{
+       unsigned long delay = 5;
+       int longindex = 0;
+       int opt;
+       char bpf_file[256];
+       char *end;
+
+       /* Create the eBPF kernel code path name.
+        * This follows the pattern of all of the other bpf samples
+        */
+       snprintf(bpf_file, sizeof(bpf_file), "%s_kern.o", argv[0]);
+
+       /* Do one final dump when exiting */
+       signal(SIGINT, dump_exit);
+       signal(SIGTERM, dump_exit);
+
+       /* Only -h and -d are implemented, so only those are advertised */
+       while ((opt = getopt_long(argc, argv, "hd:",
+                                 long_options, &longindex)) != -1) {
+               switch (opt) {
+               case 'd':
+                       /* Reject empty or non-numeric input, trailing junk
+                        * and anything outside the documented 1..1000000
+                        * range. Overflow and negative input come back from
+                        * strtoul() as huge values and fail the upper bound.
+                        */
+                       delay = strtoul(optarg, &end, 0);
+                       if (end == optarg || *end != '\0' ||
+                           delay < 1 || delay > 1000000) {
+                               fprintf(stderr, "ERROR: invalid delay : %s\n",
+                                       optarg);
+                               usage(argv[0]);
+                               return 1;
+                       }
+                       break;
+               default:
+               case 'h':
+                       usage(argv[0]);
+                       return 1;
+               }
+       }
+
+       if (load_bpf_file(bpf_file)) {
+               fprintf(stderr, "ERROR: failed to load eBPF from file : %s\n",
+                       bpf_file);
+               return 1;
+       }
+
+       while (1) {
+               sleep(delay);
+               dump_all_counts();
+       }
+
+       return 0;
+}