Merge branch 'dynamic_sg' into rdma.git for-next
Author:     Jason Gunthorpe <jgg@nvidia.com>
AuthorDate: Fri, 9 Oct 2020 15:56:02 +0000 (12:56 -0300)
Commit:     Jason Gunthorpe <jgg@nvidia.com>
CommitDate: Fri, 16 Oct 2020 15:40:58 +0000 (12:40 -0300)
From Maor Gottlieb:

====================
This series extends __sg_alloc_table_from_pages to allow chaining of new
pages to an already initialized SG table.

This allows drivers to take advantage of the contiguous-page merging
optimization without having to preallocate all the pages and hold them
in a very large temporary buffer before SG table initialization.

The last patch converts the InfiniBand core to use the new API. It
removes duplicated functionality from the code and benefits from the
optimization of allocating the SG table dynamically from pages.

On a system using 2MB huge pages, the SG table would otherwise contain
512x as many entries, since each huge page spans 2MB / 4KB = 512
PAGE_SIZE pages that now merge into a single entry.
====================
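
Below is a minimal sketch of the chained-allocation pattern this
enables, modeled on the converted call site in the umem.c hunk further
down; pin_batch(), batch, total_pages and max_segment are hypothetical
stand-ins for a driver's own pinning loop and DMA limits:

	struct sg_table sgt;
	struct scatterlist *last = NULL;	/* NULL on the first call */
	unsigned long left = total_pages;

	while (left) {
		/* hypothetical helper: pins up to 'batch' pages */
		unsigned int got = pin_batch(page_list, batch);

		left -= got;
		/* Appends to sgt, merging pages that are contiguous with
		 * the current tail entry; 'left' tells the library how
		 * many pages are still to come so it can size the chain.
		 */
		last = __sg_alloc_table_from_pages(&sgt, page_list, got, 0,
						   (unsigned long)got << PAGE_SHIFT,
						   max_segment, last, left,
						   GFP_KERNEL);
		if (IS_ERR(last))
			break;	/* caller unpins pages and frees sgt */
	}

Passing the returned tail back in as 'prv' on the next iteration is
what lets the same small page_list buffer be reused for every batch
instead of holding one huge temporary array.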

* branch 'dynamic_sg':
  RDMA/umem: Move to allocate SG table from pages
  lib/scatterlist: Add support in dynamic allocation of SG table from pages
  tools/testing/scatterlist: Show errors in human readable form
  tools/testing/scatterlist: Rejuvenate bit-rotten test
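
The umem.c diff below also carries the reworked
ib_umem_find_best_pgsz() from the for-next side of this merge. A toy
userspace model of its trailing-zeros trick follows (an illustrative
sketch, not kernel code; the real function additionally folds the IOVA
at segment boundaries into the mask and bounds the result by the MR
length):

	#include <stdio.h>
	#include <stdint.h>

	/* OR together every bit where the IOVA and the physical address
	 * differ; the trailing zeros of that mask bound the largest page
	 * size that maps the VA straight through to the physical address.
	 */
	static uint64_t largest_page_size(uint64_t mask, uint64_t pgsz_bitmap)
	{
		if (mask)	/* like GENMASK(count_trailing_zeros(mask), 0) */
			pgsz_bitmap &= (2ULL << __builtin_ctzll(mask)) - 1;
		if (!pgsz_bitmap)
			return 0;
		/* rounddown_pow_of_two(): the highest remaining bit wins */
		return 1ULL << (63 - __builtin_clzll(pgsz_bitmap));
	}

	int main(void)
	{
		uint64_t supported = (1ULL << 12) | (1ULL << 21); /* 4K, 2MB */
		uint64_t iova = 0x7f1200000000ULL;

		/* PA 2MB-aligned like the IOVA: prints 0x200000 (2MB) */
		printf("0x%llx\n", (unsigned long long)
		       largest_page_size(iova ^ 0x80000000ULL, supported));
		/* PA off by 4KB relative to the IOVA: prints 0x1000 (4K) */
		printf("0x%llx\n", (unsigned long long)
		       largest_page_size(iova ^ 0x80001000ULL, supported));
		return 0;
	}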

MAINTAINERS
drivers/infiniband/core/device.c
drivers/infiniband/core/umem.c

diff --combined MAINTAINERS
@@@ -4247,6 -4247,7 +4247,6 @@@ F:      drivers/net/ethernet/cisco/enic
  CISCO VIC LOW LATENCY NIC DRIVER
  M:    Christian Benvenuti <benve@cisco.com>
  M:    Nelson Escobar <neescoba@cisco.com>
 -M:    Parvi Kaustubhi <pkaustub@cisco.com>
  S:    Supported
  F:    drivers/infiniband/hw/usnic/
  
@@@ -4407,12 -4408,6 +4407,6 @@@ T:     git git://git.infradead.org/users/hc
  F:    fs/configfs/
  F:    include/linux/configfs.h
  
- CONNECTOR
- M:    Evgeniy Polyakov <zbr@ioremap.net>
- L:    netdev@vger.kernel.org
- S:    Maintained
- F:    drivers/connector/
  CONSOLE SUBSYSTEM
  M:    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  S:    Supported
@@@ -6179,7 -6174,7 +6173,7 @@@ F:      Documentation/devicetree/bindings/ed
  F:    drivers/edac/aspeed_edac.c
  
  EDAC-BLUEFIELD
- M:    Shravan Kumar Ramani <sramani@nvidia.com>
+ M:    Shravan Kumar Ramani <shravankr@nvidia.com>
  S:    Supported
  F:    drivers/edac/bluefield_edac.c
  
@@@ -7750,8 -7745,8 +7744,8 @@@ F:      include/linux/cciss*.
  F:    include/uapi/linux/cciss*.h
  
  HFI1 DRIVER
 -M:    Mike Marciniszyn <mike.marciniszyn@intel.com>
 -M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
 +M:    Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
 +M:    Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
  L:    linux-rdma@vger.kernel.org
  S:    Supported
  F:    drivers/infiniband/hw/hfi1
@@@ -8328,8 -8323,9 +8322,9 @@@ S:      Supporte
  F:    drivers/pci/hotplug/rpaphp*
  
  IBM Power SRIOV Virtual NIC Device Driver
- M:    Thomas Falcon <tlfalcon@linux.ibm.com>
- M:    John Allen <jallen@linux.ibm.com>
+ M:    Dany Madden <drt@linux.ibm.com>
+ M:    Lijun Pan <ljp@linux.ibm.com>
+ M:    Sukadev Bhattiprolu <sukadev@linux.ibm.com>
  L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/ethernet/ibm/ibmvnic.*
@@@ -8343,7 -8339,7 +8338,7 @@@ F:      arch/powerpc/platforms/powernv/copy-
  F:    arch/powerpc/platforms/powernv/vas*
  
  IBM Power Virtual Ethernet Device Driver
- M:    Thomas Falcon <tlfalcon@linux.ibm.com>
+ M:    Cristobal Forno <cforno12@linux.ibm.com>
  L:    netdev@vger.kernel.org
  S:    Supported
  F:    drivers/net/ethernet/ibm/ibmveth.*
@@@ -9250,7 -9246,7 +9245,7 @@@ F:      drivers/firmware/iscsi_ibft
  
  ISCSI EXTENSIONS FOR RDMA (ISER) INITIATOR
  M:    Sagi Grimberg <sagi@grimberg.me>
- M:    Max Gurtovoy <maxg@nvidia.com>
+ M:    Max Gurtovoy <mgurtovoy@nvidia.com>
  L:    linux-rdma@vger.kernel.org
  S:    Supported
  W:    http://www.openfabrics.org
@@@ -11041,6 -11037,7 +11036,7 @@@ F:   drivers/char/hw_random/mtk-rng.
  
  MEDIATEK SWITCH DRIVER
  M:    Sean Wang <sean.wang@mediatek.com>
+ M:    Landen Chao <Landen.Chao@mediatek.com>
  L:    netdev@vger.kernel.org
  S:    Maintained
  F:    drivers/net/dsa/mt7530.*
@@@ -12054,6 -12051,7 +12050,7 @@@ Q:   http://patchwork.ozlabs.org/project/
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git
  T:    git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git
  F:    Documentation/devicetree/bindings/net/
+ F:    drivers/connector/
  F:    drivers/net/
  F:    include/linux/etherdevice.h
  F:    include/linux/fcdevice.h
@@@ -12885,8 -12883,8 +12882,8 @@@ S:   Maintaine
  F:    drivers/char/hw_random/optee-rng.c
  
  OPA-VNIC DRIVER
 -M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
 -M:    Niranjana Vishwanathapura <niranjana.vishwanathapura@intel.com>
 +M:    Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
 +M:    Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
  L:    linux-rdma@vger.kernel.org
  S:    Supported
  F:    drivers/infiniband/ulp/opa_vnic
@@@ -13184,6 -13182,7 +13181,7 @@@ F:   drivers/firmware/pcdp.
  
  PCI DRIVER FOR AARDVARK (Marvell Armada 3700)
  M:    Thomas Petazzoni <thomas.petazzoni@bootlin.com>
+ M:    Pali Rohár <pali@kernel.org>
  L:    linux-pci@vger.kernel.org
  L:    linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
  S:    Maintained
@@@ -14184,8 -14183,8 +14182,8 @@@ F:   drivers/firmware/qemu_fw_cfg.
  F:    include/uapi/linux/qemu_fw_cfg.h
  
  QIB DRIVER
 -M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
 -M:    Mike Marciniszyn <mike.marciniszyn@intel.com>
 +M:    Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
 +M:    Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
  L:    linux-rdma@vger.kernel.org
  S:    Supported
  F:    drivers/infiniband/hw/qib/
@@@ -14607,8 -14606,8 +14605,8 @@@ S:   Maintaine
  F:    drivers/net/ethernet/rdc/r6040.c
  
  RDMAVT - RDMA verbs software
 -M:    Dennis Dalessandro <dennis.dalessandro@intel.com>
 -M:    Mike Marciniszyn <mike.marciniszyn@intel.com>
 +M:    Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
 +M:    Mike Marciniszyn <mike.marciniszyn@cornelisnetworks.com>
  L:    linux-rdma@vger.kernel.org
  S:    Supported
  F:    drivers/infiniband/sw/rdmavt
@@@ -16156,7 -16155,7 +16154,7 @@@ M:   Leon Luo <leonl@leopardimaging.com
  L:    linux-media@vger.kernel.org
  S:    Maintained
  T:    git git://linuxtv.org/media_tree.git
- F:    Documentation/devicetree/bindings/media/i2c/imx274.txt
+ F:    Documentation/devicetree/bindings/media/i2c/sony,imx274.yaml
  F:    drivers/media/i2c/imx274.c
  
  SONY IMX290 SENSOR DRIVER
diff --combined drivers/infiniband/core/device.c
@@@ -1285,6 -1285,8 +1285,8 @@@ static void disable_device(struct ib_de
                remove_client_context(device, cid);
        }
  
+       ib_cq_pool_destroy(device);
        /* Pairs with refcount_set in enable_device */
        ib_device_put(device);
        wait_for_completion(&device->unreg_completion);
@@@ -1328,6 -1330,8 +1330,8 @@@ static int enable_device_and_get(struc
                        goto out;
        }
  
+       ib_cq_pool_init(device);
        down_read(&clients_rwsem);
        xa_for_each_marked (&clients, index, client, CLIENT_REGISTERED) {
                ret = add_client_context(device, client);
@@@ -1400,7 -1404,6 +1404,6 @@@ int ib_register_device(struct ib_devic
                goto dev_cleanup;
        }
  
-       ib_cq_pool_init(device);
        ret = enable_device_and_get(device);
        dev_set_uevent_suppress(&device->dev, false);
        /* Mark for userspace that device is ready */
@@@ -1455,7 -1458,6 +1458,6 @@@ static void __ib_unregister_device(stru
                goto out;
  
        disable_device(ib_dev);
-       ib_cq_pool_destroy(ib_dev);
  
        /* Expedite removing unregistered pointers from the hash table */
        free_netdevs(ib_dev);
@@@ -2695,9 -2697,7 +2697,9 @@@ void ib_set_device_ops(struct ib_devic
        SET_OBJ_SIZE(dev_ops, ib_ah);
        SET_OBJ_SIZE(dev_ops, ib_counters);
        SET_OBJ_SIZE(dev_ops, ib_cq);
 +      SET_OBJ_SIZE(dev_ops, ib_mw);
        SET_OBJ_SIZE(dev_ops, ib_pd);
 +      SET_OBJ_SIZE(dev_ops, ib_rwq_ind_table);
        SET_OBJ_SIZE(dev_ops, ib_srq);
        SET_OBJ_SIZE(dev_ops, ib_ucontext);
        SET_OBJ_SIZE(dev_ops, ib_xrcd);
diff --combined drivers/infiniband/core/umem.c
@@@ -39,7 -39,6 +39,7 @@@
  #include <linux/export.h>
  #include <linux/slab.h>
  #include <linux/pagemap.h>
 +#include <linux/count_zeros.h>
  #include <rdma/ib_umem_odp.h>
  
  #include "uverbs.h"
@@@ -61,73 -60,6 +61,6 @@@ static void __ib_umem_release(struct ib
        sg_free_table(&umem->sg_head);
  }
  
- /* ib_umem_add_sg_table - Add N contiguous pages to scatter table
-  *
-  * sg: current scatterlist entry
-  * page_list: array of npage struct page pointers
-  * npages: number of pages in page_list
-  * max_seg_sz: maximum segment size in bytes
-  * nents: [out] number of entries in the scatterlist
-  *
-  * Return new end of scatterlist
-  */
- static struct scatterlist *ib_umem_add_sg_table(struct scatterlist *sg,
-                                               struct page **page_list,
-                                               unsigned long npages,
-                                               unsigned int max_seg_sz,
-                                               int *nents)
- {
-       unsigned long first_pfn;
-       unsigned long i = 0;
-       bool update_cur_sg = false;
-       bool first = !sg_page(sg);
-       /* Check if new page_list is contiguous with end of previous page_list.
-        * sg->length here is a multiple of PAGE_SIZE and sg->offset is 0.
-        */
-       if (!first && (page_to_pfn(sg_page(sg)) + (sg->length >> PAGE_SHIFT) ==
-                      page_to_pfn(page_list[0])))
-               update_cur_sg = true;
-       while (i != npages) {
-               unsigned long len;
-               struct page *first_page = page_list[i];
-               first_pfn = page_to_pfn(first_page);
-               /* Compute the number of contiguous pages we have starting
-                * at i
-                */
-               for (len = 0; i != npages &&
-                             first_pfn + len == page_to_pfn(page_list[i]) &&
-                             len < (max_seg_sz >> PAGE_SHIFT);
-                    len++)
-                       i++;
-               /* Squash N contiguous pages from page_list into current sge */
-               if (update_cur_sg) {
-                       if ((max_seg_sz - sg->length) >= (len << PAGE_SHIFT)) {
-                               sg_set_page(sg, sg_page(sg),
-                                           sg->length + (len << PAGE_SHIFT),
-                                           0);
-                               update_cur_sg = false;
-                               continue;
-                       }
-                       update_cur_sg = false;
-               }
-               /* Squash N contiguous pages into next sge or first sge */
-               if (!first)
-                       sg = sg_next(sg);
-               (*nents)++;
-               sg_set_page(sg, first_page, len << PAGE_SHIFT, 0);
-               first = false;
-       }
-       return sg;
- }
  /**
   * ib_umem_find_best_pgsz - Find best HW page size to use for this MR
   *
@@@ -147,28 -79,18 +80,28 @@@ unsigned long ib_umem_find_best_pgsz(st
                                     unsigned long virt)
  {
        struct scatterlist *sg;
 -      unsigned int best_pg_bit;
        unsigned long va, pgoff;
        dma_addr_t mask;
        int i;
  
 +      /* rdma_for_each_block() has a bug if the page size is smaller than the
 +       * page size used to build the umem. For now prevent smaller page sizes
 +       * from being returned.
 +       */
 +      pgsz_bitmap &= GENMASK(BITS_PER_LONG - 1, PAGE_SHIFT);
 +
        /* At minimum, drivers must support PAGE_SIZE or smaller */
        if (WARN_ON(!(pgsz_bitmap & GENMASK(PAGE_SHIFT, 0))))
                return 0;
  
 -      va = virt;
 -      /* max page size not to exceed MR length */
 -      mask = roundup_pow_of_two(umem->length);
 +      umem->iova = va = virt;
 +      /* The best result is the smallest page size that results in the minimum
 +       * number of required pages. Compute the largest page size that could
 +       * work based on VA address bits that don't change.
 +       */
 +      mask = pgsz_bitmap &
 +             GENMASK(BITS_PER_LONG - 1,
 +                     bits_per((umem->length - 1 + virt) ^ virt));
        /* offset into first SGL */
        pgoff = umem->address & ~PAGE_MASK;
  
                        mask |= va;
                pgoff = 0;
        }
 -      best_pg_bit = rdma_find_pg_bit(mask, pgsz_bitmap);
  
 -      return BIT_ULL(best_pg_bit);
 +      /* The mask accumulates 1's in each position where the VA and physical
 +       * address differ, thus the length of trailing 0 is the largest page
 +       * size that can pass the VA through to the physical.
 +       */
 +      if (mask)
 +              pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0);
 +      return rounddown_pow_of_two(pgsz_bitmap);
  }
  EXPORT_SYMBOL(ib_umem_find_best_pgsz);
  
@@@ -217,7 -134,7 +150,7 @@@ struct ib_umem *ib_umem_get(struct ib_d
        struct mm_struct *mm;
        unsigned long npages;
        int ret;
-       struct scatterlist *sg;
+       struct scatterlist *sg = NULL;
        unsigned int gup_flags = FOLL_WRITE;
  
        /*
        umem->ibdev      = device;
        umem->length     = size;
        umem->address    = addr;
 +      /*
 +       * Drivers should call ib_umem_find_best_pgsz() to set the iova
 +       * correctly.
 +       */
 +      umem->iova = addr;
        umem->writable   = ib_access_writable(access);
        umem->owning_mm = mm = current->mm;
        mmgrab(mm);
  
        cur_base = addr & PAGE_MASK;
  
-       ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
-       if (ret)
-               goto vma;
        if (!umem->writable)
                gup_flags |= FOLL_FORCE;
  
-       sg = umem->sg_head.sgl;
        while (npages) {
                cond_resched();
                ret = pin_user_pages_fast(cur_base,
                        goto umem_release;
  
                cur_base += ret * PAGE_SIZE;
-               npages   -= ret;
-               sg = ib_umem_add_sg_table(sg, page_list, ret,
-                       dma_get_max_seg_size(device->dma_device),
-                       &umem->sg_nents);
+               npages -= ret;
+               sg = __sg_alloc_table_from_pages(
+                       &umem->sg_head, page_list, ret, 0, ret << PAGE_SHIFT,
+                       dma_get_max_seg_size(device->dma_device), sg, npages,
+                       GFP_KERNEL);
+               umem->sg_nents = umem->sg_head.nents;
+               if (IS_ERR(sg)) {
+                       unpin_user_pages_dirty_lock(page_list, ret, 0);
+                       ret = PTR_ERR(sg);
+                       goto umem_release;
+               }
        }
  
-       sg_mark_end(sg);
        if (access & IB_ACCESS_RELAXED_ORDERING)
                dma_attr |= DMA_ATTR_WEAK_ORDERING;
  
  
  umem_release:
        __ib_umem_release(device, umem, 0);
- vma:
        atomic64_sub(ib_umem_num_pages(umem), &mm->pinned_vm);
  out:
        free_page((unsigned long) page_list);
@@@ -350,6 -259,18 +280,6 @@@ void ib_umem_release(struct ib_umem *um
  }
  EXPORT_SYMBOL(ib_umem_release);
  
 -int ib_umem_page_count(struct ib_umem *umem)
 -{
 -      int i, n = 0;
 -      struct scatterlist *sg;
 -
 -      for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
 -              n += sg_dma_len(sg) >> PAGE_SHIFT;
 -
 -      return n;
 -}
 -EXPORT_SYMBOL(ib_umem_page_count);
 -
  /*
   * Copy from the given ib_umem's pages to the given buffer.
   *