Merge tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
author    Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 29 Sep 2019 17:00:14 +0000 (10:00 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
          Sun, 29 Sep 2019 17:00:14 +0000 (10:00 -0700)
Pull iommu fixes from Joerg Roedel:
 "A couple of fixes for the AMD IOMMU driver have piled up:

   - Some fixes for the reworked IO page-table code, which caused memory
     leaks or did not allow mappings to be downgraded under some
     conditions.

   - Locking fixes to close a couple of possible races around accessing
     'struct protection_domain'. These races were introduced when the
     dma-ops fast-path became lock-less"
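
The locking scheme these patches converge on takes domain->lock as the outer,
IRQ-safe lock and nests the new per-device dev_data->lock inside it. Below is a
minimal sketch of that nesting as the attach path uses it, with field and helper
names taken from the diff further down (do_attach() is the existing low-level
helper); it illustrates the pattern only, it is not the exact upstream function:

    /* Sketch only: lock nesting used by the reworked attach path. */
    static int attach_device_sketch(struct protection_domain *domain,
                                    struct iommu_dev_data *dev_data)
    {
            unsigned long flags;
            int ret = -EBUSY;

            /* Outer lock: serializes all work on the protection domain. */
            spin_lock_irqsave(&domain->lock, flags);
            /* Inner lock: protects the per-device state. */
            spin_lock(&dev_data->lock);

            if (dev_data->domain == NULL) {
                    do_attach(dev_data, domain);
                    ret = 0;
            }

            spin_unlock(&dev_data->lock);
            spin_unlock_irqrestore(&domain->lock, flags);

            return ret;
    }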

* tag 'iommu-fixes-5.4-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/joro/iommu:
  iommu/amd: Lock code paths traversing protection_domain->dev_list
  iommu/amd: Lock dev_data in attach/detach code paths
  iommu/amd: Check for busy devices earlier in attach_device()
  iommu/amd: Take domain->lock for complete attach/detach path
  iommu/amd: Remove amd_iommu_devtable_lock
  iommu/amd: Remove domain->updated
  iommu/amd: Wait for completion of IOTLB flush in attach_device
  iommu/amd: Unmap all L7 PTEs when downgrading page-sizes
  iommu/amd: Introduce first_pte_l7() helper
  iommu/amd: Fix downgrading default page-sizes in alloc_pte()
  iommu/amd: Fix pages leak in free_pagetable()

1  2 
drivers/iommu/amd_iommu.c

@@@ -70,7 -70,6 +70,6 @@@
   */
  #define AMD_IOMMU_PGSIZES     ((~0xFFFUL) & ~(2ULL << 38))
  
- static DEFINE_SPINLOCK(amd_iommu_devtable_lock);
  static DEFINE_SPINLOCK(pd_bitmap_lock);
  
  /* List of all available dev_data structures */
@@@ -202,6 -201,7 +201,7 @@@ static struct iommu_dev_data *alloc_dev
        if (!dev_data)
                return NULL;
  
+       spin_lock_init(&dev_data->lock);
        dev_data->devid = devid;
        ratelimit_default_init(&dev_data->rs);
  
@@@ -501,6 -501,29 +501,29 @@@ static void iommu_uninit_device(struct 
         */
  }
  
+ /*
+  * Helper function to get the first pte of a large mapping
+  */
+ static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
+                        unsigned long *count)
+ {
+       unsigned long pte_mask, pg_size, cnt;
+       u64 *fpte;
+
+       pg_size  = PTE_PAGE_SIZE(*pte);
+       cnt      = PAGE_SIZE_PTE_COUNT(pg_size);
+       pte_mask = ~((cnt << 3) - 1);
+       fpte     = (u64 *)(((unsigned long)pte) & pte_mask);
+
+       if (page_size)
+               *page_size = pg_size;
+
+       if (count)
+               *count = cnt;
+
+       return fpte;
+ }
+
  /****************************************************************************
   *
   * Interrupt handling functions
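
The mask arithmetic in first_pte_l7() above works because a "level 7" mapping
consists of PAGE_SIZE_PTE_COUNT(pg_size) replicated 8-byte PTEs that sit
naturally aligned inside the page-table page, so rounding the PTE pointer down
to a multiple of cnt * 8 bytes lands on the first entry of the run. A standalone
illustration with concrete numbers (assuming, purely for the example, a 32 KiB
mapping made of eight replicated 4 KiB entries; plain userspace C, not kernel
code):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Hypothetical 32 KiB mapping: 32K / 4K = 8 replicated PTEs. */
            unsigned long cnt      = 32768 / 4096;
            unsigned long pte_mask = ~((cnt << 3) - 1);   /* 8 entries * 8 bytes -> ~63 */

            /* Say the page table starts at 0x1000 and we hold a pointer to
             * the 6th entry of the replicated run (offset 5 * 8 = 40). */
            uintptr_t pte   = 0x1000 + 5 * 8;
            uintptr_t first = pte & pte_mask;             /* aligned back to entry 0 */

            printf("first PTE of the run at offset 0x%lx\n",
                   (unsigned long)(first - 0x1000));      /* prints 0x0 */
            return 0;
    }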
@@@ -1311,8 -1334,12 +1334,12 @@@ static void domain_flush_np_cache(struc
                dma_addr_t iova, size_t size)
  {
        if (unlikely(amd_iommu_np_cache)) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&domain->lock, flags);
                domain_flush_pages(domain, iova, size);
                domain_flush_complete(domain);
+               spin_unlock_irqrestore(&domain->lock, flags);
        }
  }
  
@@@ -1425,7 -1452,7 +1452,7 @@@ static void free_pagetable(struct prote
        BUG_ON(domain->mode < PAGE_MODE_NONE ||
               domain->mode > PAGE_MODE_6_LEVEL);
  
-       free_sub_pt(root, domain->mode, freelist);
+       freelist = free_sub_pt(root, domain->mode, freelist);
  
        free_page_list(freelist);
  }
   * another level increases the size of the address space by 9 bits to a size up
   * to 64 bits.
   */
- static void increase_address_space(struct protection_domain *domain,
+ static bool increase_address_space(struct protection_domain *domain,
                                   gfp_t gfp)
  {
        unsigned long flags;
+       bool ret = false;
        u64 *pte;
  
        spin_lock_irqsave(&domain->lock, flags);
                                        iommu_virt_to_phys(domain->pt_root));
        domain->pt_root  = pte;
        domain->mode    += 1;
-       domain->updated  = true;
+       ret = true;
  
  out:
        spin_unlock_irqrestore(&domain->lock, flags);
  
-       return;
+       return ret;
  }
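
With domain->updated removed, "the page table changed, flush later" no longer
lives as shared state in struct protection_domain, where a race could lose it.
Instead, increase_address_space() reports success through its return value,
each caller collects changes in a local flag, and a single update_domain() runs
under domain->lock at the end, as the alloc_pte() and iommu_map_page() hunks
below show. Roughly (a simplified sketch of the caller pattern, not the exact
upstream code):

    bool updated = false;
    unsigned long flags;

    /* Growing the address space (or tearing down old PTEs) only sets the
     * caller-local flag ... */
    while (address > PM_LEVEL_SIZE(domain->mode))
            updated = increase_address_space(domain, gfp) || updated;

    /* ... and the device-table update / flush happens once, under the lock. */
    if (updated) {
            spin_lock_irqsave(&domain->lock, flags);
            update_domain(domain);
            spin_unlock_irqrestore(&domain->lock, flags);
    }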
  
  static u64 *alloc_pte(struct protection_domain *domain,
                      unsigned long address,
                      unsigned long page_size,
                      u64 **pte_page,
-                     gfp_t gfp)
+                     gfp_t gfp,
+                     bool *updated)
  {
        int level, end_lvl;
        u64 *pte, *page;
        BUG_ON(!is_power_of_2(page_size));
  
        while (address > PM_LEVEL_SIZE(domain->mode))
-               increase_address_space(domain, gfp);
+               *updated = increase_address_space(domain, gfp) || *updated;
  
        level   = domain->mode - 1;
        pte     = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
                __pte     = *pte;
                pte_level = PM_PTE_LEVEL(__pte);
  
-               if (!IOMMU_PTE_PRESENT(__pte) ||
+               /*
+                * If we replace a series of large PTEs, we need
+                * to tear down all of them.
+                */
+               if (IOMMU_PTE_PRESENT(__pte) &&
                    pte_level == PAGE_MODE_7_LEVEL) {
+                       unsigned long count, i;
+                       u64 *lpte;
+
+                       lpte = first_pte_l7(pte, NULL, &count);
+
+                       /*
+                        * Unmap the replicated PTEs that still match the
+                        * original large mapping
+                        */
+                       for (i = 0; i < count; ++i)
+                               cmpxchg64(&lpte[i], __pte, 0ULL);
+
+                       *updated = true;
+                       continue;
+               }
+
+               if (!IOMMU_PTE_PRESENT(__pte) ||
+                   pte_level == PAGE_MODE_NONE) {
                        page = (u64 *)get_zeroed_page(gfp);
                        if (!page)
                                return NULL;
  
                        /* pte could have been changed somewhere. */
                        if (cmpxchg64(pte, __pte, __npte) != __pte)
                                free_page((unsigned long)page);
-                       else if (pte_level == PAGE_MODE_7_LEVEL)
-                               domain->updated = true;
+                       else if (IOMMU_PTE_PRESENT(__pte))
+                               *updated = true;
  
                        continue;
                }
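
The teardown loop above clears each replicated PTE with cmpxchg64() against the
original value, so an entry that a concurrent mapper has already repointed is
left untouched and its update is not lost. A standalone illustration of that
"clear only if it still holds the old value" behaviour, using C11 atomics in
plain userspace C (an analogy for the kernel's cmpxchg64(), not kernel code):

    #include <stdatomic.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            /* Four replicated entries; slot 2 was changed concurrently. */
            _Atomic uint64_t run[4] = { 0xabc, 0xabc, 0x123, 0xabc };
            uint64_t old = 0xabc;

            for (int i = 0; i < 4; ++i) {
                    uint64_t expected = old;
                    /* Clear the entry only if it still matches the old mapping. */
                    atomic_compare_exchange_strong(&run[i], &expected, 0);
            }

            for (int i = 0; i < 4; ++i)
                    printf("run[%d] = 0x%llx\n", i, (unsigned long long)run[i]);
            /* Prints 0x0, 0x0, 0x123, 0x0 -- the changed slot survives. */
            return 0;
    }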
@@@ -1566,17 -1619,12 +1619,12 @@@ static u64 *fetch_pte(struct protection
                *page_size = PTE_LEVEL_PAGE_SIZE(level);
        }
  
-       if (PM_PTE_LEVEL(*pte) == 0x07) {
-               unsigned long pte_mask;
-               /*
-                * If we have a series of large PTEs, make
-                * sure to return a pointer to the first one.
-                */
-               *page_size = pte_mask = PTE_PAGE_SIZE(*pte);
-               pte_mask   = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
-               pte        = (u64 *)(((unsigned long)pte) & pte_mask);
-       }
+       /*
+        * If we have a series of large PTEs, make
+        * sure to return a pointer to the first one.
+        */
+       if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
+               pte = first_pte_l7(pte, page_size, NULL);
  
        return pte;
  }
@@@ -1615,26 -1663,29 +1663,29 @@@ static int iommu_map_page(struct protec
                          gfp_t gfp)
  {
        struct page *freelist = NULL;
+       bool updated = false;
        u64 __pte, *pte;
-       int i, count;
+       int ret, i, count;
  
        BUG_ON(!IS_ALIGNED(bus_addr, page_size));
        BUG_ON(!IS_ALIGNED(phys_addr, page_size));
  
+       ret = -EINVAL;
        if (!(prot & IOMMU_PROT_MASK))
-               return -EINVAL;
+               goto out;
  
        count = PAGE_SIZE_PTE_COUNT(page_size);
-       pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp);
+       pte   = alloc_pte(dom, bus_addr, page_size, NULL, gfp, &updated);
  
+       ret = -ENOMEM;
        if (!pte)
-               return -ENOMEM;
+               goto out;
  
        for (i = 0; i < count; ++i)
                freelist = free_clear_pte(&pte[i], pte[i], freelist);
  
        if (freelist != NULL)
-               dom->updated = true;
+               updated = true;
  
        if (count > 1) {
                __pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
        for (i = 0; i < count; ++i)
                pte[i] = __pte;
  
-       update_domain(dom);
+       ret = 0;
+ out:
+       if (updated) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&dom->lock, flags);
+               update_domain(dom);
+               spin_unlock_irqrestore(&dom->lock, flags);
+       }
  
        /* Everything flushed out, free pages now */
        free_page_list(freelist);
  
-       return 0;
+       return ret;
  }
  
  static unsigned long iommu_unmap_page(struct protection_domain *dom,
@@@ -1806,8 -1866,12 +1866,12 @@@ static void free_gcr3_table(struct prot
  
  static void dma_ops_domain_flush_tlb(struct dma_ops_domain *dom)
  {
+       unsigned long flags;
+
+       spin_lock_irqsave(&dom->domain.lock, flags);
        domain_flush_tlb(&dom->domain);
        domain_flush_complete(&dom->domain);
+       spin_unlock_irqrestore(&dom->domain.lock, flags);
  }
  
  static void iova_domain_flush_tlb(struct iova_domain *iovad)
@@@ -2022,36 -2086,6 +2086,6 @@@ static void do_detach(struct iommu_dev_
        domain->dev_cnt                 -= 1;
  }
  
- /*
-  * If a device is not yet associated with a domain, this function makes the
-  * device visible in the domain
-  */
- static int __attach_device(struct iommu_dev_data *dev_data,
-                          struct protection_domain *domain)
- {
-       int ret;
-       /* lock domain */
-       spin_lock(&domain->lock);
-       ret = -EBUSY;
-       if (dev_data->domain != NULL)
-               goto out_unlock;
-       /* Attach alias group root */
-       do_attach(dev_data, domain);
-       ret = 0;
- out_unlock:
-       /* ready */
-       spin_unlock(&domain->lock);
-       return ret;
- }
  static void pdev_iommuv2_disable(struct pci_dev *pdev)
  {
        pci_disable_ats(pdev);
@@@ -2133,19 -2167,28 +2167,28 @@@ static int attach_device(struct device 
        unsigned long flags;
        int ret;
  
+       spin_lock_irqsave(&domain->lock, flags);
        dev_data = get_dev_data(dev);
  
+       spin_lock(&dev_data->lock);
+       ret = -EBUSY;
+       if (dev_data->domain != NULL)
+               goto out;
        if (!dev_is_pci(dev))
                goto skip_ats_check;
  
        pdev = to_pci_dev(dev);
        if (domain->flags & PD_IOMMUV2_MASK) {
+               ret = -EINVAL;
                if (!dev_data->passthrough)
-                       return -EINVAL;
+                       goto out;
  
                if (dev_data->iommu_v2) {
                        if (pdev_iommuv2_enable(pdev) != 0)
-                               return -EINVAL;
+                               goto out;
  
                        dev_data->ats.enabled = true;
                        dev_data->ats.qdep    = pci_ats_queue_depth(pdev);
        }
  
  skip_ats_check:
-       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       ret = __attach_device(dev_data, domain);
-       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       ret = 0;
+       do_attach(dev_data, domain);
  
        /*
         * We might boot into a crash-kernel here. The crashed kernel
         */
        domain_flush_tlb_pde(domain);
  
-       return ret;
- }
- /*
-  * Removes a device from a protection domain (unlocked)
-  */
- static void __detach_device(struct iommu_dev_data *dev_data)
- {
-       struct protection_domain *domain;
-       domain = dev_data->domain;
+       domain_flush_complete(domain);
  
-       spin_lock(&domain->lock);
+ out:
+       spin_unlock(&dev_data->lock);
  
-       do_detach(dev_data);
+       spin_unlock_irqrestore(&domain->lock, flags);
  
-       spin_unlock(&domain->lock);
+       return ret;
  }
  
  /*
@@@ -2200,6 -2234,10 +2234,10 @@@ static void detach_device(struct devic
        dev_data = get_dev_data(dev);
        domain   = dev_data->domain;
  
+       spin_lock_irqsave(&domain->lock, flags);
+       spin_lock(&dev_data->lock);
        /*
         * First check if the device is still attached. It might already
         * be detached from its domain because the generic
         * our alias handling.
         */
        if (WARN_ON(!dev_data->domain))
-               return;
+               goto out;
  
-       /* lock device table */
-       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
-       __detach_device(dev_data);
-       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       do_detach(dev_data);
  
        if (!dev_is_pci(dev))
-               return;
+               goto out;
  
        if (domain->flags & PD_IOMMUV2_MASK && dev_data->iommu_v2)
                pdev_iommuv2_disable(to_pci_dev(dev));
                pci_disable_ats(to_pci_dev(dev));
  
        dev_data->ats.enabled = false;
+ out:
+       spin_unlock(&dev_data->lock);
+       spin_unlock_irqrestore(&domain->lock, flags);
  }
  
  static int amd_iommu_add_device(struct device *dev)
@@@ -2354,15 -2394,10 +2394,10 @@@ static void update_device_table(struct 
  
  static void update_domain(struct protection_domain *domain)
  {
-       if (!domain->updated)
-               return;
        update_device_table(domain);
  
        domain_flush_devices(domain);
        domain_flush_tlb_pde(domain);
-       domain->updated = false;
  }
  
  static int dir2prot(enum dma_data_direction direction)
@@@ -2392,6 -2427,7 +2427,7 @@@ static dma_addr_t __map_single(struct d
  {
        dma_addr_t offset = paddr & ~PAGE_MASK;
        dma_addr_t address, start, ret;
+       unsigned long flags;
        unsigned int pages;
        int prot = 0;
        int i;
@@@ -2429,8 -2465,10 +2465,10 @@@ out_unmap
                iommu_unmap_page(&dma_dom->domain, start, PAGE_SIZE);
        }
  
+       spin_lock_irqsave(&dma_dom->domain.lock, flags);
        domain_flush_tlb(&dma_dom->domain);
        domain_flush_complete(&dma_dom->domain);
+       spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
  
        dma_ops_free_iova(dma_dom, address, pages);
  
@@@ -2459,8 -2497,12 +2497,12 @@@ static void __unmap_single(struct dma_o
        }
  
        if (amd_iommu_unmap_flush) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&dma_dom->domain.lock, flags);
                domain_flush_tlb(&dma_dom->domain);
                domain_flush_complete(&dma_dom->domain);
+               spin_unlock_irqrestore(&dma_dom->domain.lock, flags);
                dma_ops_free_iova(dma_dom, dma_addr, pages);
        } else {
                pages = __roundup_pow_of_two(pages);
@@@ -2754,8 -2796,6 +2796,8 @@@ static const struct dma_map_ops amd_iom
        .map_sg         = map_sg,
        .unmap_sg       = unmap_sg,
        .dma_supported  = amd_iommu_dma_supported,
 +      .mmap           = dma_common_mmap,
 +      .get_sgtable    = dma_common_get_sgtable,
  };
  
  static int init_reserved_iova_ranges(void)
@@@ -2866,16 -2906,16 +2908,16 @@@ static void cleanup_domain(struct prote
        struct iommu_dev_data *entry;
        unsigned long flags;
  
-       spin_lock_irqsave(&amd_iommu_devtable_lock, flags);
+       spin_lock_irqsave(&domain->lock, flags);
  
        while (!list_empty(&domain->dev_list)) {
                entry = list_first_entry(&domain->dev_list,
                                         struct iommu_dev_data, list);
                BUG_ON(!entry->domain);
-               __detach_device(entry);
+               do_detach(entry);
        }
  
-       spin_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
+       spin_unlock_irqrestore(&domain->lock, flags);
  }
  
  static void protection_domain_free(struct protection_domain *domain)
@@@ -3226,9 -3266,12 +3268,12 @@@ static bool amd_iommu_is_attach_deferre
  static void amd_iommu_flush_iotlb_all(struct iommu_domain *domain)
  {
        struct protection_domain *dom = to_pdomain(domain);
+       unsigned long flags;
  
+       spin_lock_irqsave(&dom->lock, flags);
        domain_flush_tlb_pde(dom);
        domain_flush_complete(dom);
+       spin_unlock_irqrestore(&dom->lock, flags);
  }
  
  static void amd_iommu_iotlb_sync(struct iommu_domain *domain,
@@@ -3290,7 -3333,6 +3335,6 @@@ void amd_iommu_domain_direct_map(struc
  
        /* Update data structure */
        domain->mode    = PAGE_MODE_NONE;
-       domain->updated = true;
  
        /* Make changes visible to IOMMUs */
        update_domain(domain);
@@@ -3336,7 -3378,6 +3380,6 @@@ int amd_iommu_domain_enable_v2(struct i
  
        domain->glx      = levels;
        domain->flags   |= PD_IOMMUV2_MASK;
-       domain->updated  = true;
  
        update_domain(domain);