drivers/vfio/vfio_iommu_type1.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  *
  12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  14  * VT-d, but that makes it harder to re-use as theoretically anyone
  15  * implementing a similar IOMMU could make use of this.  We expect the
  16  * IOMMU to support the IOMMU API and have few to no restrictions around
  17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  18  * optimized for relatively static mappings of a userspace process with
  19  * userspace pages pinned into memory.  We also assume devices and IOMMU
  20  * domains are PCI based as the IOMMU API is still centered around a
  21  * device/bus interface rather than a group interface.
  22  */
  23
  24 #include <linux/compat.h>
  25 #include <linux/device.h>
  26 #include <linux/fs.h>
  27 #include <linux/highmem.h>
  28 #include <linux/iommu.h>
  29 #include <linux/module.h>
  30 #include <linux/mm.h>
  31 #include <linux/kthread.h>
  32 #include <linux/rbtree.h>
  33 #include <linux/sched/signal.h>
  34 #include <linux/sched/mm.h>
  35 #include <linux/slab.h>
  36 #include <linux/uaccess.h>
  37 #include <linux/vfio.h>
  38 #include <linux/workqueue.h>
  39 #include <linux/mdev.h>
  40 #include <linux/notifier.h>
  41 #include <linux/dma-iommu.h>
  42 #include <linux/irqdomain.h>
  43
  44 #define DRIVER_VERSION  "0.2"
  45 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  46 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  47
  48 static bool allow_unsafe_interrupts;
  49 module_param_named(allow_unsafe_interrupts,
  50                    allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  51 MODULE_PARM_DESC(allow_unsafe_interrupts,
  52                  "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  53
  54 static bool disable_hugepages;
  55 module_param_named(disable_hugepages,
  56                    disable_hugepages, bool, S_IRUGO | S_IWUSR);
  57 MODULE_PARM_DESC(disable_hugepages,
  58                  "Disable VFIO IOMMU support for IOMMU hugepages.");
  59
  60 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
  61 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
  62 MODULE_PARM_DESC(dma_entry_limit,
  63                  "Maximum number of user DMA mappings per container (65535).");
  64
  65 struct vfio_iommu {
  66         struct list_head        domain_list;
  67         struct list_head        iova_list;
  68         struct vfio_domain      *external_domain; /* domain for external user */
  69         struct mutex            lock;
  70         struct rb_root          dma_list;
  71         struct blocking_notifier_head notifier;
  72         unsigned int            dma_avail;
  73         unsigned int            vaddr_invalid_count;
  74         uint64_t                pgsize_bitmap;
  75         uint64_t                num_non_pinned_groups;
  76         wait_queue_head_t       vaddr_wait;
  77         bool                    v2;
  78         bool                    nesting;
  79         bool                    dirty_page_tracking;
  80         bool                    pinned_page_dirty_scope;
  81         bool                    container_open;
  82 };
  83
  84 struct vfio_domain {
  85         struct iommu_domain     *domain;
  86         struct list_head        next;
  87         struct list_head        group_list;
  88         int                     prot;           /* IOMMU_CACHE */
  89         bool                    fgsp;           /* Fine-grained super pages */
  90 };
  91
  92 struct vfio_dma {
  93         struct rb_node          node;
  94         dma_addr_t              iova;           /* Device address */
  95         unsigned long           vaddr;          /* Process virtual addr */
  96         size_t                  size;           /* Map size (bytes) */
  97         int                     prot;           /* IOMMU_READ/WRITE */
  98         bool                    iommu_mapped;
  99         bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
 100         bool                    vaddr_invalid;
 101         struct task_struct      *task;
 102         struct rb_root          pfn_list;       /* Ex-user pinned pfn list */
 103         unsigned long           *bitmap;
 104 };
 105
 106 struct vfio_batch {
 107         struct page             **pages;        /* for pin_user_pages_remote */
 108         struct page             *fallback_page; /* if pages alloc fails */
 109         int                     capacity;       /* length of pages array */
 110         int                     size;           /* of batch currently */
 111         int                     offset;         /* of next entry in pages */
 112 };
 113
 114 struct vfio_group {
 115         struct iommu_group      *iommu_group;
 116         struct list_head        next;
 117         bool                    mdev_group;     /* An mdev group */
 118         bool                    pinned_page_dirty_scope;
 119 };
 120
 121 struct vfio_iova {
 122         struct list_head        list;
 123         dma_addr_t              start;
 124         dma_addr_t              end;
 125 };
 126
 127 /*
 128  * Guest RAM pinning working set or DMA target
 129  */
 130 struct vfio_pfn {
 131         struct rb_node          node;
 132         dma_addr_t              iova;           /* Device address */
 133         unsigned long           pfn;            /* Host pfn */
 134         unsigned int            ref_count;
 135 };
 136
 137 struct vfio_regions {
 138         struct list_head list;
 139         dma_addr_t iova;
 140         phys_addr_t phys;
 141         size_t len;
 142 };
 143
 144 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
 145                                         (!list_empty(&iommu->domain_list))
 146
 147 #define DIRTY_BITMAP_BYTES(n)   (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
 148
 149 /*
 150  * Input argument of number of bits to bitmap_set() is unsigned integer, which
 151  * further casts to signed integer for unaligned multi-bit operation,
 152  * __bitmap_set().
 153  * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 154  * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 155  * system.
 156  */
 157 #define DIRTY_BITMAP_PAGES_MAX   ((u64)INT_MAX)
 158 #define DIRTY_BITMAP_SIZE_MAX    DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 159
 160 #define WAITED 1
 161
 162 static int put_pfn(unsigned long pfn, int prot);
 163
 164 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
 165                                                struct iommu_group *iommu_group);
 166
 167 /*
 168  * This code handles mapping and unmapping of user data buffers
 169  * into DMA'ble space using the IOMMU
 170  */
 171
 172 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 173                                       dma_addr_t start, size_t size)
 174 {
 175         struct rb_node *node = iommu->dma_list.rb_node;
 176
 177         while (node) {
 178                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 179
 180                 if (start + size <= dma->iova)
 181                         node = node->rb_left;
 182                 else if (start >= dma->iova + dma->size)
 183                         node = node->rb_right;
 184                 else
 185                         return dma;
 186         }
 187
 188         return NULL;
 189 }
 190
 191 static struct rb_node *vfio_find_dma_first_node(struct vfio_iommu *iommu,
 192                                                 dma_addr_t start, u64 size)
 193 {
 194         struct rb_node *res = NULL;
 195         struct rb_node *node = iommu->dma_list.rb_node;
 196         struct vfio_dma *dma_res = NULL;
 197
 198         while (node) {
 199                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 200
 201                 if (start < dma->iova + dma->size) {
 202                         res = node;
 203                         dma_res = dma;
 204                         if (start >= dma->iova)
 205                                 break;
 206                         node = node->rb_left;
 207                 } else {
 208                         node = node->rb_right;
 209                 }
 210         }
 211         if (res && size && dma_res->iova >= start + size)
 212                 res = NULL;
 213         return res;
 214 }
 215
 216 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 217 {
 218         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 219         struct vfio_dma *dma;
 220
 221         while (*link) {
 222                 parent = *link;
 223                 dma = rb_entry(parent, struct vfio_dma, node);
 224
 225                 if (new->iova + new->size <= dma->iova)
 226                         link = &(*link)->rb_left;
 227                 else
 228                         link = &(*link)->rb_right;
 229         }
 230
 231         rb_link_node(&new->node, parent, link);
 232         rb_insert_color(&new->node, &iommu->dma_list);
 233 }
 234
 235 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 236 {
 237         rb_erase(&old->node, &iommu->dma_list);
 238 }
 239
 240
 241 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 242 {
 243         uint64_t npages = dma->size / pgsize;
 244
 245         if (npages > DIRTY_BITMAP_PAGES_MAX)
 246                 return -EINVAL;
 247
 248         /*
 249          * Allocate extra 64 bits that are used to calculate shift required for
 250          * bitmap_shift_left() to manipulate and club unaligned number of pages
 251          * in adjacent vfio_dma ranges.
 252          */
 253         dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
 254                                GFP_KERNEL);
 255         if (!dma->bitmap)
 256                 return -ENOMEM;
 257
 258         return 0;
 259 }
 260
 261 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
 262 {
 263         kfree(dma->bitmap);
 264         dma->bitmap = NULL;
 265 }
 266
 267 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
 268 {
 269         struct rb_node *p;
 270         unsigned long pgshift = __ffs(pgsize);
 271
 272         for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
 273                 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
 274
 275                 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
 276         }
 277 }
 278
 279 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
 280 {
 281         struct rb_node *n;
 282         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 283
 284         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 285                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 286
 287                 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
 288         }
 289 }
 290
 291 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
 292 {
 293         struct rb_node *n;
 294
 295         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 296                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 297                 int ret;
 298
 299                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
 300                 if (ret) {
 301                         struct rb_node *p;
 302
 303                         for (p = rb_prev(n); p; p = rb_prev(p)) {
 304                                 struct vfio_dma *dma = rb_entry(n,
 305                                                         struct vfio_dma, node);
 306
 307                                 vfio_dma_bitmap_free(dma);
 308                         }
 309                         return ret;
 310                 }
 311                 vfio_dma_populate_bitmap(dma, pgsize);
 312         }
 313         return 0;
 314 }
 315
 316 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
 317 {
 318         struct rb_node *n;
 319
 320         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 321                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 322
 323                 vfio_dma_bitmap_free(dma);
 324         }
 325 }
 326
 327 /*
 328  * Helper Functions for host iova-pfn list
 329  */
 330 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
 331 {
 332         struct vfio_pfn *vpfn;
 333         struct rb_node *node = dma->pfn_list.rb_node;
 334
 335         while (node) {
 336                 vpfn = rb_entry(node, struct vfio_pfn, node);
 337
 338                 if (iova < vpfn->iova)
 339                         node = node->rb_left;
 340                 else if (iova > vpfn->iova)
 341                         node = node->rb_right;
 342                 else
 343                         return vpfn;
 344         }
 345         return NULL;
 346 }
 347
 348 static void vfio_link_pfn(struct vfio_dma *dma,
 349                           struct vfio_pfn *new)
 350 {
 351         struct rb_node **link, *parent = NULL;
 352         struct vfio_pfn *vpfn;
 353
 354         link = &dma->pfn_list.rb_node;
 355         while (*link) {
 356                 parent = *link;
 357                 vpfn = rb_entry(parent, struct vfio_pfn, node);
 358
 359                 if (new->iova < vpfn->iova)
 360                         link = &(*link)->rb_left;
 361                 else
 362                         link = &(*link)->rb_right;
 363         }
 364
 365         rb_link_node(&new->node, parent, link);
 366         rb_insert_color(&new->node, &dma->pfn_list);
 367 }
 368
 369 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
 370 {
 371         rb_erase(&old->node, &dma->pfn_list);
 372 }
 373
 374 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
 375                                 unsigned long pfn)
 376 {
 377         struct vfio_pfn *vpfn;
 378
 379         vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
 380         if (!vpfn)
 381                 return -ENOMEM;
 382
 383         vpfn->iova = iova;
 384         vpfn->pfn = pfn;
 385         vpfn->ref_count = 1;
 386         vfio_link_pfn(dma, vpfn);
 387         return 0;
 388 }
 389
 390 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
 391                                       struct vfio_pfn *vpfn)
 392 {
 393         vfio_unlink_pfn(dma, vpfn);
 394         kfree(vpfn);
 395 }
 396
 397 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 398                                                unsigned long iova)
 399 {
 400         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 401
 402         if (vpfn)
 403                 vpfn->ref_count++;
 404         return vpfn;
 405 }
 406
 407 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 408 {
 409         int ret = 0;
 410
 411         vpfn->ref_count--;
 412         if (!vpfn->ref_count) {
 413                 ret = put_pfn(vpfn->pfn, dma->prot);
 414                 vfio_remove_from_pfn_list(dma, vpfn);
 415         }
 416         return ret;
 417 }
 418
 419 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 420 {
 421         struct mm_struct *mm;
 422         int ret;
 423
 424         if (!npage)
 425                 return 0;
 426
 427         mm = async ? get_task_mm(dma->task) : dma->task->mm;
 428         if (!mm)
 429                 return -ESRCH; /* process exited */
 430
 431         ret = mmap_write_lock_killable(mm);
 432         if (!ret) {
 433                 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
 434                                           dma->lock_cap);
 435                 mmap_write_unlock(mm);
 436         }
 437
 438         if (async)
 439                 mmput(mm);
 440
 441         return ret;
 442 }
 443
 444 /*
 445  * Some mappings aren't backed by a struct page, for example an mmap'd
 446  * MMIO range for our own or another device.  These use a different
 447  * pfn conversion and shouldn't be tracked as locked pages.
 448  * For compound pages, any driver that sets the reserved bit in head
 449  * page needs to set the reserved bit in all subpages to be safe.
 450  */
 451 static bool is_invalid_reserved_pfn(unsigned long pfn)
 452 {
 453         if (pfn_valid(pfn))
 454                 return PageReserved(pfn_to_page(pfn));
 455
 456         return true;
 457 }
 458
 459 static int put_pfn(unsigned long pfn, int prot)
 460 {
 461         if (!is_invalid_reserved_pfn(pfn)) {
 462                 struct page *page = pfn_to_page(pfn);
 463
 464                 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
 465                 return 1;
 466         }
 467         return 0;
 468 }
 469
 470 #define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
 471
 472 static void vfio_batch_init(struct vfio_batch *batch)
 473 {
 474         batch->size = 0;
 475         batch->offset = 0;
 476
 477         if (unlikely(disable_hugepages))
 478                 goto fallback;
 479
 480         batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
 481         if (!batch->pages)
 482                 goto fallback;
 483
 484         batch->capacity = VFIO_BATCH_MAX_CAPACITY;
 485         return;
 486
 487 fallback:
 488         batch->pages = &batch->fallback_page;
 489         batch->capacity = 1;
 490 }
 491
 492 static void vfio_batch_unpin(struct vfio_batch *batch, struct vfio_dma *dma)
 493 {
 494         while (batch->size) {
 495                 unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
 496
 497                 put_pfn(pfn, dma->prot);
 498                 batch->offset++;
 499                 batch->size--;
 500         }
 501 }
 502
 503 static void vfio_batch_fini(struct vfio_batch *batch)
 504 {
 505         if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
 506                 free_page((unsigned long)batch->pages);
 507 }
 508
 509 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 510                             unsigned long vaddr, unsigned long *pfn,
 511                             bool write_fault)
 512 {
 513         pte_t *ptep;
 514         spinlock_t *ptl;
 515         int ret;
 516
 517         ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
 518         if (ret) {
 519                 bool unlocked = false;
 520
 521                 ret = fixup_user_fault(mm, vaddr,
 522                                        FAULT_FLAG_REMOTE |
 523                                        (write_fault ?  FAULT_FLAG_WRITE : 0),
 524                                        &unlocked);
 525                 if (unlocked)
 526                         return -EAGAIN;
 527
 528                 if (ret)
 529                         return ret;
 530
 531                 ret = follow_pte(vma->vm_mm, vaddr, &ptep, &ptl);
 532                 if (ret)
 533                         return ret;
 534         }
 535
 536         if (write_fault && !pte_write(*ptep))
 537                 ret = -EFAULT;
 538         else
 539                 *pfn = pte_pfn(*ptep);
 540
 541         pte_unmap_unlock(ptep, ptl);
 542         return ret;
 543 }
 544
 545 /*
 546  * Returns the positive number of pfns successfully obtained or a negative
 547  * error code.
 548  */
 549 static int vaddr_get_pfns(struct mm_struct *mm, unsigned long vaddr,
 550                           long npages, int prot, unsigned long *pfn,
 551                           struct page **pages)
 552 {
 553         struct vm_area_struct *vma;
 554         unsigned int flags = 0;
 555         int ret;
 556
 557         if (prot & IOMMU_WRITE)
 558                 flags |= FOLL_WRITE;
 559
 560         mmap_read_lock(mm);
 561         ret = pin_user_pages_remote(mm, vaddr, npages, flags | FOLL_LONGTERM,
 562                                     pages, NULL, NULL);
 563         if (ret > 0) {
 564                 *pfn = page_to_pfn(pages[0]);
 565                 goto done;
 566         }
 567
 568         vaddr = untagged_addr(vaddr);
 569
 570 retry:
 571         vma = find_vma_intersection(mm, vaddr, vaddr + 1);
 572
 573         if (vma && vma->vm_flags & VM_PFNMAP) {
 574                 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
 575                 if (ret == -EAGAIN)
 576                         goto retry;
 577
 578                 if (!ret) {
 579                         if (is_invalid_reserved_pfn(*pfn))
 580                                 ret = 1;
 581                         else
 582                                 ret = -EFAULT;
 583                 }
 584         }
 585 done:
 586         mmap_read_unlock(mm);
 587         return ret;
 588 }
 589
 590 static int vfio_wait(struct vfio_iommu *iommu)
 591 {
 592         DEFINE_WAIT(wait);
 593
 594         prepare_to_wait(&iommu->vaddr_wait, &wait, TASK_KILLABLE);
 595         mutex_unlock(&iommu->lock);
 596         schedule();
 597         mutex_lock(&iommu->lock);
 598         finish_wait(&iommu->vaddr_wait, &wait);
 599         if (kthread_should_stop() || !iommu->container_open ||
 600             fatal_signal_pending(current)) {
 601                 return -EFAULT;
 602         }
 603         return WAITED;
 604 }
 605
 606 /*
 607  * Find dma struct and wait for its vaddr to be valid.  iommu lock is dropped
 608  * if the task waits, but is re-locked on return.  Return result in *dma_p.
 609  * Return 0 on success with no waiting, WAITED on success if waited, and -errno
 610  * on error.
 611  */
 612 static int vfio_find_dma_valid(struct vfio_iommu *iommu, dma_addr_t start,
 613                                size_t size, struct vfio_dma **dma_p)
 614 {
 615         int ret;
 616
 617         do {
 618                 *dma_p = vfio_find_dma(iommu, start, size);
 619                 if (!*dma_p)
 620                         ret = -EINVAL;
 621                 else if (!(*dma_p)->vaddr_invalid)
 622                         ret = 0;
 623                 else
 624                         ret = vfio_wait(iommu);
 625         } while (ret > 0);
 626
 627         return ret;
 628 }
 629
 630 /*
 631  * Wait for all vaddr in the dma_list to become valid.  iommu lock is dropped
 632  * if the task waits, but is re-locked on return.  Return 0 on success with no
 633  * waiting, WAITED on success if waited, and -errno on error.
 634  */
 635 static int vfio_wait_all_valid(struct vfio_iommu *iommu)
 636 {
 637         int ret = 0;
 638
 639         while (iommu->vaddr_invalid_count && ret >= 0)
 640                 ret = vfio_wait(iommu);
 641
 642         return ret;
 643 }
 644
 645 /*
 646  * Attempt to pin pages.  We really don't want to track all the pfns and
 647  * the iommu can only map chunks of consecutive pfns anyway, so get the
 648  * first page and all consecutive pages with the same locking.
 649  */
 650 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 651                                   long npage, unsigned long *pfn_base,
 652                                   unsigned long limit, struct vfio_batch *batch)
 653 {
 654         unsigned long pfn;
 655         struct mm_struct *mm = current->mm;
 656         long ret, pinned = 0, lock_acct = 0;
 657         bool rsvd;
 658         dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 659
 660         /* This code path is only user initiated */
 661         if (!mm)
 662                 return -ENODEV;
 663
 664         if (batch->size) {
 665                 /* Leftover pages in batch from an earlier call. */
 666                 *pfn_base = page_to_pfn(batch->pages[batch->offset]);
 667                 pfn = *pfn_base;
 668                 rsvd = is_invalid_reserved_pfn(*pfn_base);
 669         } else {
 670                 *pfn_base = 0;
 671         }
 672
 673         while (npage) {
 674                 if (!batch->size) {
 675                         /* Empty batch, so refill it. */
 676                         long req_pages = min_t(long, npage, batch->capacity);
 677
 678                         ret = vaddr_get_pfns(mm, vaddr, req_pages, dma->prot,
 679                                              &pfn, batch->pages);
 680                         if (ret < 0)
 681                                 goto unpin_out;
 682
 683                         batch->size = ret;
 684                         batch->offset = 0;
 685
 686                         if (!*pfn_base) {
 687                                 *pfn_base = pfn;
 688                                 rsvd = is_invalid_reserved_pfn(*pfn_base);
 689                         }
 690                 }
 691
 692                 /*
 693                  * pfn is preset for the first iteration of this inner loop and
 694                  * updated at the end to handle a VM_PFNMAP pfn.  In that case,
 695                  * batch->pages isn't valid (there's no struct page), so allow
 696                  * batch->pages to be touched only when there's more than one
 697                  * pfn to check, which guarantees the pfns are from a
 698                  * !VM_PFNMAP vma.
 699                  */
 700                 while (true) {
 701                         if (pfn != *pfn_base + pinned ||
 702                             rsvd != is_invalid_reserved_pfn(pfn))
 703                                 goto out;
 704
 705                         /*
 706                          * Reserved pages aren't counted against the user,
 707                          * externally pinned pages are already counted against
 708                          * the user.
 709                          */
 710                         if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 711                                 if (!dma->lock_cap &&
 712                                     mm->locked_vm + lock_acct + 1 > limit) {
 713                                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 714                                                 __func__, limit << PAGE_SHIFT);
 715                                         ret = -ENOMEM;
 716                                         goto unpin_out;
 717                                 }
 718                                 lock_acct++;
 719                         }
 720
 721                         pinned++;
 722                         npage--;
 723                         vaddr += PAGE_SIZE;
 724                         iova += PAGE_SIZE;
 725                         batch->offset++;
 726                         batch->size--;
 727
 728                         if (!batch->size)
 729                                 break;
 730
 731                         pfn = page_to_pfn(batch->pages[batch->offset]);
 732                 }
 733
 734                 if (unlikely(disable_hugepages))
 735                         break;
 736         }
 737
 738 out:
 739         ret = vfio_lock_acct(dma, lock_acct, false);
 740
 741 unpin_out:
 742         if (ret < 0) {
 743                 if (pinned && !rsvd) {
 744                         for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
 745                                 put_pfn(pfn, dma->prot);
 746                 }
 747                 vfio_batch_unpin(batch, dma);
 748
 749                 return ret;
 750         }
 751
 752         return pinned;
 753 }
 754
 755 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 756                                     unsigned long pfn, long npage,
 757                                     bool do_accounting)
 758 {
 759         long unlocked = 0, locked = 0;
 760         long i;
 761
 762         for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
 763                 if (put_pfn(pfn++, dma->prot)) {
 764                         unlocked++;
 765                         if (vfio_find_vpfn(dma, iova))
 766                                 locked++;
 767                 }
 768         }
 769
 770         if (do_accounting)
 771                 vfio_lock_acct(dma, locked - unlocked, true);
 772
 773         return unlocked;
 774 }
 775
 776 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 777                                   unsigned long *pfn_base, bool do_accounting)
 778 {
 779         struct page *pages[1];
 780         struct mm_struct *mm;
 781         int ret;
 782
 783         mm = get_task_mm(dma->task);
 784         if (!mm)
 785                 return -ENODEV;
 786
 787         ret = vaddr_get_pfns(mm, vaddr, 1, dma->prot, pfn_base, pages);
 788         if (ret != 1)
 789                 goto out;
 790
 791         ret = 0;
 792
 793         if (do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
 794                 ret = vfio_lock_acct(dma, 1, true);
 795                 if (ret) {
 796                         put_pfn(*pfn_base, dma->prot);
 797                         if (ret == -ENOMEM)
 798                                 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
 799                                         "(%ld) exceeded\n", __func__,
 800                                         dma->task->comm, task_pid_nr(dma->task),
 801                                         task_rlimit(dma->task, RLIMIT_MEMLOCK));
 802                 }
 803         }
 804
 805 out:
 806         mmput(mm);
 807         return ret;
 808 }
 809
 810 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 811                                     bool do_accounting)
 812 {
 813         int unlocked;
 814         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 815
 816         if (!vpfn)
 817                 return 0;
 818
 819         unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 820
 821         if (do_accounting)
 822                 vfio_lock_acct(dma, -unlocked, true);
 823
 824         return unlocked;
 825 }
 826
/*
 * Pin the pages named by the IOVA page frame numbers in @user_pfn and store
 * the resulting host pfns in @phys_pfn, one slot per requested page.
 *
 * @iommu_data:  the struct vfio_iommu for the container
 * @iommu_group: group whose pinned_page_dirty_scope flag is set on success
 * @user_pfn:    array of @npage IOVA page frame numbers
 * @npage:       number of entries in @user_pfn / @phys_pfn
 * @prot:        protection bits that the covering vfio_dma must include
 * @phys_pfn:    output array of host pfns
 *
 * Returns the number of pages processed on success.  Returns -EINVAL for a
 * NULL argument, an empty notifier list or an iova without a vfio_dma,
 * -EACCES for a non-v2 container, -EPERM when the vfio_dma protection does
 * not cover @prot, or the error reported by a helper.  When the main loop
 * fails, every page taken by this call is released again and the
 * corresponding @phys_pfn slots are zeroed.
 */
static int vfio_iommu_type1_pin_pages(void *iommu_data,
                                      struct iommu_group *iommu_group,
                                      unsigned long *user_pfn,
                                      int npage, int prot,
                                      unsigned long *phys_pfn)
{
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_group *group;
        int i, j, ret;
        unsigned long remote_vaddr;
        struct vfio_dma *dma;
        bool do_accounting;
        dma_addr_t iova;

        if (!iommu || !user_pfn || !phys_pfn)
                return -EINVAL;

        /* Supported for v2 version only */
        if (!iommu->v2)
                return -EACCES;

        mutex_lock(&iommu->lock);

        /*
         * Wait for all necessary vaddr's to be valid so they can be used in
         * the main loop without dropping the lock, to avoid racing vs unmap.
         * The scan restarts from the first page whenever the helper reports
         * that it had to wait.
         */
again:
        if (iommu->vaddr_invalid_count) {
                for (i = 0; i < npage; i++) {
                        iova = user_pfn[i] << PAGE_SHIFT;
                        ret = vfio_find_dma_valid(iommu, iova, PAGE_SIZE, &dma);
                        if (ret < 0)
                                goto pin_done;
                        if (ret == WAITED)
                                goto again;
                }
        }

        /* Fail if notifier list is empty */
        if (!iommu->notifier.head) {
                ret = -EINVAL;
                goto pin_done;
        }

        /*
         * If an iommu capable domain exists in the container then all pages
         * are already pinned and accounted. Accounting should be done if there
         * is no iommu capable domain in the container.
         */
        do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);

        for (i = 0; i < npage; i++) {
                struct vfio_pfn *vpfn;

                iova = user_pfn[i] << PAGE_SHIFT;
                dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
                if (!dma) {
                        ret = -EINVAL;
                        goto pin_unwind;
                }

                if ((dma->prot & prot) != prot) {
                        ret = -EPERM;
                        goto pin_unwind;
                }

                /* Reuse an existing vfio_pfn entry for this iova if any. */
                vpfn = vfio_iova_get_vfio_pfn(dma, iova);
                if (vpfn) {
                        phys_pfn[i] = vpfn->pfn;
                        continue;
                }

                remote_vaddr = dma->vaddr + (iova - dma->iova);
                ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
                                             do_accounting);
                if (ret)
                        goto pin_unwind;

                ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
                if (ret) {
                        /*
                         * The page was pinned but could not be tracked, so it
                         * is released here; pin_unwind only covers the pages
                         * before index i.
                         */
                        if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
                                vfio_lock_acct(dma, -1, true);
                        goto pin_unwind;
                }

                if (iommu->dirty_page_tracking) {
                        unsigned long pgshift = __ffs(iommu->pgsize_bitmap);

                        /*
                         * Bitmap populated with the smallest supported page
                         * size
                         */
                        bitmap_set(dma->bitmap,
                                   (iova - dma->iova) >> pgshift, 1);
                }
        }
        ret = i;

        /*
         * Record that this group pins pages, and drop it from the count of
         * non-pinned groups the first time that happens.
         */
        group = vfio_iommu_find_iommu_group(iommu, iommu_group);
        if (!group->pinned_page_dirty_scope) {
                group->pinned_page_dirty_scope = true;
                iommu->num_non_pinned_groups--;
        }

        goto pin_done;

pin_unwind:
        /* Clear the failing slot, then undo pages [0, i) taken by this call. */
        phys_pfn[i] = 0;
        for (j = 0; j < i; j++) {
                dma_addr_t iova;

                iova = user_pfn[j] << PAGE_SHIFT;
                dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
                vfio_unpin_page_external(dma, iova, do_accounting);
                phys_pfn[j] = 0;
        }
pin_done:
        mutex_unlock(&iommu->lock);
        return ret;
}
 948
 949 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 950                                         unsigned long *user_pfn,
 951                                         int npage)
 952 {
 953         struct vfio_iommu *iommu = iommu_data;
 954         bool do_accounting;
 955         int i;
 956
 957         if (!iommu || !user_pfn)
 958                 return -EINVAL;
 959
 960         /* Supported for v2 version only */
 961         if (!iommu->v2)
 962                 return -EACCES;
 963
 964         mutex_lock(&iommu->lock);
 965
 966         do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 967         for (i = 0; i < npage; i++) {
 968                 struct vfio_dma *dma;
 969                 dma_addr_t iova;
 970
 971                 iova = user_pfn[i] << PAGE_SHIFT;
 972                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 973                 if (!dma)
 974                         goto unpin_exit;
 975                 vfio_unpin_page_external(dma, iova, do_accounting);
 976         }
 977
 978 unpin_exit:
 979         mutex_unlock(&iommu->lock);
 980         return i > npage ? npage : (i > 0 ? i : -EINVAL);
 981 }
 982
 983 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
 984                             struct list_head *regions,
 985                             struct iommu_iotlb_gather *iotlb_gather)
 986 {
 987         long unlocked = 0;
 988         struct vfio_regions *entry, *next;
 989
 990         iommu_iotlb_sync(domain->domain, iotlb_gather);
 991
 992         list_for_each_entry_safe(entry, next, regions, list) {
 993                 unlocked += vfio_unpin_pages_remote(dma,
 994                                                     entry->iova,
 995                                                     entry->phys >> PAGE_SHIFT,
 996                                                     entry->len >> PAGE_SHIFT,
 997                                                     false);
 998                 list_del(&entry->list);
 999                 kfree(entry);
1000         }
1001
1002         cond_resched();
1003
1004         return unlocked;
1005 }
1006
/*
 * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 * Therefore, when using the IOTLB flush sync interface, VFIO needs to keep
 * track of these regions (currently using a list).
 *
 * This value specifies the maximum number of regions for each IOTLB flush
 * sync.
 */
1014 #define VFIO_IOMMU_TLB_SYNC_MAX         512
1015
1016 static size_t unmap_unpin_fast(struct vfio_domain *domain,
1017                                struct vfio_dma *dma, dma_addr_t *iova,
1018                                size_t len, phys_addr_t phys, long *unlocked,
1019                                struct list_head *unmapped_list,
1020                                int *unmapped_cnt,
1021                                struct iommu_iotlb_gather *iotlb_gather)
1022 {
1023         size_t unmapped = 0;
1024         struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
1025
1026         if (entry) {
1027                 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
1028                                             iotlb_gather);
1029
1030                 if (!unmapped) {
1031                         kfree(entry);
1032                 } else {
1033                         entry->iova = *iova;
1034                         entry->phys = phys;
1035                         entry->len  = unmapped;
1036                         list_add_tail(&entry->list, unmapped_list);
1037
1038                         *iova += unmapped;
1039                         (*unmapped_cnt)++;
1040                 }
1041         }
1042
1043         /*
1044          * Sync if the number of fast-unmap regions hits the limit
1045          * or in case of errors.
1046          */
1047         if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
1048                 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
1049                                              iotlb_gather);
1050                 *unmapped_cnt = 0;
1051         }
1052
1053         return unmapped;
1054 }
1055
1056 static size_t unmap_unpin_slow(struct vfio_domain *domain,
1057                                struct vfio_dma *dma, dma_addr_t *iova,
1058                                size_t len, phys_addr_t phys,
1059                                long *unlocked)
1060 {
1061         size_t unmapped = iommu_unmap(domain->domain, *iova, len);
1062
1063         if (unmapped) {
1064                 *unlocked += vfio_unpin_pages_remote(dma, *iova,
1065                                                      phys >> PAGE_SHIFT,
1066                                                      unmapped >> PAGE_SHIFT,
1067                                                      false);
1068                 *iova += unmapped;
1069                 cond_resched();
1070         }
1071         return unmapped;
1072 }
1073
/*
 * Remove the IOMMU translations covering @dma from every domain in the
 * container and unpin the backing pages.
 *
 * Returns 0 immediately when @dma is empty or the container has no IOMMU
 * capable domain.  Otherwise, with @do_accounting set the unpinned count is
 * handed to vfio_lock_acct() and 0 is returned; without it the count is
 * returned to the caller.
 */
static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
                             bool do_accounting)
{
        dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
        struct vfio_domain *domain, *d;
        LIST_HEAD(unmapped_region_list);
        struct iommu_iotlb_gather iotlb_gather;
        int unmapped_region_cnt = 0;
        long unlocked = 0;

        if (!dma->size)
                return 0;

        if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
                return 0;

        /*
         * We use the IOMMU to track the physical addresses, otherwise we'd
         * need a much more complicated tracking system.  Unfortunately that
         * means we need to use one of the iommu domains to figure out the
         * pfns to unpin.  The rest need to be unmapped in advance so we have
         * no iommu translations remaining when the pages are unpinned.
         */
        domain = d = list_first_entry(&iommu->domain_list,
                                      struct vfio_domain, next);

        /* Unmap from every domain after the first one. */
        list_for_each_entry_continue(d, &iommu->domain_list, next) {
                iommu_unmap(d->domain, dma->iova, dma->size);
                cond_resched();
        }

        iommu_iotlb_gather_init(&iotlb_gather);
        while (iova < end) {
                size_t unmapped, len;
                phys_addr_t phys, next;

                /* Skip (with a warning) any page that has no translation. */
                phys = iommu_iova_to_phys(domain->domain, iova);
                if (WARN_ON(!phys)) {
                        iova += PAGE_SIZE;
                        continue;
                }

                /*
                 * To optimize for fewer iommu_unmap() calls, each of which
                 * may require hardware cache flushing, try to find the
                 * largest contiguous physical memory chunk to unmap.
                 * The scan is skipped when domain->fgsp is set.
                 */
                for (len = PAGE_SIZE;
                     !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
                        next = iommu_iova_to_phys(domain->domain, iova + len);
                        if (next != phys + len)
                                break;
                }

                /*
                 * First, try to use fast unmap/unpin. In case of failure,
                 * switch to slow unmap/unpin path.
                 */
                unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
                                            &unlocked, &unmapped_region_list,
                                            &unmapped_region_cnt,
                                            &iotlb_gather);
                if (!unmapped) {
                        unmapped = unmap_unpin_slow(domain, dma, &iova, len,
                                                    phys, &unlocked);
                        /* Neither path made progress: give up on the rest. */
                        if (WARN_ON(!unmapped))
                                break;
                }
        }

        dma->iommu_mapped = false;

        /* Flush and unpin whatever the fast path still has queued. */
        if (unmapped_region_cnt) {
                unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
                                            &iotlb_gather);
        }

        if (do_accounting) {
                vfio_lock_acct(dma, -unlocked, true);
                return 0;
        }
        return unlocked;
}
1157
/*
 * Tear down @dma completely: unmap and unpin it (with accounting), unlink
 * it from @iommu, drop the task reference, free its dirty bitmap and the
 * structure itself, and return its slot to iommu->dma_avail.
 *
 * Warns if @dma still has entries in its pfn_list.
 */
static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
{
        WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
        vfio_unmap_unpin(iommu, dma, true);
        vfio_unlink_dma(iommu, dma);
        put_task_struct(dma->task);
        vfio_dma_bitmap_free(dma);
        /*
         * A mapping removed while its vaddr is invalid no longer counts
         * towards vaddr_invalid_count; wake anyone sleeping on vaddr_wait.
         */
        if (dma->vaddr_invalid) {
                iommu->vaddr_invalid_count--;
                wake_up_all(&iommu->vaddr_wait);
        }
        kfree(dma);
        iommu->dma_avail++;
}
1172
1173 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1174 {
1175         struct vfio_domain *domain;
1176
1177         iommu->pgsize_bitmap = ULONG_MAX;
1178
1179         list_for_each_entry(domain, &iommu->domain_list, next)
1180                 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1181
1182         /*
1183          * In case the IOMMU supports page sizes smaller than PAGE_SIZE
1184          * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1185          * That way the user will be able to map/unmap buffers whose size/
1186          * start address is aligned with PAGE_SIZE. Pinning code uses that
1187          * granularity while iommu driver can use the sub-PAGE_SIZE size
1188          * to map the buffer.
1189          */
1190         if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1191                 iommu->pgsize_bitmap &= PAGE_MASK;
1192                 iommu->pgsize_bitmap |= PAGE_SIZE;
1193         }
1194 }
1195
/*
 * Copy the dirty bitmap of @dma into the user bitmap @bitmap, which starts
 * at @base_iova and uses one bit per @pgsize page.
 *
 * The bit position of @dma inside the user bitmap is
 * (dma->iova - base_iova) >> pgshift.  When that position is not a multiple
 * of BITS_PER_LONG, dma->bitmap is shifted left in place and merged with the
 * bits already present in the first user word before being copied out.
 * dma->bitmap is therefore modified by this call.
 *
 * Returns 0 on success or -EFAULT if a user copy fails.
 *
 * NOTE(review): copy_offset counts unsigned longs while @bitmap is a u64
 * pointer; the two agree only when BITS_PER_LONG == 64 - confirm that
 * 32-bit builds are not a target.
 */
static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
                              struct vfio_dma *dma, dma_addr_t base_iova,
                              size_t pgsize)
{
        unsigned long pgshift = __ffs(pgsize);
        unsigned long nbits = dma->size >> pgshift;
        unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
        unsigned long copy_offset = bit_offset / BITS_PER_LONG;
        unsigned long shift = bit_offset % BITS_PER_LONG;
        unsigned long leftover;

        /*
         * mark all pages dirty if any IOMMU capable device is not able
         * to report dirty pages and all pages are pinned and mapped.
         */
        if (iommu->num_non_pinned_groups && dma->iommu_mapped)
                bitmap_set(dma->bitmap, 0, nbits);

        if (shift) {
                /* Align dma->bitmap with the bit position in the user word. */
                bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
                                  nbits + shift);

                /* Fetch the user word that the first bits will share. */
                if (copy_from_user(&leftover,
                                   (void __user *)(bitmap + copy_offset),
                                   sizeof(leftover)))
                        return -EFAULT;

                /* Keep the low 'shift' bits that were already in that word. */
                bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
        }

        if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
                         DIRTY_BITMAP_BYTES(nbits + shift)))
                return -EFAULT;

        return 0;
}
1232
1233 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1234                                   dma_addr_t iova, size_t size, size_t pgsize)
1235 {
1236         struct vfio_dma *dma;
1237         struct rb_node *n;
1238         unsigned long pgshift = __ffs(pgsize);
1239         int ret;
1240
1241         /*
1242          * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1243          * vfio_dma mappings may be clubbed by specifying large ranges, but
1244          * there must not be any previous mappings bisected by the range.
1245          * An error will be returned if these conditions are not met.
1246          */
1247         dma = vfio_find_dma(iommu, iova, 1);
1248         if (dma && dma->iova != iova)
1249                 return -EINVAL;
1250
1251         dma = vfio_find_dma(iommu, iova + size - 1, 0);
1252         if (dma && dma->iova + dma->size != iova + size)
1253                 return -EINVAL;
1254
1255         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1256                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1257
1258                 if (dma->iova < iova)
1259                         continue;
1260
1261                 if (dma->iova > iova + size - 1)
1262                         break;
1263
1264                 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1265                 if (ret)
1266                         return ret;
1267
1268                 /*
1269                  * Re-populate bitmap to include all pinned pages which are
1270                  * considered as dirty but exclude pages which are unpinned and
1271                  * pages which are marked dirty by vfio_dma_rw()
1272                  */
1273                 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1274                 vfio_dma_populate_bitmap(dma, pgsize);
1275         }
1276         return 0;
1277 }
1278
1279 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1280 {
1281         if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1282             (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1283                 return -EINVAL;
1284
1285         return 0;
1286 }
1287
/*
 * Handle a DMA unmap request for the range described by @unmap.
 *
 * Depending on unmap->flags this either removes the vfio_dma mappings in the
 * range, or (VFIO_DMA_UNMAP_FLAG_VADDR) only marks their vaddr invalid.
 * VFIO_DMA_UNMAP_FLAG_ALL selects every mapping and requires iova and size
 * to be zero.  With VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP the dirty bits of
 * each mapping are copied to @bitmap before it is removed.
 *
 * On return unmap->size holds the number of bytes processed.  Returns 0 on
 * success, -EINVAL for a malformed request, or the error returned by
 * update_user_bitmap().
 */
static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
                             struct vfio_iommu_type1_dma_unmap *unmap,
                             struct vfio_bitmap *bitmap)
{
        struct vfio_dma *dma, *dma_last = NULL;
        size_t unmapped = 0, pgsize;
        int ret = -EINVAL, retries = 0;
        unsigned long pgshift;
        dma_addr_t iova = unmap->iova;
        u64 size = unmap->size;
        bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
        bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
        struct rb_node *n, *first_n;

        mutex_lock(&iommu->lock);

        /* Requests are validated against the smallest supported page size. */
        pgshift = __ffs(iommu->pgsize_bitmap);
        pgsize = (size_t)1 << pgshift;

        if (iova & (pgsize - 1))
                goto unlock;

        if (unmap_all) {
                if (iova || size)
                        goto unlock;
                size = U64_MAX;
        } else if (!size || size & (pgsize - 1) ||
                   iova + size - 1 < iova || size > SIZE_MAX) {
                goto unlock;
        }

        /* When dirty tracking is enabled, allow only min supported pgsize */
        if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
            (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
                goto unlock;
        }

        WARN_ON((pgsize - 1) & PAGE_MASK);
again:
        /*
         * vfio-iommu-type1 (v1) - User mappings were coalesced together to
         * avoid tracking individual mappings.  This means that the granularity
         * of the original mapping was lost and the user was allowed to attempt
         * to unmap any range.  Depending on the contiguousness of physical
         * memory and page sizes supported by the IOMMU, arbitrary unmaps may
         * or may not have worked.  We only guaranteed unmap granularity
         * matching the original mapping; even though it was untracked here,
         * the original mappings are reflected in IOMMU mappings.  This
         * resulted in a couple unusual behaviors.  First, if a range is not
         * able to be unmapped, ex. a set of 4k pages that was mapped as a
         * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
         * a zero sized unmap.  Also, if an unmap request overlaps the first
         * address of a hugepage, the IOMMU will unmap the entire hugepage.
         * This also returns success and the returned unmap size reflects the
         * actual size unmapped.
         *
         * We attempt to maintain compatibility with this "v1" interface, but
         * we take control out of the hands of the IOMMU.  Therefore, an unmap
         * request offset from the beginning of the original mapping will
         * return success with zero sized unmap.  And an unmap request covering
         * the first iova of mapping will unmap the entire range.
         *
         * The v2 version of this interface intends to be more deterministic.
         * Unmap requests must fully cover previous mappings.  Multiple
         * mappings may still be unmapped by specifying large ranges, but there
         * must not be any previous mappings bisected by the range.  An error
         * will be returned if these conditions are not met.  The v2 interface
         * will only return success and a size of zero if there were no
         * mappings within the range.
         */
        if (iommu->v2 && !unmap_all) {
                dma = vfio_find_dma(iommu, iova, 1);
                if (dma && dma->iova != iova)
                        goto unlock;

                dma = vfio_find_dma(iommu, iova + size - 1, 0);
                if (dma && dma->iova + dma->size != iova + size)
                        goto unlock;
        }

        ret = 0;
        n = first_n = vfio_find_dma_first_node(iommu, iova, size);

        while (n) {
                dma = rb_entry(n, struct vfio_dma, node);
                /*
                 * NOTE(review): iova + size can wrap to 0 when the range ends
                 * at the very top of the address space (the entry checks only
                 * reject a wrap of iova + size - 1); confirm this comparison
                 * behaves as intended in that case.
                 */
                if (dma->iova >= iova + size)
                        break;

                if (!iommu->v2 && iova > dma->iova)
                        break;
                /*
                 * Task with same address space who mapped this iova range is
                 * allowed to unmap the iova range.
                 */
                if (dma->task->mm != current->mm)
                        break;

                if (invalidate_vaddr) {
                        if (dma->vaddr_invalid) {
                                struct rb_node *last_n = n;

                                /*
                                 * Already invalid: roll back the mappings
                                 * invalidated earlier in this walk and fail.
                                 */
                                for (n = first_n; n != last_n; n = rb_next(n)) {
                                        dma = rb_entry(n,
                                                       struct vfio_dma, node);
                                        dma->vaddr_invalid = false;
                                        iommu->vaddr_invalid_count--;
                                }
                                ret = -EINVAL;
                                unmapped = 0;
                                break;
                        }
                        dma->vaddr_invalid = true;
                        iommu->vaddr_invalid_count++;
                        unmapped += dma->size;
                        n = rb_next(n);
                        continue;
                }

                if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
                        struct vfio_iommu_type1_dma_unmap nb_unmap;

                        /* Bound the number of retries on the same mapping. */
                        if (dma_last == dma) {
                                BUG_ON(++retries > 10);
                        } else {
                                dma_last = dma;
                                retries = 0;
                        }

                        nb_unmap.iova = dma->iova;
                        nb_unmap.size = dma->size;

                        /*
                         * Notify anyone (mdev vendor drivers) to invalidate and
                         * unmap iovas within the range we're about to unmap.
                         * Vendor drivers MUST unpin pages in response to an
                         * invalidation.
                         *
                         * The lock is dropped around the notifier call, so the
                         * walk restarts from 'again' afterwards.
                         */
                        mutex_unlock(&iommu->lock);
                        blocking_notifier_call_chain(&iommu->notifier,
                                                    VFIO_IOMMU_NOTIFY_DMA_UNMAP,
                                                    &nb_unmap);
                        mutex_lock(&iommu->lock);
                        goto again;
                }

                if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
                        ret = update_user_bitmap(bitmap->data, iommu, dma,
                                                 iova, pgsize);
                        if (ret)
                                break;
                }

                unmapped += dma->size;
                /* Advance before vfio_remove_dma() frees the current entry. */
                n = rb_next(n);
                vfio_remove_dma(iommu, dma);
        }

unlock:
        mutex_unlock(&iommu->lock);

        /* Report how much was unmapped */
        unmap->size = unmapped;

        return ret;
}
1453
1454 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1455                           unsigned long pfn, long npage, int prot)
1456 {
1457         struct vfio_domain *d;
1458         int ret;
1459
1460         list_for_each_entry(d, &iommu->domain_list, next) {
1461                 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1462                                 npage << PAGE_SHIFT, prot | d->prot);
1463                 if (ret)
1464                         goto unwind;
1465
1466                 cond_resched();
1467         }
1468
1469         return 0;
1470
1471 unwind:
1472         list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1473                 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1474                 cond_resched();
1475         }
1476
1477         return ret;
1478 }
1479
/*
 * Pin and IOMMU-map @map_size bytes for @dma, one contiguous chunk at a
 * time.  dma->size grows as chunks are mapped, so dma->size is also the
 * offset of the next chunk from dma->vaddr / dma->iova.
 *
 * Returns 0 on success.  On failure the error from pinning or mapping is
 * returned and @dma has been removed via vfio_remove_dma(), so the caller
 * must not use it afterwards.
 */
static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
                            size_t map_size)
{
        dma_addr_t iova = dma->iova;
        unsigned long vaddr = dma->vaddr;
        struct vfio_batch batch;
        size_t size = map_size;
        long npage;
        unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        int ret = 0;

        vfio_batch_init(&batch);

        while (size) {
                /* Pin a contiguous chunk of memory */
                npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
                                              size >> PAGE_SHIFT, &pfn, limit,
                                              &batch);
                if (npage <= 0) {
                        /* Zero pages pinned is unexpected; warn about it. */
                        WARN_ON(!npage);
                        ret = (int)npage;
                        break;
                }

                /* Map it! */
                ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
                                     dma->prot);
                if (ret) {
                        /* Undo the pin for the chunk that failed to map. */
                        vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
                                                npage, true);
                        vfio_batch_unpin(&batch, dma);
                        break;
                }

                size -= npage << PAGE_SHIFT;
                dma->size += npage << PAGE_SHIFT;
        }

        vfio_batch_fini(&batch);
        /*
         * Marked as mapped unconditionally; on error vfio_remove_dma() below
         * tears the entry down.
         */
        dma->iommu_mapped = true;

        if (ret)
                vfio_remove_dma(iommu, dma);

        return ret;
}
1526
1527 /*
1528  * Check dma map request is within a valid iova range
1529  */
1530 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1531                                       dma_addr_t start, dma_addr_t end)
1532 {
1533         struct list_head *iova = &iommu->iova_list;
1534         struct vfio_iova *node;
1535
1536         list_for_each_entry(node, iova, list) {
1537                 if (start >= node->start && end <= node->end)
1538                         return true;
1539         }
1540
1541         /*
1542          * Check for list_empty() as well since a container with
1543          * a single mdev device will have an empty list.
1544          */
1545         return list_empty(iova);
1546 }
1547
/*
 * Handle a DMA map request described by @map.
 *
 * With VFIO_DMA_MAP_FLAG_READ and/or VFIO_DMA_MAP_FLAG_WRITE a new vfio_dma
 * is created for [iova, iova + size) and, if the container has an IOMMU
 * capable domain, pinned and mapped.  With VFIO_DMA_MAP_FLAG_VADDR the vaddr
 * of an existing, exactly matching mapping whose vaddr is invalid is
 * replaced instead.  The two modes are mutually exclusive.
 *
 * Returns 0 on success, or:
 *   -EINVAL  truncated fields, bad flag combination, misaligned or wrapping
 *            range, range outside the valid iova list, or a VADDR request
 *            that does not match the existing mapping
 *   -ENOENT  VADDR request with no mapping in the range
 *   -EEXIST  the range overlaps an existing mapping
 *   -ENOSPC  no dma entries left (iommu->dma_avail is zero)
 *   -ENOMEM  allocation failure
 * or the error from vfio_pin_map_dma() / vfio_dma_bitmap_alloc().
 */
static int vfio_dma_do_map(struct vfio_iommu *iommu,
                           struct vfio_iommu_type1_dma_map *map)
{
        bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
        dma_addr_t iova = map->iova;
        unsigned long vaddr = map->vaddr;
        size_t size = map->size;
        int ret = 0, prot = 0;
        size_t pgsize;
        struct vfio_dma *dma;

        /* Verify that none of our __u64 fields overflow */
        if (map->size != size || map->vaddr != vaddr || map->iova != iova)
                return -EINVAL;

        /* READ/WRITE from device perspective */
        if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
                prot |= IOMMU_WRITE;
        if (map->flags & VFIO_DMA_MAP_FLAG_READ)
                prot |= IOMMU_READ;

        /* Exactly one of "has protection bits" and "set vaddr" must hold. */
        if ((prot && set_vaddr) || (!prot && !set_vaddr))
                return -EINVAL;

        mutex_lock(&iommu->lock);

        pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);

        WARN_ON((pgsize - 1) & PAGE_MASK);

        if (!size || (size | iova | vaddr) & (pgsize - 1)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        /* Don't allow IOVA or virtual address wrap */
        if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
                ret = -EINVAL;
                goto out_unlock;
        }

        dma = vfio_find_dma(iommu, iova, size);
        if (set_vaddr) {
                if (!dma) {
                        ret = -ENOENT;
                } else if (!dma->vaddr_invalid || dma->iova != iova ||
                           dma->size != size) {
                        ret = -EINVAL;
                } else {
                        /* Install the new vaddr and wake any waiters. */
                        dma->vaddr = vaddr;
                        dma->vaddr_invalid = false;
                        iommu->vaddr_invalid_count--;
                        wake_up_all(&iommu->vaddr_wait);
                }
                goto out_unlock;
        } else if (dma) {
                ret = -EEXIST;
                goto out_unlock;
        }

        if (!iommu->dma_avail) {
                ret = -ENOSPC;
                goto out_unlock;
        }

        if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
                ret = -EINVAL;
                goto out_unlock;
        }

        dma = kzalloc(sizeof(*dma), GFP_KERNEL);
        if (!dma) {
                ret = -ENOMEM;
                goto out_unlock;
        }

        iommu->dma_avail--;
        dma->iova = iova;
        dma->vaddr = vaddr;
        dma->prot = prot;

        /*
         * We need to be able to both add to a task's locked memory and test
         * against the locked memory limit and we need to be able to do both
         * outside of this call path as pinning can be asynchronous via the
         * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
         * task_struct and VM locked pages requires an mm_struct, however
         * holding an indefinite mm reference is not recommended, therefore we
         * only hold a reference to a task.  We could hold a reference to
         * current, however QEMU uses this call path through vCPU threads,
         * which can be killed resulting in a NULL mm and failure in the unmap
         * path when called via a different thread.  Avoid this problem by
         * using the group_leader as threads within the same group require
         * both CLONE_THREAD and CLONE_VM and will therefore use the same
         * mm_struct.
         *
         * Previously we also used the task for testing CAP_IPC_LOCK at the
         * time of pinning and accounting, however has_capability() makes use
         * of real_cred, a copy-on-write field, so we can't guarantee that it
         * matches group_leader, or in fact that it might not change by the
         * time it's evaluated.  If a process were to call MAP_DMA with
         * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
         * possibly see different results for an iommu_mapped vfio_dma vs
         * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
         * time of calling MAP_DMA.
         */
        get_task_struct(current->group_leader);
        dma->task = current->group_leader;
        dma->lock_cap = capable(CAP_IPC_LOCK);

        dma->pfn_list = RB_ROOT;

        /* Insert zero-sized and grow as we map chunks of it */
        vfio_link_dma(iommu, dma);

        /* Don't pin and map if container doesn't contain IOMMU capable domain */
        if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
                dma->size = size;
        else
                ret = vfio_pin_map_dma(iommu, dma, size);

        /*
         * A mapping created while dirty tracking is on needs its own bitmap;
         * if that allocation fails the new mapping is removed again.
         */
        if (!ret && iommu->dirty_page_tracking) {
                ret = vfio_dma_bitmap_alloc(dma, pgsize);
                if (ret)
                        vfio_remove_dma(iommu, dma);
        }

out_unlock:
        mutex_unlock(&iommu->lock);
        return ret;
}
1679
1680 static int vfio_bus_type(struct device *dev, void *data)
1681 {
1682         struct bus_type **bus = data;
1683
1684         if (*bus && *bus != dev->bus)
1685                 return -EINVAL;
1686
1687         *bus = dev->bus;
1688
1689         return 0;
1690 }
1691
1692 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1693                              struct vfio_domain *domain)
1694 {
1695         struct vfio_batch batch;
1696         struct vfio_domain *d = NULL;
1697         struct rb_node *n;
1698         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1699         int ret;
1700
1701         ret = vfio_wait_all_valid(iommu);
1702         if (ret < 0)
1703                 return ret;
1704
1705         /* Arbitrarily pick the first domain in the list for lookups */
1706         if (!list_empty(&iommu->domain_list))
1707                 d = list_first_entry(&iommu->domain_list,
1708                                      struct vfio_domain, next);
1709
1710         vfio_batch_init(&batch);
1711
1712         n = rb_first(&iommu->dma_list);
1713
1714         for (; n; n = rb_next(n)) {
1715                 struct vfio_dma *dma;
1716                 dma_addr_t iova;
1717
1718                 dma = rb_entry(n, struct vfio_dma, node);
1719                 iova = dma->iova;
1720
1721                 while (iova < dma->iova + dma->size) {
1722                         phys_addr_t phys;
1723                         size_t size;
1724
1725                         if (dma->iommu_mapped) {
1726                                 phys_addr_t p;
1727                                 dma_addr_t i;
1728
1729                                 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1730                                         ret = -EINVAL;
1731                                         goto unwind;
1732                                 }
1733
1734                                 phys = iommu_iova_to_phys(d->domain, iova);
1735
1736                                 if (WARN_ON(!phys)) {
1737                                         iova += PAGE_SIZE;
1738                                         continue;
1739                                 }
1740
1741                                 size = PAGE_SIZE;
1742                                 p = phys + size;
1743                                 i = iova + size;
1744                                 while (i < dma->iova + dma->size &&
1745                                        p == iommu_iova_to_phys(d->domain, i)) {
1746                                         size += PAGE_SIZE;
1747                                         p += PAGE_SIZE;
1748                                         i += PAGE_SIZE;
1749                                 }
1750                         } else {
1751                                 unsigned long pfn;
1752                                 unsigned long vaddr = dma->vaddr +
1753                                                      (iova - dma->iova);
1754                                 size_t n = dma->iova + dma->size - iova;
1755                                 long npage;
1756
1757                                 npage = vfio_pin_pages_remote(dma, vaddr,
1758                                                               n >> PAGE_SHIFT,
1759                                                               &pfn, limit,
1760                                                               &batch);
1761                                 if (npage <= 0) {
1762                                         WARN_ON(!npage);
1763                                         ret = (int)npage;
1764                                         goto unwind;
1765                                 }
1766
1767                                 phys = pfn << PAGE_SHIFT;
1768                                 size = npage << PAGE_SHIFT;
1769                         }
1770
1771                         ret = iommu_map(domain->domain, iova, phys,
1772                                         size, dma->prot | domain->prot);
1773                         if (ret) {
1774                                 if (!dma->iommu_mapped) {
1775                                         vfio_unpin_pages_remote(dma, iova,
1776                                                         phys >> PAGE_SHIFT,
1777                                                         size >> PAGE_SHIFT,
1778                                                         true);
1779                                         vfio_batch_unpin(&batch, dma);
1780                                 }
1781                                 goto unwind;
1782                         }
1783
1784                         iova += size;
1785                 }
1786         }
1787
1788         /* All dmas are now mapped, defer to second tree walk for unwind */
1789         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1790                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1791
1792                 dma->iommu_mapped = true;
1793         }
1794
1795         vfio_batch_fini(&batch);
1796         return 0;
1797
1798 unwind:
1799         for (; n; n = rb_prev(n)) {
1800                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1801                 dma_addr_t iova;
1802
1803                 if (dma->iommu_mapped) {
1804                         iommu_unmap(domain->domain, dma->iova, dma->size);
1805                         continue;
1806                 }
1807
1808                 iova = dma->iova;
1809                 while (iova < dma->iova + dma->size) {
1810                         phys_addr_t phys, p;
1811                         size_t size;
1812                         dma_addr_t i;
1813
1814                         phys = iommu_iova_to_phys(domain->domain, iova);
1815                         if (!phys) {
1816                                 iova += PAGE_SIZE;
1817                                 continue;
1818                         }
1819
1820                         size = PAGE_SIZE;
1821                         p = phys + size;
1822                         i = iova + size;
1823                         while (i < dma->iova + dma->size &&
1824                                p == iommu_iova_to_phys(domain->domain, i)) {
1825                                 size += PAGE_SIZE;
1826                                 p += PAGE_SIZE;
1827                                 i += PAGE_SIZE;
1828                         }
1829
1830                         iommu_unmap(domain->domain, iova, size);
1831                         vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1832                                                 size >> PAGE_SHIFT, true);
1833                 }
1834         }
1835
1836         vfio_batch_fini(&batch);
1837         return ret;
1838 }
1839
1840 /*
1841  * We change our unmap behavior slightly depending on whether the IOMMU
1842  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1843  * for practically any contiguous power-of-two mapping we give it.  This means
1844  * we don't need to look for contiguous chunks ourselves to make unmapping
1845  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1846  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1847  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1848  * hugetlbfs is in use.
1849  */
1850 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1851 {
1852         struct page *pages;
1853         int ret, order = get_order(PAGE_SIZE * 2);
1854
1855         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1856         if (!pages)
1857                 return;
1858
1859         ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1860                         IOMMU_READ | IOMMU_WRITE | domain->prot);
1861         if (!ret) {
1862                 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1863
1864                 if (unmapped == PAGE_SIZE)
1865                         iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1866                 else
1867                         domain->fgsp = true;
1868         }
1869
1870         __free_pages(pages, order);
1871 }
1872
1873 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1874                                            struct iommu_group *iommu_group)
1875 {
1876         struct vfio_group *g;
1877
1878         list_for_each_entry(g, &domain->group_list, next) {
1879                 if (g->iommu_group == iommu_group)
1880                         return g;
1881         }
1882
1883         return NULL;
1884 }
1885
1886 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1887                                                struct iommu_group *iommu_group)
1888 {
1889         struct vfio_domain *domain;
1890         struct vfio_group *group = NULL;
1891
1892         list_for_each_entry(domain, &iommu->domain_list, next) {
1893                 group = find_iommu_group(domain, iommu_group);
1894                 if (group)
1895                         return group;
1896         }
1897
1898         if (iommu->external_domain)
1899                 group = find_iommu_group(iommu->external_domain, iommu_group);
1900
1901         return group;
1902 }
1903
1904 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1905                                   phys_addr_t *base)
1906 {
1907         struct iommu_resv_region *region;
1908         bool ret = false;
1909
1910         list_for_each_entry(region, group_resv_regions, list) {
1911                 /*
1912                  * The presence of any 'real' MSI regions should take
1913                  * precedence over the software-managed one if the
1914                  * IOMMU driver happens to advertise both types.
1915                  */
1916                 if (region->type == IOMMU_RESV_MSI) {
1917                         ret = false;
1918                         break;
1919                 }
1920
1921                 if (region->type == IOMMU_RESV_SW_MSI) {
1922                         *base = region->start;
1923                         ret = true;
1924                 }
1925         }
1926
1927         return ret;
1928 }
1929
1930 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1931 {
1932         struct device *(*fn)(struct device *dev);
1933         struct device *iommu_device;
1934
1935         fn = symbol_get(mdev_get_iommu_device);
1936         if (fn) {
1937                 iommu_device = fn(dev);
1938                 symbol_put(mdev_get_iommu_device);
1939
1940                 return iommu_device;
1941         }
1942
1943         return NULL;
1944 }
1945
1946 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1947 {
1948         struct iommu_domain *domain = data;
1949         struct device *iommu_device;
1950
1951         iommu_device = vfio_mdev_get_iommu_device(dev);
1952         if (iommu_device) {
1953                 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1954                         return iommu_aux_attach_device(domain, iommu_device);
1955                 else
1956                         return iommu_attach_device(domain, iommu_device);
1957         }
1958
1959         return -EINVAL;
1960 }
1961
1962 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1963 {
1964         struct iommu_domain *domain = data;
1965         struct device *iommu_device;
1966
1967         iommu_device = vfio_mdev_get_iommu_device(dev);
1968         if (iommu_device) {
1969                 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1970                         iommu_aux_detach_device(domain, iommu_device);
1971                 else
1972                         iommu_detach_device(domain, iommu_device);
1973         }
1974
1975         return 0;
1976 }
1977
1978 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1979                                    struct vfio_group *group)
1980 {
1981         if (group->mdev_group)
1982                 return iommu_group_for_each_dev(group->iommu_group,
1983                                                 domain->domain,
1984                                                 vfio_mdev_attach_domain);
1985         else
1986                 return iommu_attach_group(domain->domain, group->iommu_group);
1987 }
1988
1989 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1990                                     struct vfio_group *group)
1991 {
1992         if (group->mdev_group)
1993                 iommu_group_for_each_dev(group->iommu_group, domain->domain,
1994                                          vfio_mdev_detach_domain);
1995         else
1996                 iommu_detach_group(domain->domain, group->iommu_group);
1997 }
1998
1999 static bool vfio_bus_is_mdev(struct bus_type *bus)
2000 {
2001         struct bus_type *mdev_bus;
2002         bool ret = false;
2003
2004         mdev_bus = symbol_get(mdev_bus_type);
2005         if (mdev_bus) {
2006                 ret = (bus == mdev_bus);
2007                 symbol_put(mdev_bus_type);
2008         }
2009
2010         return ret;
2011 }
2012
/*
 * iommu_group_for_each_dev() callback that checks all mdevs of a group
 * share one IOMMU backing device.  @data points at a struct device * that
 * starts out NULL and receives the common backing device.  Returns -EINVAL
 * when @dev has no backing device or it differs from the one recorded so
 * far, 0 otherwise.
 */
static int vfio_mdev_iommu_device(struct device *dev, void *data)
{
	struct device **common = data;
	struct device *backing = vfio_mdev_get_iommu_device(dev);

	if (!backing)
		return -EINVAL;

	if (*common && *common != backing)
		return -EINVAL;

	*common = backing;
	return 0;
}
2025
2026 /*
2027  * This is a helper function to insert an address range to iova list.
2028  * The list is initially created with a single entry corresponding to
2029  * the IOMMU domain geometry to which the device group is attached.
2030  * The list aperture gets modified when a new domain is added to the
2031  * container if the new aperture doesn't conflict with the current one
2032  * or with any existing dma mappings. The list is also modified to
2033  * exclude any reserved regions associated with the device group.
2034  */
2035 static int vfio_iommu_iova_insert(struct list_head *head,
2036                                   dma_addr_t start, dma_addr_t end)
2037 {
2038         struct vfio_iova *region;
2039
2040         region = kmalloc(sizeof(*region), GFP_KERNEL);
2041         if (!region)
2042                 return -ENOMEM;
2043
2044         INIT_LIST_HEAD(&region->list);
2045         region->start = start;
2046         region->end = end;
2047
2048         list_add_tail(&region->list, head);
2049         return 0;
2050 }
2051
2052 /*
2053  * Check the new iommu aperture conflicts with existing aper or with any
2054  * existing dma mappings.
2055  */
2056 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
2057                                      dma_addr_t start, dma_addr_t end)
2058 {
2059         struct vfio_iova *first, *last;
2060         struct list_head *iova = &iommu->iova_list;
2061
2062         if (list_empty(iova))
2063                 return false;
2064
2065         /* Disjoint sets, return conflict */
2066         first = list_first_entry(iova, struct vfio_iova, list);
2067         last = list_last_entry(iova, struct vfio_iova, list);
2068         if (start > last->end || end < first->start)
2069                 return true;
2070
2071         /* Check for any existing dma mappings below the new start */
2072         if (start > first->start) {
2073                 if (vfio_find_dma(iommu, first->start, start - first->start))
2074                         return true;
2075         }
2076
2077         /* Check for any existing dma mappings beyond the new end */
2078         if (end < last->end) {
2079                 if (vfio_find_dma(iommu, end + 1, last->end - end))
2080                         return true;
2081         }
2082
2083         return false;
2084 }
2085
2086 /*
2087  * Resize iommu iova aperture window. This is called only if the new
2088  * aperture has no conflict with existing aperture and dma mappings.
2089  */
2090 static int vfio_iommu_aper_resize(struct list_head *iova,
2091                                   dma_addr_t start, dma_addr_t end)
2092 {
2093         struct vfio_iova *node, *next;
2094
2095         if (list_empty(iova))
2096                 return vfio_iommu_iova_insert(iova, start, end);
2097
2098         /* Adjust iova list start */
2099         list_for_each_entry_safe(node, next, iova, list) {
2100                 if (start < node->start)
2101                         break;
2102                 if (start >= node->start && start < node->end) {
2103                         node->start = start;
2104                         break;
2105                 }
2106                 /* Delete nodes before new start */
2107                 list_del(&node->list);
2108                 kfree(node);
2109         }
2110
2111         /* Adjust iova list end */
2112         list_for_each_entry_safe(node, next, iova, list) {
2113                 if (end > node->end)
2114                         continue;
2115                 if (end > node->start && end <= node->end) {
2116                         node->end = end;
2117                         continue;
2118                 }
2119                 /* Delete nodes after new end */
2120                 list_del(&node->list);
2121                 kfree(node);
2122         }
2123
2124         return 0;
2125 }
2126
2127 /*
2128  * Check reserved region conflicts with existing dma mappings
2129  */
2130 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2131                                      struct list_head *resv_regions)
2132 {
2133         struct iommu_resv_region *region;
2134
2135         /* Check for conflict with existing dma mappings */
2136         list_for_each_entry(region, resv_regions, list) {
2137                 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2138                         continue;
2139
2140                 if (vfio_find_dma(iommu, region->start, region->length))
2141                         return true;
2142         }
2143
2144         return false;
2145 }
2146
2147 /*
2148  * Check iova region overlap with  reserved regions and
2149  * exclude them from the iommu iova range
2150  */
2151 static int vfio_iommu_resv_exclude(struct list_head *iova,
2152                                    struct list_head *resv_regions)
2153 {
2154         struct iommu_resv_region *resv;
2155         struct vfio_iova *n, *next;
2156
2157         list_for_each_entry(resv, resv_regions, list) {
2158                 phys_addr_t start, end;
2159
2160                 if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2161                         continue;
2162
2163                 start = resv->start;
2164                 end = resv->start + resv->length - 1;
2165
2166                 list_for_each_entry_safe(n, next, iova, list) {
2167                         int ret = 0;
2168
2169                         /* No overlap */
2170                         if (start > n->end || end < n->start)
2171                                 continue;
2172                         /*
2173                          * Insert a new node if current node overlaps with the
2174                          * reserve region to exlude that from valid iova range.
2175                          * Note that, new node is inserted before the current
2176                          * node and finally the current node is deleted keeping
2177                          * the list updated and sorted.
2178                          */
2179                         if (start > n->start)
2180                                 ret = vfio_iommu_iova_insert(&n->list, n->start,
2181                                                              start - 1);
2182                         if (!ret && end < n->end)
2183                                 ret = vfio_iommu_iova_insert(&n->list, end + 1,
2184                                                              n->end);
2185                         if (ret)
2186                                 return ret;
2187
2188                         list_del(&n->list);
2189                         kfree(n);
2190                 }
2191         }
2192
2193         if (list_empty(iova))
2194                 return -EINVAL;
2195
2196         return 0;
2197 }
2198
2199 static void vfio_iommu_resv_free(struct list_head *resv_regions)
2200 {
2201         struct iommu_resv_region *n, *next;
2202
2203         list_for_each_entry_safe(n, next, resv_regions, list) {
2204                 list_del(&n->list);
2205                 kfree(n);
2206         }
2207 }
2208
2209 static void vfio_iommu_iova_free(struct list_head *iova)
2210 {
2211         struct vfio_iova *n, *next;
2212
2213         list_for_each_entry_safe(n, next, iova, list) {
2214                 list_del(&n->list);
2215                 kfree(n);
2216         }
2217 }
2218
2219 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2220                                     struct list_head *iova_copy)
2221 {
2222         struct list_head *iova = &iommu->iova_list;
2223         struct vfio_iova *n;
2224         int ret;
2225
2226         list_for_each_entry(n, iova, list) {
2227                 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
2228                 if (ret)
2229                         goto out_free;
2230         }
2231
2232         return 0;
2233
2234 out_free:
2235         vfio_iommu_iova_free(iova_copy);
2236         return ret;
2237 }
2238
2239 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2240                                         struct list_head *iova_copy)
2241 {
2242         struct list_head *iova = &iommu->iova_list;
2243
2244         vfio_iommu_iova_free(iova);
2245
2246         list_splice_tail(iova_copy, iova);
2247 }
2248
2249 static int vfio_iommu_type1_attach_group(void *iommu_data,
2250                                          struct iommu_group *iommu_group)
2251 {
2252         struct vfio_iommu *iommu = iommu_data;
2253         struct vfio_group *group;
2254         struct vfio_domain *domain, *d;
2255         struct bus_type *bus = NULL;
2256         int ret;
2257         bool resv_msi, msi_remap;
2258         phys_addr_t resv_msi_base = 0;
2259         struct iommu_domain_geometry geo;
2260         LIST_HEAD(iova_copy);
2261         LIST_HEAD(group_resv_regions);
2262
2263         mutex_lock(&iommu->lock);
2264
2265         /* Check for duplicates */
2266         if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
2267                 mutex_unlock(&iommu->lock);
2268                 return -EINVAL;
2269         }
2270
2271         group = kzalloc(sizeof(*group), GFP_KERNEL);
2272         domain = kzalloc(sizeof(*domain), GFP_KERNEL);
2273         if (!group || !domain) {
2274                 ret = -ENOMEM;
2275                 goto out_free;
2276         }
2277
2278         group->iommu_group = iommu_group;
2279
2280         /* Determine bus_type in order to allocate a domain */
2281         ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
2282         if (ret)
2283                 goto out_free;
2284
2285         if (vfio_bus_is_mdev(bus)) {
2286                 struct device *iommu_device = NULL;
2287
2288                 group->mdev_group = true;
2289
2290                 /* Determine the isolation type */
2291                 ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
2292                                                vfio_mdev_iommu_device);
2293                 if (ret || !iommu_device) {
2294                         if (!iommu->external_domain) {
2295                                 INIT_LIST_HEAD(&domain->group_list);
2296                                 iommu->external_domain = domain;
2297                                 vfio_update_pgsize_bitmap(iommu);
2298                         } else {
2299                                 kfree(domain);
2300                         }
2301
2302                         list_add(&group->next,
2303                                  &iommu->external_domain->group_list);
2304                         /*
2305                          * Non-iommu backed group cannot dirty memory directly,
2306                          * it can only use interfaces that provide dirty
2307                          * tracking.
2308                          * The iommu scope can only be promoted with the
2309                          * addition of a dirty tracking group.
2310                          */
2311                         group->pinned_page_dirty_scope = true;
2312                         mutex_unlock(&iommu->lock);
2313
2314                         return 0;
2315                 }
2316
2317                 bus = iommu_device->bus;
2318         }
2319
2320         domain->domain = iommu_domain_alloc(bus);
2321         if (!domain->domain) {
2322                 ret = -EIO;
2323                 goto out_free;
2324         }
2325
2326         if (iommu->nesting) {
2327                 int attr = 1;
2328
2329                 ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
2330                                             &attr);
2331                 if (ret)
2332                         goto out_domain;
2333         }
2334
2335         ret = vfio_iommu_attach_group(domain, group);
2336         if (ret)
2337                 goto out_domain;
2338
2339         /* Get aperture info */
2340         iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);
2341
2342         if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
2343                                      geo.aperture_end)) {
2344                 ret = -EINVAL;
2345                 goto out_detach;
2346         }
2347
2348         ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
2349         if (ret)
2350                 goto out_detach;
2351
2352         if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
2353                 ret = -EINVAL;
2354                 goto out_detach;
2355         }
2356
2357         /*
2358          * We don't want to work on the original iova list as the list
2359          * gets modified and in case of failure we have to retain the
2360          * original list. Get a copy here.
2361          */
2362         ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
2363         if (ret)
2364                 goto out_detach;
2365
2366         ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
2367                                      geo.aperture_end);
2368         if (ret)
2369                 goto out_detach;
2370
2371         ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
2372         if (ret)
2373                 goto out_detach;
2374
2375         resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);
2376
2377         INIT_LIST_HEAD(&domain->group_list);
2378         list_add(&group->next, &domain->group_list);
2379
2380         msi_remap = irq_domain_check_msi_remap() ||
2381                     iommu_capable(bus, IOMMU_CAP_INTR_REMAP);
2382
2383         if (!allow_unsafe_interrupts && !msi_remap) {
2384                 pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2385                        __func__);
2386                 ret = -EPERM;
2387                 goto out_detach;
2388         }
2389
2390         if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
2391                 domain->prot |= IOMMU_CACHE;
2392
2393         /*
2394          * Try to match an existing compatible domain.  We don't want to
2395          * preclude an IOMMU driver supporting multiple bus_types and being
2396          * able to include different bus_types in the same IOMMU domain, so
2397          * we test whether the domains use the same iommu_ops rather than
2398          * testing if they're on the same bus_type.
2399          */
2400         list_for_each_entry(d, &iommu->domain_list, next) {
2401                 if (d->domain->ops == domain->domain->ops &&
2402                     d->prot == domain->prot) {
2403                         vfio_iommu_detach_group(domain, group);
2404                         if (!vfio_iommu_attach_group(d, group)) {
2405                                 list_add(&group->next, &d->group_list);
2406                                 iommu_domain_free(domain->domain);
2407                                 kfree(domain);
2408                                 goto done;
2409                         }
2410
2411                         ret = vfio_iommu_attach_group(domain, group);
2412                         if (ret)
2413                                 goto out_domain;
2414                 }
2415         }
2416
2417         vfio_test_domain_fgsp(domain);
2418
2419         /* replay mappings on new domains */
2420         ret = vfio_iommu_replay(iommu, domain);
2421         if (ret)
2422                 goto out_detach;
2423
2424         if (resv_msi) {
2425                 ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
2426                 if (ret && ret != -ENODEV)
2427                         goto out_detach;
2428         }
2429
2430         list_add(&domain->next, &iommu->domain_list);
2431         vfio_update_pgsize_bitmap(iommu);
2432 done:
2433         /* Delete the old one and insert new iova list */
2434         vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2435
2436         /*
2437          * An iommu backed group can dirty memory directly and therefore
2438          * demotes the iommu scope until it declares itself dirty tracking
2439          * capable via the page pinning interface.
2440          */
2441         iommu->num_non_pinned_groups++;
2442         mutex_unlock(&iommu->lock);
2443         vfio_iommu_resv_free(&group_resv_regions);
2444
2445         return 0;
2446
2447 out_detach:
2448         vfio_iommu_detach_group(domain, group);
2449 out_domain:
2450         iommu_domain_free(domain->domain);
2451         vfio_iommu_iova_free(&iova_copy);
2452         vfio_iommu_resv_free(&group_resv_regions);
2453 out_free:
2454         kfree(domain);
2455         kfree(group);
2456         mutex_unlock(&iommu->lock);
2457         return ret;
2458 }
2459
2460 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2461 {
2462         struct rb_node *node;
2463
2464         while ((node = rb_first(&iommu->dma_list)))
2465                 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2466 }
2467
2468 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2469 {
2470         struct rb_node *n, *p;
2471
2472         n = rb_first(&iommu->dma_list);
2473         for (; n; n = rb_next(n)) {
2474                 struct vfio_dma *dma;
2475                 long locked = 0, unlocked = 0;
2476
2477                 dma = rb_entry(n, struct vfio_dma, node);
2478                 unlocked += vfio_unmap_unpin(iommu, dma, false);
2479                 p = rb_first(&dma->pfn_list);
2480                 for (; p; p = rb_next(p)) {
2481                         struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2482                                                          node);
2483
2484                         if (!is_invalid_reserved_pfn(vpfn->pfn))
2485                                 locked++;
2486                 }
2487                 vfio_lock_acct(dma, locked - unlocked, true);
2488         }
2489 }
2490
2491 /*
2492  * Called when a domain is removed in detach. It is possible that
2493  * the removed domain decided the iova aperture window. Modify the
2494  * iova aperture with the smallest window among existing domains.
2495  */
2496 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2497                                    struct list_head *iova_copy)
2498 {
2499         struct vfio_domain *domain;
2500         struct iommu_domain_geometry geo;
2501         struct vfio_iova *node;
2502         dma_addr_t start = 0;
2503         dma_addr_t end = (dma_addr_t)~0;
2504
2505         if (list_empty(iova_copy))
2506                 return;
2507
2508         list_for_each_entry(domain, &iommu->domain_list, next) {
2509                 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
2510                                       &geo);
2511                 if (geo.aperture_start > start)
2512                         start = geo.aperture_start;
2513                 if (geo.aperture_end < end)
2514                         end = geo.aperture_end;
2515         }
2516
2517         /* Modify aperture limits. The new aper is either same or bigger */
2518         node = list_first_entry(iova_copy, struct vfio_iova, list);
2519         node->start = start;
2520         node = list_last_entry(iova_copy, struct vfio_iova, list);
2521         node->end = end;
2522 }
2523
2524 /*
2525  * Called when a group is detached. The reserved regions for that
2526  * group can be part of valid iova now. But since reserved regions
2527  * may be duplicated among groups, populate the iova valid regions
2528  * list again.
2529  */
2530 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2531                                    struct list_head *iova_copy)
2532 {
2533         struct vfio_domain *d;
2534         struct vfio_group *g;
2535         struct vfio_iova *node;
2536         dma_addr_t start, end;
2537         LIST_HEAD(resv_regions);
2538         int ret;
2539
2540         if (list_empty(iova_copy))
2541                 return -EINVAL;
2542
2543         list_for_each_entry(d, &iommu->domain_list, next) {
2544                 list_for_each_entry(g, &d->group_list, next) {
2545                         ret = iommu_get_group_resv_regions(g->iommu_group,
2546                                                            &resv_regions);
2547                         if (ret)
2548                                 goto done;
2549                 }
2550         }
2551
2552         node = list_first_entry(iova_copy, struct vfio_iova, list);
2553         start = node->start;
2554         node = list_last_entry(iova_copy, struct vfio_iova, list);
2555         end = node->end;
2556
2557         /* purge the iova list and create new one */
2558         vfio_iommu_iova_free(iova_copy);
2559
2560         ret = vfio_iommu_aper_resize(iova_copy, start, end);
2561         if (ret)
2562                 goto done;
2563
2564         /* Exclude current reserved regions from iova ranges */
2565         ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2566 done:
2567         vfio_iommu_resv_free(&resv_regions);
2568         return ret;
2569 }
2570
2571 static void vfio_iommu_type1_detach_group(void *iommu_data,
2572                                           struct iommu_group *iommu_group)
2573 {
2574         struct vfio_iommu *iommu = iommu_data;
2575         struct vfio_domain *domain;
2576         struct vfio_group *group;
2577         bool update_dirty_scope = false;
2578         LIST_HEAD(iova_copy);
2579
2580         mutex_lock(&iommu->lock);
2581
2582         if (iommu->external_domain) {
2583                 group = find_iommu_group(iommu->external_domain, iommu_group);
2584                 if (group) {
2585                         update_dirty_scope = !group->pinned_page_dirty_scope;
2586                         list_del(&group->next);
2587                         kfree(group);
2588
2589                         if (list_empty(&iommu->external_domain->group_list)) {
2590                                 if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
2591                                         WARN_ON(iommu->notifier.head);
2592                                         vfio_iommu_unmap_unpin_all(iommu);
2593                                 }
2594
2595                                 kfree(iommu->external_domain);
2596                                 iommu->external_domain = NULL;
2597                         }
2598                         goto detach_group_done;
2599                 }
2600         }
2601
2602         /*
2603          * Get a copy of iova list. This will be used to update
2604          * and to replace the current one later. Please note that
2605          * we will leave the original list as it is if update fails.
2606          */
2607         vfio_iommu_iova_get_copy(iommu, &iova_copy);
2608
2609         list_for_each_entry(domain, &iommu->domain_list, next) {
2610                 group = find_iommu_group(domain, iommu_group);
2611                 if (!group)
2612                         continue;
2613
2614                 vfio_iommu_detach_group(domain, group);
2615                 update_dirty_scope = !group->pinned_page_dirty_scope;
2616                 list_del(&group->next);
2617                 kfree(group);
2618                 /*
2619                  * Group ownership provides privilege, if the group list is
2620                  * empty, the domain goes away. If it's the last domain with
2621                  * iommu and external domain doesn't exist, then all the
2622                  * mappings go away too. If it's the last domain with iommu and
2623                  * external domain exist, update accounting
2624                  */
2625                 if (list_empty(&domain->group_list)) {
2626                         if (list_is_singular(&iommu->domain_list)) {
2627                                 if (!iommu->external_domain) {
2628                                         WARN_ON(iommu->notifier.head);
2629                                         vfio_iommu_unmap_unpin_all(iommu);
2630                                 } else {
2631                                         vfio_iommu_unmap_unpin_reaccount(iommu);
2632                                 }
2633                         }
2634                         iommu_domain_free(domain->domain);
2635                         list_del(&domain->next);
2636                         kfree(domain);
2637                         vfio_iommu_aper_expand(iommu, &iova_copy);
2638                         vfio_update_pgsize_bitmap(iommu);
2639                 }
2640                 break;
2641         }
2642
2643         if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
2644                 vfio_iommu_iova_insert_copy(iommu, &iova_copy);
2645         else
2646                 vfio_iommu_iova_free(&iova_copy);
2647
2648 detach_group_done:
2649         /*
2650          * Removal of a group without dirty tracking may allow the iommu scope
2651          * to be promoted.
2652          */
2653         if (update_dirty_scope) {
2654                 iommu->num_non_pinned_groups--;
2655                 if (iommu->dirty_page_tracking)
2656                         vfio_iommu_populate_bitmap_full(iommu);
2657         }
2658         mutex_unlock(&iommu->lock);
2659 }
2660
2661 static void *vfio_iommu_type1_open(unsigned long arg)
2662 {
2663         struct vfio_iommu *iommu;
2664
2665         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2666         if (!iommu)
2667                 return ERR_PTR(-ENOMEM);
2668
2669         switch (arg) {
2670         case VFIO_TYPE1_IOMMU:
2671                 break;
2672         case VFIO_TYPE1_NESTING_IOMMU:
2673                 iommu->nesting = true;
2674                 fallthrough;
2675         case VFIO_TYPE1v2_IOMMU:
2676                 iommu->v2 = true;
2677                 break;
2678         default:
2679                 kfree(iommu);
2680                 return ERR_PTR(-EINVAL);
2681         }
2682
2683         INIT_LIST_HEAD(&iommu->domain_list);
2684         INIT_LIST_HEAD(&iommu->iova_list);
2685         iommu->dma_list = RB_ROOT;
2686         iommu->dma_avail = dma_entry_limit;
2687         iommu->container_open = true;
2688         mutex_init(&iommu->lock);
2689         BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2690         init_waitqueue_head(&iommu->vaddr_wait);
2691
2692         return iommu;
2693 }
2694
2695 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2696 {
2697         struct vfio_group *group, *group_tmp;
2698
2699         list_for_each_entry_safe(group, group_tmp,
2700                                  &domain->group_list, next) {
2701                 if (!external)
2702                         vfio_iommu_detach_group(domain, group);
2703                 list_del(&group->next);
2704                 kfree(group);
2705         }
2706
2707         if (!external)
2708                 iommu_domain_free(domain->domain);
2709 }
2710
2711 static void vfio_iommu_type1_release(void *iommu_data)
2712 {
2713         struct vfio_iommu *iommu = iommu_data;
2714         struct vfio_domain *domain, *domain_tmp;
2715
2716         if (iommu->external_domain) {
2717                 vfio_release_domain(iommu->external_domain, true);
2718                 kfree(iommu->external_domain);
2719         }
2720
2721         vfio_iommu_unmap_unpin_all(iommu);
2722
2723         list_for_each_entry_safe(domain, domain_tmp,
2724                                  &iommu->domain_list, next) {
2725                 vfio_release_domain(domain, false);
2726                 list_del(&domain->next);
2727                 kfree(domain);
2728         }
2729
2730         vfio_iommu_iova_free(&iommu->iova_list);
2731
2732         kfree(iommu);
2733 }
2734
2735 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2736 {
2737         struct vfio_domain *domain;
2738         int ret = 1;
2739
2740         mutex_lock(&iommu->lock);
2741         list_for_each_entry(domain, &iommu->domain_list, next) {
2742                 if (!(domain->prot & IOMMU_CACHE)) {
2743                         ret = 0;
2744                         break;
2745                 }
2746         }
2747         mutex_unlock(&iommu->lock);
2748
2749         return ret;
2750 }
2751
2752 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2753                                             unsigned long arg)
2754 {
2755         switch (arg) {
2756         case VFIO_TYPE1_IOMMU:
2757         case VFIO_TYPE1v2_IOMMU:
2758         case VFIO_TYPE1_NESTING_IOMMU:
2759         case VFIO_UNMAP_ALL:
2760         case VFIO_UPDATE_VADDR:
2761                 return 1;
2762         case VFIO_DMA_CC_IOMMU:
2763                 if (!iommu)
2764                         return 0;
2765                 return vfio_domains_have_iommu_cache(iommu);
2766         default:
2767                 return 0;
2768         }
2769 }
2770
2771 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2772                  struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2773                  size_t size)
2774 {
2775         struct vfio_info_cap_header *header;
2776         struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2777
2778         header = vfio_info_cap_add(caps, size,
2779                                    VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2780         if (IS_ERR(header))
2781                 return PTR_ERR(header);
2782
2783         iova_cap = container_of(header,
2784                                 struct vfio_iommu_type1_info_cap_iova_range,
2785                                 header);
2786         iova_cap->nr_iovas = cap_iovas->nr_iovas;
2787         memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2788                cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2789         return 0;
2790 }
2791
2792 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2793                                       struct vfio_info_cap *caps)
2794 {
2795         struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2796         struct vfio_iova *iova;
2797         size_t size;
2798         int iovas = 0, i = 0, ret;
2799
2800         list_for_each_entry(iova, &iommu->iova_list, list)
2801                 iovas++;
2802
2803         if (!iovas) {
2804                 /*
2805                  * Return 0 as a container with a single mdev device
2806                  * will have an empty list
2807                  */
2808                 return 0;
2809         }
2810
2811         size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2812
2813         cap_iovas = kzalloc(size, GFP_KERNEL);
2814         if (!cap_iovas)
2815                 return -ENOMEM;
2816
2817         cap_iovas->nr_iovas = iovas;
2818
2819         list_for_each_entry(iova, &iommu->iova_list, list) {
2820                 cap_iovas->iova_ranges[i].start = iova->start;
2821                 cap_iovas->iova_ranges[i].end = iova->end;
2822                 i++;
2823         }
2824
2825         ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2826
2827         kfree(cap_iovas);
2828         return ret;
2829 }
2830
2831 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2832                                            struct vfio_info_cap *caps)
2833 {
2834         struct vfio_iommu_type1_info_cap_migration cap_mig;
2835
2836         cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2837         cap_mig.header.version = 1;
2838
2839         cap_mig.flags = 0;
2840         /* support minimum pgsize */
2841         cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2842         cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2843
2844         return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2845 }
2846
2847 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2848                                            struct vfio_info_cap *caps)
2849 {
2850         struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2851
2852         cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2853         cap_dma_avail.header.version = 1;
2854
2855         cap_dma_avail.avail = iommu->dma_avail;
2856
2857         return vfio_info_add_capability(caps, &cap_dma_avail.header,
2858                                         sizeof(cap_dma_avail));
2859 }
2860
2861 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2862                                      unsigned long arg)
2863 {
2864         struct vfio_iommu_type1_info info;
2865         unsigned long minsz;
2866         struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2867         unsigned long capsz;
2868         int ret;
2869
2870         minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2871
2872         /* For backward compatibility, cannot require this */
2873         capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2874
2875         if (copy_from_user(&info, (void __user *)arg, minsz))
2876                 return -EFAULT;
2877
2878         if (info.argsz < minsz)
2879                 return -EINVAL;
2880
2881         if (info.argsz >= capsz) {
2882                 minsz = capsz;
2883                 info.cap_offset = 0; /* output, no-recopy necessary */
2884         }
2885
2886         mutex_lock(&iommu->lock);
2887         info.flags = VFIO_IOMMU_INFO_PGSIZES;
2888
2889         info.iova_pgsizes = iommu->pgsize_bitmap;
2890
2891         ret = vfio_iommu_migration_build_caps(iommu, &caps);
2892
2893         if (!ret)
2894                 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2895
2896         if (!ret)
2897                 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2898
2899         mutex_unlock(&iommu->lock);
2900
2901         if (ret)
2902                 return ret;
2903
2904         if (caps.size) {
2905                 info.flags |= VFIO_IOMMU_INFO_CAPS;
2906
2907                 if (info.argsz < sizeof(info) + caps.size) {
2908                         info.argsz = sizeof(info) + caps.size;
2909                 } else {
2910                         vfio_info_cap_shift(&caps, sizeof(info));
2911                         if (copy_to_user((void __user *)arg +
2912                                         sizeof(info), caps.buf,
2913                                         caps.size)) {
2914                                 kfree(caps.buf);
2915                                 return -EFAULT;
2916                         }
2917                         info.cap_offset = sizeof(info);
2918                 }
2919
2920                 kfree(caps.buf);
2921         }
2922
2923         return copy_to_user((void __user *)arg, &info, minsz) ?
2924                         -EFAULT : 0;
2925 }
2926
2927 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2928                                     unsigned long arg)
2929 {
2930         struct vfio_iommu_type1_dma_map map;
2931         unsigned long minsz;
2932         uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE |
2933                         VFIO_DMA_MAP_FLAG_VADDR;
2934
2935         minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2936
2937         if (copy_from_user(&map, (void __user *)arg, minsz))
2938                 return -EFAULT;
2939
2940         if (map.argsz < minsz || map.flags & ~mask)
2941                 return -EINVAL;
2942
2943         return vfio_dma_do_map(iommu, &map);
2944 }
2945
2946 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2947                                       unsigned long arg)
2948 {
2949         struct vfio_iommu_type1_dma_unmap unmap;
2950         struct vfio_bitmap bitmap = { 0 };
2951         uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP |
2952                         VFIO_DMA_UNMAP_FLAG_VADDR |
2953                         VFIO_DMA_UNMAP_FLAG_ALL;
2954         unsigned long minsz;
2955         int ret;
2956
2957         minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2958
2959         if (copy_from_user(&unmap, (void __user *)arg, minsz))
2960                 return -EFAULT;
2961
2962         if (unmap.argsz < minsz || unmap.flags & ~mask)
2963                 return -EINVAL;
2964
2965         if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2966             (unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL |
2967                             VFIO_DMA_UNMAP_FLAG_VADDR)))
2968                 return -EINVAL;
2969
2970         if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2971                 unsigned long pgshift;
2972
2973                 if (unmap.argsz < (minsz + sizeof(bitmap)))
2974                         return -EINVAL;
2975
2976                 if (copy_from_user(&bitmap,
2977                                    (void __user *)(arg + minsz),
2978                                    sizeof(bitmap)))
2979                         return -EFAULT;
2980
2981                 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2982                         return -EINVAL;
2983
2984                 pgshift = __ffs(bitmap.pgsize);
2985                 ret = verify_bitmap_size(unmap.size >> pgshift,
2986                                          bitmap.size);
2987                 if (ret)
2988                         return ret;
2989         }
2990
2991         ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2992         if (ret)
2993                 return ret;
2994
2995         return copy_to_user((void __user *)arg, &unmap, minsz) ?
2996                         -EFAULT : 0;
2997 }
2998
2999 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
3000                                         unsigned long arg)
3001 {
3002         struct vfio_iommu_type1_dirty_bitmap dirty;
3003         uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
3004                         VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
3005                         VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
3006         unsigned long minsz;
3007         int ret = 0;
3008
3009         if (!iommu->v2)
3010                 return -EACCES;
3011
3012         minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
3013
3014         if (copy_from_user(&dirty, (void __user *)arg, minsz))
3015                 return -EFAULT;
3016
3017         if (dirty.argsz < minsz || dirty.flags & ~mask)
3018                 return -EINVAL;
3019
3020         /* only one flag should be set at a time */
3021         if (__ffs(dirty.flags) != __fls(dirty.flags))
3022                 return -EINVAL;
3023
3024         if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
3025                 size_t pgsize;
3026
3027                 mutex_lock(&iommu->lock);
3028                 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
3029                 if (!iommu->dirty_page_tracking) {
3030                         ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
3031                         if (!ret)
3032                                 iommu->dirty_page_tracking = true;
3033                 }
3034                 mutex_unlock(&iommu->lock);
3035                 return ret;
3036         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
3037                 mutex_lock(&iommu->lock);
3038                 if (iommu->dirty_page_tracking) {
3039                         iommu->dirty_page_tracking = false;
3040                         vfio_dma_bitmap_free_all(iommu);
3041                 }
3042                 mutex_unlock(&iommu->lock);
3043                 return 0;
3044         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
3045                 struct vfio_iommu_type1_dirty_bitmap_get range;
3046                 unsigned long pgshift;
3047                 size_t data_size = dirty.argsz - minsz;
3048                 size_t iommu_pgsize;
3049
3050                 if (!data_size || data_size < sizeof(range))
3051                         return -EINVAL;
3052
3053                 if (copy_from_user(&range, (void __user *)(arg + minsz),
3054                                    sizeof(range)))
3055                         return -EFAULT;
3056
3057                 if (range.iova + range.size < range.iova)
3058                         return -EINVAL;
3059                 if (!access_ok((void __user *)range.bitmap.data,
3060                                range.bitmap.size))
3061                         return -EINVAL;
3062
3063                 pgshift = __ffs(range.bitmap.pgsize);
3064                 ret = verify_bitmap_size(range.size >> pgshift,
3065                                          range.bitmap.size);
3066                 if (ret)
3067                         return ret;
3068
3069                 mutex_lock(&iommu->lock);
3070
3071                 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
3072
3073                 /* allow only smallest supported pgsize */
3074                 if (range.bitmap.pgsize != iommu_pgsize) {
3075                         ret = -EINVAL;
3076                         goto out_unlock;
3077                 }
3078                 if (range.iova & (iommu_pgsize - 1)) {
3079                         ret = -EINVAL;
3080                         goto out_unlock;
3081                 }
3082                 if (!range.size || range.size & (iommu_pgsize - 1)) {
3083                         ret = -EINVAL;
3084                         goto out_unlock;
3085                 }
3086
3087                 if (iommu->dirty_page_tracking)
3088                         ret = vfio_iova_dirty_bitmap(range.bitmap.data,
3089                                                      iommu, range.iova,
3090                                                      range.size,
3091                                                      range.bitmap.pgsize);
3092                 else
3093                         ret = -EINVAL;
3094 out_unlock:
3095                 mutex_unlock(&iommu->lock);
3096
3097                 return ret;
3098         }
3099
3100         return -EINVAL;
3101 }
3102
3103 static long vfio_iommu_type1_ioctl(void *iommu_data,
3104                                    unsigned int cmd, unsigned long arg)
3105 {
3106         struct vfio_iommu *iommu = iommu_data;
3107
3108         switch (cmd) {
3109         case VFIO_CHECK_EXTENSION:
3110                 return vfio_iommu_type1_check_extension(iommu, arg);
3111         case VFIO_IOMMU_GET_INFO:
3112                 return vfio_iommu_type1_get_info(iommu, arg);
3113         case VFIO_IOMMU_MAP_DMA:
3114                 return vfio_iommu_type1_map_dma(iommu, arg);
3115         case VFIO_IOMMU_UNMAP_DMA:
3116                 return vfio_iommu_type1_unmap_dma(iommu, arg);
3117         case VFIO_IOMMU_DIRTY_PAGES:
3118                 return vfio_iommu_type1_dirty_pages(iommu, arg);
3119         default:
3120                 return -ENOTTY;
3121         }
3122 }
3123
3124 static int vfio_iommu_type1_register_notifier(void *iommu_data,
3125                                               unsigned long *events,
3126                                               struct notifier_block *nb)
3127 {
3128         struct vfio_iommu *iommu = iommu_data;
3129
3130         /* clear known events */
3131         *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
3132
3133         /* refuse to register if still events remaining */
3134         if (*events)
3135                 return -EINVAL;
3136
3137         return blocking_notifier_chain_register(&iommu->notifier, nb);
3138 }
3139
3140 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
3141                                                 struct notifier_block *nb)
3142 {
3143         struct vfio_iommu *iommu = iommu_data;
3144
3145         return blocking_notifier_chain_unregister(&iommu->notifier, nb);
3146 }
3147
3148 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3149                                          dma_addr_t user_iova, void *data,
3150                                          size_t count, bool write,
3151                                          size_t *copied)
3152 {
3153         struct mm_struct *mm;
3154         unsigned long vaddr;
3155         struct vfio_dma *dma;
3156         bool kthread = current->mm == NULL;
3157         size_t offset;
3158         int ret;
3159
3160         *copied = 0;
3161
3162         ret = vfio_find_dma_valid(iommu, user_iova, 1, &dma);
3163         if (ret < 0)
3164                 return ret;
3165
3166         if ((write && !(dma->prot & IOMMU_WRITE)) ||
3167                         !(dma->prot & IOMMU_READ))
3168                 return -EPERM;
3169
3170         mm = get_task_mm(dma->task);
3171
3172         if (!mm)
3173                 return -EPERM;
3174
3175         if (kthread)
3176                 kthread_use_mm(mm);
3177         else if (current->mm != mm)
3178                 goto out;
3179
3180         offset = user_iova - dma->iova;
3181
3182         if (count > dma->size - offset)
3183                 count = dma->size - offset;
3184
3185         vaddr = dma->vaddr + offset;
3186
3187         if (write) {
3188                 *copied = copy_to_user((void __user *)vaddr, data,
3189                                          count) ? 0 : count;
3190                 if (*copied && iommu->dirty_page_tracking) {
3191                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3192                         /*
3193                          * Bitmap populated with the smallest supported page
3194                          * size
3195                          */
3196                         bitmap_set(dma->bitmap, offset >> pgshift,
3197                                    ((offset + *copied - 1) >> pgshift) -
3198                                    (offset >> pgshift) + 1);
3199                 }
3200         } else
3201                 *copied = copy_from_user(data, (void __user *)vaddr,
3202                                            count) ? 0 : count;
3203         if (kthread)
3204                 kthread_unuse_mm(mm);
3205 out:
3206         mmput(mm);
3207         return *copied ? 0 : -EFAULT;
3208 }
3209
3210 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3211                                    void *data, size_t count, bool write)
3212 {
3213         struct vfio_iommu *iommu = iommu_data;
3214         int ret = 0;
3215         size_t done;
3216
3217         mutex_lock(&iommu->lock);
3218         while (count > 0) {
3219                 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3220                                                     count, write, &done);
3221                 if (ret)
3222                         break;
3223
3224                 count -= done;
3225                 data += done;
3226                 user_iova += done;
3227         }
3228
3229         mutex_unlock(&iommu->lock);
3230         return ret;
3231 }
3232
3233 static struct iommu_domain *
3234 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3235                                     struct iommu_group *iommu_group)
3236 {
3237         struct iommu_domain *domain = ERR_PTR(-ENODEV);
3238         struct vfio_iommu *iommu = iommu_data;
3239         struct vfio_domain *d;
3240
3241         if (!iommu || !iommu_group)
3242                 return ERR_PTR(-EINVAL);
3243
3244         mutex_lock(&iommu->lock);
3245         list_for_each_entry(d, &iommu->domain_list, next) {
3246                 if (find_iommu_group(d, iommu_group)) {
3247                         domain = d->domain;
3248                         break;
3249                 }
3250         }
3251         mutex_unlock(&iommu->lock);
3252
3253         return domain;
3254 }
3255
3256 static void vfio_iommu_type1_notify(void *iommu_data,
3257                                     enum vfio_iommu_notify_type event)
3258 {
3259         struct vfio_iommu *iommu = iommu_data;
3260
3261         if (event != VFIO_IOMMU_CONTAINER_CLOSE)
3262                 return;
3263         mutex_lock(&iommu->lock);
3264         iommu->container_open = false;
3265         mutex_unlock(&iommu->lock);
3266         wake_up_all(&iommu->vaddr_wait);
3267 }
3268
3269 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3270         .name                   = "vfio-iommu-type1",
3271         .owner                  = THIS_MODULE,
3272         .open                   = vfio_iommu_type1_open,
3273         .release                = vfio_iommu_type1_release,
3274         .ioctl                  = vfio_iommu_type1_ioctl,
3275         .attach_group           = vfio_iommu_type1_attach_group,
3276         .detach_group           = vfio_iommu_type1_detach_group,
3277         .pin_pages              = vfio_iommu_type1_pin_pages,
3278         .unpin_pages            = vfio_iommu_type1_unpin_pages,
3279         .register_notifier      = vfio_iommu_type1_register_notifier,
3280         .unregister_notifier    = vfio_iommu_type1_unregister_notifier,
3281         .dma_rw                 = vfio_iommu_type1_dma_rw,
3282         .group_iommu_domain     = vfio_iommu_type1_group_iommu_domain,
3283         .notify                 = vfio_iommu_type1_notify,
3284 };
3285
3286 static int __init vfio_iommu_type1_init(void)
3287 {
3288         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
3289 }
3290
3291 static void __exit vfio_iommu_type1_cleanup(void)
3292 {
3293         vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3294 }
3295
3296 module_init(vfio_iommu_type1_init);
3297 module_exit(vfio_iommu_type1_cleanup);
3298
3299 MODULE_VERSION(DRIVER_VERSION);
3300 MODULE_LICENSE("GPL v2");
3301 MODULE_AUTHOR(DRIVER_AUTHOR);
3302 MODULE_DESCRIPTION(DRIVER_DESC);