drivers/vfio/vfio_iommu_type1.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * VFIO: IOMMU DMA mapping support for Type1 IOMMU
   4  *
   5  * Copyright (C) 2012 Red Hat, Inc.  All rights reserved.
   6  *     Author: Alex Williamson <alex.williamson@redhat.com>
   7  *
   8  * Derived from original vfio:
   9  * Copyright 2010 Cisco Systems, Inc.  All rights reserved.
  10  * Author: Tom Lyon, pugs@cisco.com
  11  *
  12  * We arbitrarily define a Type1 IOMMU as one matching the below code.
  13  * It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
  14  * VT-d, but that makes it harder to re-use as theoretically anyone
  15  * implementing a similar IOMMU could make use of this.  We expect the
  16  * IOMMU to support the IOMMU API and have few to no restrictions around
  17  * the IOVA range that can be mapped.  The Type1 IOMMU is currently
  18  * optimized for relatively static mappings of a userspace process with
  19  * userspace pages pinned into memory.  We also assume devices and IOMMU
  20  * domains are PCI based as the IOMMU API is still centered around a
  21  * device/bus interface rather than a group interface.
  22  */
  23
  24 #include <linux/compat.h>
  25 #include <linux/device.h>
  26 #include <linux/fs.h>
  27 #include <linux/iommu.h>
  28 #include <linux/module.h>
  29 #include <linux/mm.h>
  30 #include <linux/kthread.h>
  31 #include <linux/rbtree.h>
  32 #include <linux/sched/signal.h>
  33 #include <linux/sched/mm.h>
  34 #include <linux/slab.h>
  35 #include <linux/uaccess.h>
  36 #include <linux/vfio.h>
  37 #include <linux/workqueue.h>
  38 #include <linux/mdev.h>
  39 #include <linux/notifier.h>
  40 #include <linux/dma-iommu.h>
  41 #include <linux/irqdomain.h>
  42
  43 #define DRIVER_VERSION  "0.2"
  44 #define DRIVER_AUTHOR   "Alex Williamson <alex.williamson@redhat.com>"
  45 #define DRIVER_DESC     "Type1 IOMMU driver for VFIO"
  46
  47 static bool allow_unsafe_interrupts;
  48 module_param_named(allow_unsafe_interrupts,
  49                    allow_unsafe_interrupts, bool, S_IRUGO | S_IWUSR);
  50 MODULE_PARM_DESC(allow_unsafe_interrupts,
  51                  "Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
  52
  53 static bool disable_hugepages;
  54 module_param_named(disable_hugepages,
  55                    disable_hugepages, bool, S_IRUGO | S_IWUSR);
  56 MODULE_PARM_DESC(disable_hugepages,
  57                  "Disable VFIO IOMMU support for IOMMU hugepages.");
  58
  59 static unsigned int dma_entry_limit __read_mostly = U16_MAX;
  60 module_param_named(dma_entry_limit, dma_entry_limit, uint, 0644);
  61 MODULE_PARM_DESC(dma_entry_limit,
  62                  "Maximum number of user DMA mappings per container (65535).");
  63
  64 struct vfio_iommu {
  65         struct list_head        domain_list;
  66         struct list_head        iova_list;
  67         struct vfio_domain      *external_domain; /* domain for external user */
  68         struct mutex            lock;
  69         struct rb_root          dma_list;
  70         struct blocking_notifier_head notifier;
  71         unsigned int            dma_avail;
  72         uint64_t                pgsize_bitmap;
  73         uint64_t                num_non_pinned_groups;
  74         bool                    v2;
  75         bool                    nesting;
  76         bool                    dirty_page_tracking;
  77 };
  78
  79 struct vfio_domain {
  80         struct iommu_domain     *domain;
  81         struct list_head        next;
  82         struct list_head        group_list;
  83         int                     prot;           /* IOMMU_CACHE */
  84         bool                    fgsp;           /* Fine-grained super pages */
  85 };
  86
  87 struct vfio_dma {
  88         struct rb_node          node;
  89         dma_addr_t              iova;           /* Device address */
  90         unsigned long           vaddr;          /* Process virtual addr */
  91         size_t                  size;           /* Map size (bytes) */
  92         int                     prot;           /* IOMMU_READ/WRITE */
  93         bool                    iommu_mapped;
  94         bool                    lock_cap;       /* capable(CAP_IPC_LOCK) */
  95         struct task_struct      *task;
  96         struct rb_root          pfn_list;       /* Ex-user pinned pfn list */
  97         unsigned long           *bitmap;
  98 };
  99
 100 struct vfio_group {
 101         struct iommu_group      *iommu_group;
 102         struct list_head        next;
 103         bool                    mdev_group;     /* An mdev group */
 104         bool                    pinned_page_dirty_scope;
 105 };
 106
 107 struct vfio_iova {
 108         struct list_head        list;
 109         dma_addr_t              start;
 110         dma_addr_t              end;
 111 };
 112
 113 /*
 114  * Guest RAM pinning working set or DMA target
 115  */
 116 struct vfio_pfn {
 117         struct rb_node          node;
 118         dma_addr_t              iova;           /* Device address */
 119         unsigned long           pfn;            /* Host pfn */
 120         unsigned int            ref_count;
 121 };
 122
 123 struct vfio_regions {
 124         struct list_head list;
 125         dma_addr_t iova;
 126         phys_addr_t phys;
 127         size_t len;
 128 };
 129
 130 #define IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu) \
 131                                         (!list_empty(&iommu->domain_list))
 132
 133 #define DIRTY_BITMAP_BYTES(n)   (ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)
 134
 135 /*
 136  * Input argument of number of bits to bitmap_set() is unsigned integer, which
 137  * further casts to signed integer for unaligned multi-bit operation,
 138  * __bitmap_set().
 139  * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 140  * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 141  * system.
 142  */
 143 #define DIRTY_BITMAP_PAGES_MAX   ((u64)INT_MAX)
 144 #define DIRTY_BITMAP_SIZE_MAX    DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
 145
 146 static int put_pfn(unsigned long pfn, int prot);
 147
 148 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
 149                                                struct iommu_group *iommu_group);
 150
 151 /*
 152  * This code handles mapping and unmapping of user data buffers
 153  * into DMA'ble space using the IOMMU
 154  */
 155
 156 static struct vfio_dma *vfio_find_dma(struct vfio_iommu *iommu,
 157                                       dma_addr_t start, size_t size)
 158 {
 159         struct rb_node *node = iommu->dma_list.rb_node;
 160
 161         while (node) {
 162                 struct vfio_dma *dma = rb_entry(node, struct vfio_dma, node);
 163
 164                 if (start + size <= dma->iova)
 165                         node = node->rb_left;
 166                 else if (start >= dma->iova + dma->size)
 167                         node = node->rb_right;
 168                 else
 169                         return dma;
 170         }
 171
 172         return NULL;
 173 }
 174
 175 static void vfio_link_dma(struct vfio_iommu *iommu, struct vfio_dma *new)
 176 {
 177         struct rb_node **link = &iommu->dma_list.rb_node, *parent = NULL;
 178         struct vfio_dma *dma;
 179
 180         while (*link) {
 181                 parent = *link;
 182                 dma = rb_entry(parent, struct vfio_dma, node);
 183
 184                 if (new->iova + new->size <= dma->iova)
 185                         link = &(*link)->rb_left;
 186                 else
 187                         link = &(*link)->rb_right;
 188         }
 189
 190         rb_link_node(&new->node, parent, link);
 191         rb_insert_color(&new->node, &iommu->dma_list);
 192 }
 193
 194 static void vfio_unlink_dma(struct vfio_iommu *iommu, struct vfio_dma *old)
 195 {
 196         rb_erase(&old->node, &iommu->dma_list);
 197 }
 198
 199
 200 static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
 201 {
 202         uint64_t npages = dma->size / pgsize;
 203
 204         if (npages > DIRTY_BITMAP_PAGES_MAX)
 205                 return -EINVAL;
 206
 207         /*
 208          * Allocate extra 64 bits that are used to calculate shift required for
 209          * bitmap_shift_left() to manipulate and club unaligned number of pages
 210          * in adjacent vfio_dma ranges.
 211          */
 212         dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
 213                                GFP_KERNEL);
 214         if (!dma->bitmap)
 215                 return -ENOMEM;
 216
 217         return 0;
 218 }
 219
 220 static void vfio_dma_bitmap_free(struct vfio_dma *dma)
 221 {
 222         kfree(dma->bitmap);
 223         dma->bitmap = NULL;
 224 }
 225
 226 static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
 227 {
 228         struct rb_node *p;
 229         unsigned long pgshift = __ffs(pgsize);
 230
 231         for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
 232                 struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn, node);
 233
 234                 bitmap_set(dma->bitmap, (vpfn->iova - dma->iova) >> pgshift, 1);
 235         }
 236 }
 237
 238 static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
 239 {
 240         struct rb_node *n;
 241         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 242
 243         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 244                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 245
 246                 bitmap_set(dma->bitmap, 0, dma->size >> pgshift);
 247         }
 248 }
 249
 250 static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
 251 {
 252         struct rb_node *n;
 253
 254         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 255                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 256                 int ret;
 257
 258                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
 259                 if (ret) {
 260                         struct rb_node *p;
 261
 262                         for (p = rb_prev(n); p; p = rb_prev(p)) {
 263                                 struct vfio_dma *dma = rb_entry(n,
 264                                                         struct vfio_dma, node);
 265
 266                                 vfio_dma_bitmap_free(dma);
 267                         }
 268                         return ret;
 269                 }
 270                 vfio_dma_populate_bitmap(dma, pgsize);
 271         }
 272         return 0;
 273 }
 274
 275 static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
 276 {
 277         struct rb_node *n;
 278
 279         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
 280                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
 281
 282                 vfio_dma_bitmap_free(dma);
 283         }
 284 }
 285
 286 /*
 287  * Helper Functions for host iova-pfn list
 288  */
 289 static struct vfio_pfn *vfio_find_vpfn(struct vfio_dma *dma, dma_addr_t iova)
 290 {
 291         struct vfio_pfn *vpfn;
 292         struct rb_node *node = dma->pfn_list.rb_node;
 293
 294         while (node) {
 295                 vpfn = rb_entry(node, struct vfio_pfn, node);
 296
 297                 if (iova < vpfn->iova)
 298                         node = node->rb_left;
 299                 else if (iova > vpfn->iova)
 300                         node = node->rb_right;
 301                 else
 302                         return vpfn;
 303         }
 304         return NULL;
 305 }
 306
 307 static void vfio_link_pfn(struct vfio_dma *dma,
 308                           struct vfio_pfn *new)
 309 {
 310         struct rb_node **link, *parent = NULL;
 311         struct vfio_pfn *vpfn;
 312
 313         link = &dma->pfn_list.rb_node;
 314         while (*link) {
 315                 parent = *link;
 316                 vpfn = rb_entry(parent, struct vfio_pfn, node);
 317
 318                 if (new->iova < vpfn->iova)
 319                         link = &(*link)->rb_left;
 320                 else
 321                         link = &(*link)->rb_right;
 322         }
 323
 324         rb_link_node(&new->node, parent, link);
 325         rb_insert_color(&new->node, &dma->pfn_list);
 326 }
 327
 328 static void vfio_unlink_pfn(struct vfio_dma *dma, struct vfio_pfn *old)
 329 {
 330         rb_erase(&old->node, &dma->pfn_list);
 331 }
 332
 333 static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
 334                                 unsigned long pfn)
 335 {
 336         struct vfio_pfn *vpfn;
 337
 338         vpfn = kzalloc(sizeof(*vpfn), GFP_KERNEL);
 339         if (!vpfn)
 340                 return -ENOMEM;
 341
 342         vpfn->iova = iova;
 343         vpfn->pfn = pfn;
 344         vpfn->ref_count = 1;
 345         vfio_link_pfn(dma, vpfn);
 346         return 0;
 347 }
 348
 349 static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
 350                                       struct vfio_pfn *vpfn)
 351 {
 352         vfio_unlink_pfn(dma, vpfn);
 353         kfree(vpfn);
 354 }
 355
 356 static struct vfio_pfn *vfio_iova_get_vfio_pfn(struct vfio_dma *dma,
 357                                                unsigned long iova)
 358 {
 359         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 360
 361         if (vpfn)
 362                 vpfn->ref_count++;
 363         return vpfn;
 364 }
 365
 366 static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn)
 367 {
 368         int ret = 0;
 369
 370         vpfn->ref_count--;
 371         if (!vpfn->ref_count) {
 372                 ret = put_pfn(vpfn->pfn, dma->prot);
 373                 vfio_remove_from_pfn_list(dma, vpfn);
 374         }
 375         return ret;
 376 }
 377
 378 static int vfio_lock_acct(struct vfio_dma *dma, long npage, bool async)
 379 {
 380         struct mm_struct *mm;
 381         int ret;
 382
 383         if (!npage)
 384                 return 0;
 385
 386         mm = async ? get_task_mm(dma->task) : dma->task->mm;
 387         if (!mm)
 388                 return -ESRCH; /* process exited */
 389
 390         ret = mmap_write_lock_killable(mm);
 391         if (!ret) {
 392                 ret = __account_locked_vm(mm, abs(npage), npage > 0, dma->task,
 393                                           dma->lock_cap);
 394                 mmap_write_unlock(mm);
 395         }
 396
 397         if (async)
 398                 mmput(mm);
 399
 400         return ret;
 401 }
 402
 403 /*
 404  * Some mappings aren't backed by a struct page, for example an mmap'd
 405  * MMIO range for our own or another device.  These use a different
 406  * pfn conversion and shouldn't be tracked as locked pages.
 407  * For compound pages, any driver that sets the reserved bit in head
 408  * page needs to set the reserved bit in all subpages to be safe.
 409  */
 410 static bool is_invalid_reserved_pfn(unsigned long pfn)
 411 {
 412         if (pfn_valid(pfn))
 413                 return PageReserved(pfn_to_page(pfn));
 414
 415         return true;
 416 }
 417
 418 static int put_pfn(unsigned long pfn, int prot)
 419 {
 420         if (!is_invalid_reserved_pfn(pfn)) {
 421                 struct page *page = pfn_to_page(pfn);
 422
 423                 unpin_user_pages_dirty_lock(&page, 1, prot & IOMMU_WRITE);
 424                 return 1;
 425         }
 426         return 0;
 427 }
 428
 429 static int follow_fault_pfn(struct vm_area_struct *vma, struct mm_struct *mm,
 430                             unsigned long vaddr, unsigned long *pfn,
 431                             bool write_fault)
 432 {
 433         int ret;
 434
 435         ret = follow_pfn(vma, vaddr, pfn);
 436         if (ret) {
 437                 bool unlocked = false;
 438
 439                 ret = fixup_user_fault(mm, vaddr,
 440                                        FAULT_FLAG_REMOTE |
 441                                        (write_fault ?  FAULT_FLAG_WRITE : 0),
 442                                        &unlocked);
 443                 if (unlocked)
 444                         return -EAGAIN;
 445
 446                 if (ret)
 447                         return ret;
 448
 449                 ret = follow_pfn(vma, vaddr, pfn);
 450         }
 451
 452         return ret;
 453 }
 454
 455 static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr,
 456                          int prot, unsigned long *pfn)
 457 {
 458         struct page *page[1];
 459         struct vm_area_struct *vma;
 460         unsigned int flags = 0;
 461         int ret;
 462
 463         if (prot & IOMMU_WRITE)
 464                 flags |= FOLL_WRITE;
 465
 466         mmap_read_lock(mm);
 467         ret = pin_user_pages_remote(mm, vaddr, 1, flags | FOLL_LONGTERM,
 468                                     page, NULL, NULL);
 469         if (ret == 1) {
 470                 *pfn = page_to_pfn(page[0]);
 471                 ret = 0;
 472                 goto done;
 473         }
 474
 475         vaddr = untagged_addr(vaddr);
 476
 477 retry:
 478         vma = find_vma_intersection(mm, vaddr, vaddr + 1);
 479
 480         if (vma && vma->vm_flags & VM_PFNMAP) {
 481                 ret = follow_fault_pfn(vma, mm, vaddr, pfn, prot & IOMMU_WRITE);
 482                 if (ret == -EAGAIN)
 483                         goto retry;
 484
 485                 if (!ret && !is_invalid_reserved_pfn(*pfn))
 486                         ret = -EFAULT;
 487         }
 488 done:
 489         mmap_read_unlock(mm);
 490         return ret;
 491 }
 492
 493 /*
 494  * Attempt to pin pages.  We really don't want to track all the pfns and
 495  * the iommu can only map chunks of consecutive pfns anyway, so get the
 496  * first page and all consecutive pages with the same locking.
 497  */
 498 static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr,
 499                                   long npage, unsigned long *pfn_base,
 500                                   unsigned long limit)
 501 {
 502         unsigned long pfn = 0;
 503         long ret, pinned = 0, lock_acct = 0;
 504         bool rsvd;
 505         dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
 506
 507         /* This code path is only user initiated */
 508         if (!current->mm)
 509                 return -ENODEV;
 510
 511         ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, pfn_base);
 512         if (ret)
 513                 return ret;
 514
 515         pinned++;
 516         rsvd = is_invalid_reserved_pfn(*pfn_base);
 517
 518         /*
 519          * Reserved pages aren't counted against the user, externally pinned
 520          * pages are already counted against the user.
 521          */
 522         if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 523                 if (!dma->lock_cap && current->mm->locked_vm + 1 > limit) {
 524                         put_pfn(*pfn_base, dma->prot);
 525                         pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__,
 526                                         limit << PAGE_SHIFT);
 527                         return -ENOMEM;
 528                 }
 529                 lock_acct++;
 530         }
 531
 532         if (unlikely(disable_hugepages))
 533                 goto out;
 534
 535         /* Lock all the consecutive pages from pfn_base */
 536         for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage;
 537              pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) {
 538                 ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn);
 539                 if (ret)
 540                         break;
 541
 542                 if (pfn != *pfn_base + pinned ||
 543                     rsvd != is_invalid_reserved_pfn(pfn)) {
 544                         put_pfn(pfn, dma->prot);
 545                         break;
 546                 }
 547
 548                 if (!rsvd && !vfio_find_vpfn(dma, iova)) {
 549                         if (!dma->lock_cap &&
 550                             current->mm->locked_vm + lock_acct + 1 > limit) {
 551                                 put_pfn(pfn, dma->prot);
 552                                 pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
 553                                         __func__, limit << PAGE_SHIFT);
 554                                 ret = -ENOMEM;
 555                                 goto unpin_out;
 556                         }
 557                         lock_acct++;
 558                 }
 559         }
 560
 561 out:
 562         ret = vfio_lock_acct(dma, lock_acct, false);
 563
 564 unpin_out:
 565         if (ret) {
 566                 if (!rsvd) {
 567                         for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
 568                                 put_pfn(pfn, dma->prot);
 569                 }
 570
 571                 return ret;
 572         }
 573
 574         return pinned;
 575 }
 576
 577 static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
 578                                     unsigned long pfn, long npage,
 579                                     bool do_accounting)
 580 {
 581         long unlocked = 0, locked = 0;
 582         long i;
 583
 584         for (i = 0; i < npage; i++, iova += PAGE_SIZE) {
 585                 if (put_pfn(pfn++, dma->prot)) {
 586                         unlocked++;
 587                         if (vfio_find_vpfn(dma, iova))
 588                                 locked++;
 589                 }
 590         }
 591
 592         if (do_accounting)
 593                 vfio_lock_acct(dma, locked - unlocked, true);
 594
 595         return unlocked;
 596 }
 597
 598 static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr,
 599                                   unsigned long *pfn_base, bool do_accounting)
 600 {
 601         struct mm_struct *mm;
 602         int ret;
 603
 604         mm = get_task_mm(dma->task);
 605         if (!mm)
 606                 return -ENODEV;
 607
 608         ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base);
 609         if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) {
 610                 ret = vfio_lock_acct(dma, 1, true);
 611                 if (ret) {
 612                         put_pfn(*pfn_base, dma->prot);
 613                         if (ret == -ENOMEM)
 614                                 pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
 615                                         "(%ld) exceeded\n", __func__,
 616                                         dma->task->comm, task_pid_nr(dma->task),
 617                                         task_rlimit(dma->task, RLIMIT_MEMLOCK));
 618                 }
 619         }
 620
 621         mmput(mm);
 622         return ret;
 623 }
 624
 625 static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
 626                                     bool do_accounting)
 627 {
 628         int unlocked;
 629         struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
 630
 631         if (!vpfn)
 632                 return 0;
 633
 634         unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
 635
 636         if (do_accounting)
 637                 vfio_lock_acct(dma, -unlocked, true);
 638
 639         return unlocked;
 640 }
 641
 642 static int vfio_iommu_type1_pin_pages(void *iommu_data,
 643                                       struct iommu_group *iommu_group,
 644                                       unsigned long *user_pfn,
 645                                       int npage, int prot,
 646                                       unsigned long *phys_pfn)
 647 {
 648         struct vfio_iommu *iommu = iommu_data;
 649         struct vfio_group *group;
 650         int i, j, ret;
 651         unsigned long remote_vaddr;
 652         struct vfio_dma *dma;
 653         bool do_accounting;
 654
 655         if (!iommu || !user_pfn || !phys_pfn)
 656                 return -EINVAL;
 657
 658         /* Supported for v2 version only */
 659         if (!iommu->v2)
 660                 return -EACCES;
 661
 662         mutex_lock(&iommu->lock);
 663
 664         /* Fail if notifier list is empty */
 665         if (!iommu->notifier.head) {
 666                 ret = -EINVAL;
 667                 goto pin_done;
 668         }
 669
 670         /*
 671          * If iommu capable domain exist in the container then all pages are
 672          * already pinned and accounted. Accouting should be done if there is no
 673          * iommu capable domain in the container.
 674          */
 675         do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 676
 677         for (i = 0; i < npage; i++) {
 678                 dma_addr_t iova;
 679                 struct vfio_pfn *vpfn;
 680
 681                 iova = user_pfn[i] << PAGE_SHIFT;
 682                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 683                 if (!dma) {
 684                         ret = -EINVAL;
 685                         goto pin_unwind;
 686                 }
 687
 688                 if ((dma->prot & prot) != prot) {
 689                         ret = -EPERM;
 690                         goto pin_unwind;
 691                 }
 692
 693                 vpfn = vfio_iova_get_vfio_pfn(dma, iova);
 694                 if (vpfn) {
 695                         phys_pfn[i] = vpfn->pfn;
 696                         continue;
 697                 }
 698
 699                 remote_vaddr = dma->vaddr + (iova - dma->iova);
 700                 ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i],
 701                                              do_accounting);
 702                 if (ret)
 703                         goto pin_unwind;
 704
 705                 ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]);
 706                 if (ret) {
 707                         if (put_pfn(phys_pfn[i], dma->prot) && do_accounting)
 708                                 vfio_lock_acct(dma, -1, true);
 709                         goto pin_unwind;
 710                 }
 711
 712                 if (iommu->dirty_page_tracking) {
 713                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
 714
 715                         /*
 716                          * Bitmap populated with the smallest supported page
 717                          * size
 718                          */
 719                         bitmap_set(dma->bitmap,
 720                                    (iova - dma->iova) >> pgshift, 1);
 721                 }
 722         }
 723         ret = i;
 724
 725         group = vfio_iommu_find_iommu_group(iommu, iommu_group);
 726         if (!group->pinned_page_dirty_scope) {
 727                 group->pinned_page_dirty_scope = true;
 728                 iommu->num_non_pinned_groups--;
 729         }
 730
 731         goto pin_done;
 732
 733 pin_unwind:
 734         phys_pfn[i] = 0;
 735         for (j = 0; j < i; j++) {
 736                 dma_addr_t iova;
 737
 738                 iova = user_pfn[j] << PAGE_SHIFT;
 739                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 740                 vfio_unpin_page_external(dma, iova, do_accounting);
 741                 phys_pfn[j] = 0;
 742         }
 743 pin_done:
 744         mutex_unlock(&iommu->lock);
 745         return ret;
 746 }
 747
 748 static int vfio_iommu_type1_unpin_pages(void *iommu_data,
 749                                         unsigned long *user_pfn,
 750                                         int npage)
 751 {
 752         struct vfio_iommu *iommu = iommu_data;
 753         bool do_accounting;
 754         int i;
 755
 756         if (!iommu || !user_pfn)
 757                 return -EINVAL;
 758
 759         /* Supported for v2 version only */
 760         if (!iommu->v2)
 761                 return -EACCES;
 762
 763         mutex_lock(&iommu->lock);
 764
 765         do_accounting = !IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu);
 766         for (i = 0; i < npage; i++) {
 767                 struct vfio_dma *dma;
 768                 dma_addr_t iova;
 769
 770                 iova = user_pfn[i] << PAGE_SHIFT;
 771                 dma = vfio_find_dma(iommu, iova, PAGE_SIZE);
 772                 if (!dma)
 773                         goto unpin_exit;
 774                 vfio_unpin_page_external(dma, iova, do_accounting);
 775         }
 776
 777 unpin_exit:
 778         mutex_unlock(&iommu->lock);
 779         return i > npage ? npage : (i > 0 ? i : -EINVAL);
 780 }
 781
 782 static long vfio_sync_unpin(struct vfio_dma *dma, struct vfio_domain *domain,
 783                             struct list_head *regions,
 784                             struct iommu_iotlb_gather *iotlb_gather)
 785 {
 786         long unlocked = 0;
 787         struct vfio_regions *entry, *next;
 788
 789         iommu_iotlb_sync(domain->domain, iotlb_gather);
 790
 791         list_for_each_entry_safe(entry, next, regions, list) {
 792                 unlocked += vfio_unpin_pages_remote(dma,
 793                                                     entry->iova,
 794                                                     entry->phys >> PAGE_SHIFT,
 795                                                     entry->len >> PAGE_SHIFT,
 796                                                     false);
 797                 list_del(&entry->list);
 798                 kfree(entry);
 799         }
 800
 801         cond_resched();
 802
 803         return unlocked;
 804 }
 805
 806 /*
 807  * Generally, VFIO needs to unpin remote pages after each IOTLB flush.
 808  * Therefore, when using IOTLB flush sync interface, VFIO need to keep track
 809  * of these regions (currently using a list).
 810  *
 811  * This value specifies maximum number of regions for each IOTLB flush sync.
 812  */
 813 #define VFIO_IOMMU_TLB_SYNC_MAX         512
 814
 815 static size_t unmap_unpin_fast(struct vfio_domain *domain,
 816                                struct vfio_dma *dma, dma_addr_t *iova,
 817                                size_t len, phys_addr_t phys, long *unlocked,
 818                                struct list_head *unmapped_list,
 819                                int *unmapped_cnt,
 820                                struct iommu_iotlb_gather *iotlb_gather)
 821 {
 822         size_t unmapped = 0;
 823         struct vfio_regions *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
 824
 825         if (entry) {
 826                 unmapped = iommu_unmap_fast(domain->domain, *iova, len,
 827                                             iotlb_gather);
 828
 829                 if (!unmapped) {
 830                         kfree(entry);
 831                 } else {
 832                         entry->iova = *iova;
 833                         entry->phys = phys;
 834                         entry->len  = unmapped;
 835                         list_add_tail(&entry->list, unmapped_list);
 836
 837                         *iova += unmapped;
 838                         (*unmapped_cnt)++;
 839                 }
 840         }
 841
 842         /*
 843          * Sync if the number of fast-unmap regions hits the limit
 844          * or in case of errors.
 845          */
 846         if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX || !unmapped) {
 847                 *unlocked += vfio_sync_unpin(dma, domain, unmapped_list,
 848                                              iotlb_gather);
 849                 *unmapped_cnt = 0;
 850         }
 851
 852         return unmapped;
 853 }
 854
 855 static size_t unmap_unpin_slow(struct vfio_domain *domain,
 856                                struct vfio_dma *dma, dma_addr_t *iova,
 857                                size_t len, phys_addr_t phys,
 858                                long *unlocked)
 859 {
 860         size_t unmapped = iommu_unmap(domain->domain, *iova, len);
 861
 862         if (unmapped) {
 863                 *unlocked += vfio_unpin_pages_remote(dma, *iova,
 864                                                      phys >> PAGE_SHIFT,
 865                                                      unmapped >> PAGE_SHIFT,
 866                                                      false);
 867                 *iova += unmapped;
 868                 cond_resched();
 869         }
 870         return unmapped;
 871 }
 872
 873 static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma,
 874                              bool do_accounting)
 875 {
 876         dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
 877         struct vfio_domain *domain, *d;
 878         LIST_HEAD(unmapped_region_list);
 879         struct iommu_iotlb_gather iotlb_gather;
 880         int unmapped_region_cnt = 0;
 881         long unlocked = 0;
 882
 883         if (!dma->size)
 884                 return 0;
 885
 886         if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
 887                 return 0;
 888
 889         /*
 890          * We use the IOMMU to track the physical addresses, otherwise we'd
 891          * need a much more complicated tracking system.  Unfortunately that
 892          * means we need to use one of the iommu domains to figure out the
 893          * pfns to unpin.  The rest need to be unmapped in advance so we have
 894          * no iommu translations remaining when the pages are unpinned.
 895          */
 896         domain = d = list_first_entry(&iommu->domain_list,
 897                                       struct vfio_domain, next);
 898
 899         list_for_each_entry_continue(d, &iommu->domain_list, next) {
 900                 iommu_unmap(d->domain, dma->iova, dma->size);
 901                 cond_resched();
 902         }
 903
 904         iommu_iotlb_gather_init(&iotlb_gather);
 905         while (iova < end) {
 906                 size_t unmapped, len;
 907                 phys_addr_t phys, next;
 908
 909                 phys = iommu_iova_to_phys(domain->domain, iova);
 910                 if (WARN_ON(!phys)) {
 911                         iova += PAGE_SIZE;
 912                         continue;
 913                 }
 914
 915                 /*
 916                  * To optimize for fewer iommu_unmap() calls, each of which
 917                  * may require hardware cache flushing, try to find the
 918                  * largest contiguous physical memory chunk to unmap.
 919                  */
 920                 for (len = PAGE_SIZE;
 921                      !domain->fgsp && iova + len < end; len += PAGE_SIZE) {
 922                         next = iommu_iova_to_phys(domain->domain, iova + len);
 923                         if (next != phys + len)
 924                                 break;
 925                 }
 926
 927                 /*
 928                  * First, try to use fast unmap/unpin. In case of failure,
 929                  * switch to slow unmap/unpin path.
 930                  */
 931                 unmapped = unmap_unpin_fast(domain, dma, &iova, len, phys,
 932                                             &unlocked, &unmapped_region_list,
 933                                             &unmapped_region_cnt,
 934                                             &iotlb_gather);
 935                 if (!unmapped) {
 936                         unmapped = unmap_unpin_slow(domain, dma, &iova, len,
 937                                                     phys, &unlocked);
 938                         if (WARN_ON(!unmapped))
 939                                 break;
 940                 }
 941         }
 942
 943         dma->iommu_mapped = false;
 944
 945         if (unmapped_region_cnt) {
 946                 unlocked += vfio_sync_unpin(dma, domain, &unmapped_region_list,
 947                                             &iotlb_gather);
 948         }
 949
 950         if (do_accounting) {
 951                 vfio_lock_acct(dma, -unlocked, true);
 952                 return 0;
 953         }
 954         return unlocked;
 955 }
 956
 957 static void vfio_remove_dma(struct vfio_iommu *iommu, struct vfio_dma *dma)
 958 {
 959         WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
 960         vfio_unmap_unpin(iommu, dma, true);
 961         vfio_unlink_dma(iommu, dma);
 962         put_task_struct(dma->task);
 963         vfio_dma_bitmap_free(dma);
 964         kfree(dma);
 965         iommu->dma_avail++;
 966 }
 967
 968 static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
 969 {
 970         struct vfio_domain *domain;
 971
 972         iommu->pgsize_bitmap = ULONG_MAX;
 973
 974         list_for_each_entry(domain, &iommu->domain_list, next)
 975                 iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
 976
 977         /*
 978          * In case the IOMMU supports page sizes smaller than PAGE_SIZE
 979          * we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
 980          * That way the user will be able to map/unmap buffers whose size/
 981          * start address is aligned with PAGE_SIZE. Pinning code uses that
 982          * granularity while iommu driver can use the sub-PAGE_SIZE size
 983          * to map the buffer.
 984          */
 985         if (iommu->pgsize_bitmap & ~PAGE_MASK) {
 986                 iommu->pgsize_bitmap &= PAGE_MASK;
 987                 iommu->pgsize_bitmap |= PAGE_SIZE;
 988         }
 989 }
 990
 991 static int update_user_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
 992                               struct vfio_dma *dma, dma_addr_t base_iova,
 993                               size_t pgsize)
 994 {
 995         unsigned long pgshift = __ffs(pgsize);
 996         unsigned long nbits = dma->size >> pgshift;
 997         unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
 998         unsigned long copy_offset = bit_offset / BITS_PER_LONG;
 999         unsigned long shift = bit_offset % BITS_PER_LONG;
1000         unsigned long leftover;
1001
1002         /*
1003          * mark all pages dirty if any IOMMU capable device is not able
1004          * to report dirty pages and all pages are pinned and mapped.
1005          */
1006         if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1007                 bitmap_set(dma->bitmap, 0, nbits);
1008
1009         if (shift) {
1010                 bitmap_shift_left(dma->bitmap, dma->bitmap, shift,
1011                                   nbits + shift);
1012
1013                 if (copy_from_user(&leftover,
1014                                    (void __user *)(bitmap + copy_offset),
1015                                    sizeof(leftover)))
1016                         return -EFAULT;
1017
1018                 bitmap_or(dma->bitmap, dma->bitmap, &leftover, shift);
1019         }
1020
1021         if (copy_to_user((void __user *)(bitmap + copy_offset), dma->bitmap,
1022                          DIRTY_BITMAP_BYTES(nbits + shift)))
1023                 return -EFAULT;
1024
1025         return 0;
1026 }
1027
1028 static int vfio_iova_dirty_bitmap(u64 __user *bitmap, struct vfio_iommu *iommu,
1029                                   dma_addr_t iova, size_t size, size_t pgsize)
1030 {
1031         struct vfio_dma *dma;
1032         struct rb_node *n;
1033         unsigned long pgshift = __ffs(pgsize);
1034         int ret;
1035
1036         /*
1037          * GET_BITMAP request must fully cover vfio_dma mappings.  Multiple
1038          * vfio_dma mappings may be clubbed by specifying large ranges, but
1039          * there must not be any previous mappings bisected by the range.
1040          * An error will be returned if these conditions are not met.
1041          */
1042         dma = vfio_find_dma(iommu, iova, 1);
1043         if (dma && dma->iova != iova)
1044                 return -EINVAL;
1045
1046         dma = vfio_find_dma(iommu, iova + size - 1, 0);
1047         if (dma && dma->iova + dma->size != iova + size)
1048                 return -EINVAL;
1049
1050         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1051                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1052
1053                 if (dma->iova < iova)
1054                         continue;
1055
1056                 if (dma->iova > iova + size - 1)
1057                         break;
1058
1059                 ret = update_user_bitmap(bitmap, iommu, dma, iova, pgsize);
1060                 if (ret)
1061                         return ret;
1062
1063                 /*
1064                  * Re-populate bitmap to include all pinned pages which are
1065                  * considered as dirty but exclude pages which are unpinned and
1066                  * pages which are marked dirty by vfio_dma_rw()
1067                  */
1068                 bitmap_clear(dma->bitmap, 0, dma->size >> pgshift);
1069                 vfio_dma_populate_bitmap(dma, pgsize);
1070         }
1071         return 0;
1072 }
1073
1074 static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1075 {
1076         if (!npages || !bitmap_size || (bitmap_size > DIRTY_BITMAP_SIZE_MAX) ||
1077             (bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1078                 return -EINVAL;
1079
1080         return 0;
1081 }
1082
1083 static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1084                              struct vfio_iommu_type1_dma_unmap *unmap,
1085                              struct vfio_bitmap *bitmap)
1086 {
1087         struct vfio_dma *dma, *dma_last = NULL;
1088         size_t unmapped = 0, pgsize;
1089         int ret = 0, retries = 0;
1090         unsigned long pgshift;
1091
1092         mutex_lock(&iommu->lock);
1093
1094         pgshift = __ffs(iommu->pgsize_bitmap);
1095         pgsize = (size_t)1 << pgshift;
1096
1097         if (unmap->iova & (pgsize - 1)) {
1098                 ret = -EINVAL;
1099                 goto unlock;
1100         }
1101
1102         if (!unmap->size || unmap->size & (pgsize - 1)) {
1103                 ret = -EINVAL;
1104                 goto unlock;
1105         }
1106
1107         if (unmap->iova + unmap->size - 1 < unmap->iova ||
1108             unmap->size > SIZE_MAX) {
1109                 ret = -EINVAL;
1110                 goto unlock;
1111         }
1112
1113         /* When dirty tracking is enabled, allow only min supported pgsize */
1114         if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1115             (!iommu->dirty_page_tracking || (bitmap->pgsize != pgsize))) {
1116                 ret = -EINVAL;
1117                 goto unlock;
1118         }
1119
1120         WARN_ON((pgsize - 1) & PAGE_MASK);
1121 again:
1122         /*
1123          * vfio-iommu-type1 (v1) - User mappings were coalesced together to
1124          * avoid tracking individual mappings.  This means that the granularity
1125          * of the original mapping was lost and the user was allowed to attempt
1126          * to unmap any range.  Depending on the contiguousness of physical
1127          * memory and page sizes supported by the IOMMU, arbitrary unmaps may
1128          * or may not have worked.  We only guaranteed unmap granularity
1129          * matching the original mapping; even though it was untracked here,
1130          * the original mappings are reflected in IOMMU mappings.  This
1131          * resulted in a couple unusual behaviors.  First, if a range is not
1132          * able to be unmapped, ex. a set of 4k pages that was mapped as a
1133          * 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1134          * a zero sized unmap.  Also, if an unmap request overlaps the first
1135          * address of a hugepage, the IOMMU will unmap the entire hugepage.
1136          * This also returns success and the returned unmap size reflects the
1137          * actual size unmapped.
1138          *
1139          * We attempt to maintain compatibility with this "v1" interface, but
1140          * we take control out of the hands of the IOMMU.  Therefore, an unmap
1141          * request offset from the beginning of the original mapping will
1142          * return success with zero sized unmap.  And an unmap request covering
1143          * the first iova of mapping will unmap the entire range.
1144          *
1145          * The v2 version of this interface intends to be more deterministic.
1146          * Unmap requests must fully cover previous mappings.  Multiple
1147          * mappings may still be unmaped by specifying large ranges, but there
1148          * must not be any previous mappings bisected by the range.  An error
1149          * will be returned if these conditions are not met.  The v2 interface
1150          * will only return success and a size of zero if there were no
1151          * mappings within the range.
1152          */
1153         if (iommu->v2) {
1154                 dma = vfio_find_dma(iommu, unmap->iova, 1);
1155                 if (dma && dma->iova != unmap->iova) {
1156                         ret = -EINVAL;
1157                         goto unlock;
1158                 }
1159                 dma = vfio_find_dma(iommu, unmap->iova + unmap->size - 1, 0);
1160                 if (dma && dma->iova + dma->size != unmap->iova + unmap->size) {
1161                         ret = -EINVAL;
1162                         goto unlock;
1163                 }
1164         }
1165
1166         while ((dma = vfio_find_dma(iommu, unmap->iova, unmap->size))) {
1167                 if (!iommu->v2 && unmap->iova > dma->iova)
1168                         break;
1169                 /*
1170                  * Task with same address space who mapped this iova range is
1171                  * allowed to unmap the iova range.
1172                  */
1173                 if (dma->task->mm != current->mm)
1174                         break;
1175
1176                 if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1177                         struct vfio_iommu_type1_dma_unmap nb_unmap;
1178
1179                         if (dma_last == dma) {
1180                                 BUG_ON(++retries > 10);
1181                         } else {
1182                                 dma_last = dma;
1183                                 retries = 0;
1184                         }
1185
1186                         nb_unmap.iova = dma->iova;
1187                         nb_unmap.size = dma->size;
1188
1189                         /*
1190                          * Notify anyone (mdev vendor drivers) to invalidate and
1191                          * unmap iovas within the range we're about to unmap.
1192                          * Vendor drivers MUST unpin pages in response to an
1193                          * invalidation.
1194                          */
1195                         mutex_unlock(&iommu->lock);
1196                         blocking_notifier_call_chain(&iommu->notifier,
1197                                                     VFIO_IOMMU_NOTIFY_DMA_UNMAP,
1198                                                     &nb_unmap);
1199                         mutex_lock(&iommu->lock);
1200                         goto again;
1201                 }
1202
1203                 if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1204                         ret = update_user_bitmap(bitmap->data, iommu, dma,
1205                                                  unmap->iova, pgsize);
1206                         if (ret)
1207                                 break;
1208                 }
1209
1210                 unmapped += dma->size;
1211                 vfio_remove_dma(iommu, dma);
1212         }
1213
1214 unlock:
1215         mutex_unlock(&iommu->lock);
1216
1217         /* Report how much was unmapped */
1218         unmap->size = unmapped;
1219
1220         return ret;
1221 }
1222
1223 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1224                           unsigned long pfn, long npage, int prot)
1225 {
1226         struct vfio_domain *d;
1227         int ret;
1228
1229         list_for_each_entry(d, &iommu->domain_list, next) {
1230                 ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
1231                                 npage << PAGE_SHIFT, prot | d->prot);
1232                 if (ret)
1233                         goto unwind;
1234
1235                 cond_resched();
1236         }
1237
1238         return 0;
1239
1240 unwind:
1241         list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1242                 iommu_unmap(d->domain, iova, npage << PAGE_SHIFT);
1243                 cond_resched();
1244         }
1245
1246         return ret;
1247 }
1248
1249 static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma,
1250                             size_t map_size)
1251 {
1252         dma_addr_t iova = dma->iova;
1253         unsigned long vaddr = dma->vaddr;
1254         size_t size = map_size;
1255         long npage;
1256         unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1257         int ret = 0;
1258
1259         while (size) {
1260                 /* Pin a contiguous chunk of memory */
1261                 npage = vfio_pin_pages_remote(dma, vaddr + dma->size,
1262                                               size >> PAGE_SHIFT, &pfn, limit);
1263                 if (npage <= 0) {
1264                         WARN_ON(!npage);
1265                         ret = (int)npage;
1266                         break;
1267                 }
1268
1269                 /* Map it! */
1270                 ret = vfio_iommu_map(iommu, iova + dma->size, pfn, npage,
1271                                      dma->prot);
1272                 if (ret) {
1273                         vfio_unpin_pages_remote(dma, iova + dma->size, pfn,
1274                                                 npage, true);
1275                         break;
1276                 }
1277
1278                 size -= npage << PAGE_SHIFT;
1279                 dma->size += npage << PAGE_SHIFT;
1280         }
1281
1282         dma->iommu_mapped = true;
1283
1284         if (ret)
1285                 vfio_remove_dma(iommu, dma);
1286
1287         return ret;
1288 }
1289
1290 /*
1291  * Check dma map request is within a valid iova range
1292  */
1293 static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1294                                       dma_addr_t start, dma_addr_t end)
1295 {
1296         struct list_head *iova = &iommu->iova_list;
1297         struct vfio_iova *node;
1298
1299         list_for_each_entry(node, iova, list) {
1300                 if (start >= node->start && end <= node->end)
1301                         return true;
1302         }
1303
1304         /*
1305          * Check for list_empty() as well since a container with
1306          * a single mdev device will have an empty list.
1307          */
1308         return list_empty(iova);
1309 }
1310
1311 static int vfio_dma_do_map(struct vfio_iommu *iommu,
1312                            struct vfio_iommu_type1_dma_map *map)
1313 {
1314         dma_addr_t iova = map->iova;
1315         unsigned long vaddr = map->vaddr;
1316         size_t size = map->size;
1317         int ret = 0, prot = 0;
1318         size_t pgsize;
1319         struct vfio_dma *dma;
1320
1321         /* Verify that none of our __u64 fields overflow */
1322         if (map->size != size || map->vaddr != vaddr || map->iova != iova)
1323                 return -EINVAL;
1324
1325         /* READ/WRITE from device perspective */
1326         if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1327                 prot |= IOMMU_WRITE;
1328         if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1329                 prot |= IOMMU_READ;
1330
1331         mutex_lock(&iommu->lock);
1332
1333         pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
1334
1335         WARN_ON((pgsize - 1) & PAGE_MASK);
1336
1337         if (!prot || !size || (size | iova | vaddr) & (pgsize - 1)) {
1338                 ret = -EINVAL;
1339                 goto out_unlock;
1340         }
1341
1342         /* Don't allow IOVA or virtual address wrap */
1343         if (iova + size - 1 < iova || vaddr + size - 1 < vaddr) {
1344                 ret = -EINVAL;
1345                 goto out_unlock;
1346         }
1347
1348         if (vfio_find_dma(iommu, iova, size)) {
1349                 ret = -EEXIST;
1350                 goto out_unlock;
1351         }
1352
1353         if (!iommu->dma_avail) {
1354                 ret = -ENOSPC;
1355                 goto out_unlock;
1356         }
1357
1358         if (!vfio_iommu_iova_dma_valid(iommu, iova, iova + size - 1)) {
1359                 ret = -EINVAL;
1360                 goto out_unlock;
1361         }
1362
1363         dma = kzalloc(sizeof(*dma), GFP_KERNEL);
1364         if (!dma) {
1365                 ret = -ENOMEM;
1366                 goto out_unlock;
1367         }
1368
1369         iommu->dma_avail--;
1370         dma->iova = iova;
1371         dma->vaddr = vaddr;
1372         dma->prot = prot;
1373
1374         /*
1375          * We need to be able to both add to a task's locked memory and test
1376          * against the locked memory limit and we need to be able to do both
1377          * outside of this call path as pinning can be asynchronous via the
1378          * external interfaces for mdev devices.  RLIMIT_MEMLOCK requires a
1379          * task_struct and VM locked pages requires an mm_struct, however
1380          * holding an indefinite mm reference is not recommended, therefore we
1381          * only hold a reference to a task.  We could hold a reference to
1382          * current, however QEMU uses this call path through vCPU threads,
1383          * which can be killed resulting in a NULL mm and failure in the unmap
1384          * path when called via a different thread.  Avoid this problem by
1385          * using the group_leader as threads within the same group require
1386          * both CLONE_THREAD and CLONE_VM and will therefore use the same
1387          * mm_struct.
1388          *
1389          * Previously we also used the task for testing CAP_IPC_LOCK at the
1390          * time of pinning and accounting, however has_capability() makes use
1391          * of real_cred, a copy-on-write field, so we can't guarantee that it
1392          * matches group_leader, or in fact that it might not change by the
1393          * time it's evaluated.  If a process were to call MAP_DMA with
1394          * CAP_IPC_LOCK but later drop it, it doesn't make sense that they
1395          * possibly see different results for an iommu_mapped vfio_dma vs
1396          * externally mapped.  Therefore track CAP_IPC_LOCK in vfio_dma at the
1397          * time of calling MAP_DMA.
1398          */
1399         get_task_struct(current->group_leader);
1400         dma->task = current->group_leader;
1401         dma->lock_cap = capable(CAP_IPC_LOCK);
1402
1403         dma->pfn_list = RB_ROOT;
1404
1405         /* Insert zero-sized and grow as we map chunks of it */
1406         vfio_link_dma(iommu, dma);
1407
1408         /* Don't pin and map if container doesn't contain IOMMU capable domain*/
1409         if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu))
1410                 dma->size = size;
1411         else
1412                 ret = vfio_pin_map_dma(iommu, dma, size);
1413
1414         if (!ret && iommu->dirty_page_tracking) {
1415                 ret = vfio_dma_bitmap_alloc(dma, pgsize);
1416                 if (ret)
1417                         vfio_remove_dma(iommu, dma);
1418         }
1419
1420 out_unlock:
1421         mutex_unlock(&iommu->lock);
1422         return ret;
1423 }
1424
1425 static int vfio_bus_type(struct device *dev, void *data)
1426 {
1427         struct bus_type **bus = data;
1428
1429         if (*bus && *bus != dev->bus)
1430                 return -EINVAL;
1431
1432         *bus = dev->bus;
1433
1434         return 0;
1435 }
1436
1437 static int vfio_iommu_replay(struct vfio_iommu *iommu,
1438                              struct vfio_domain *domain)
1439 {
1440         struct vfio_domain *d = NULL;
1441         struct rb_node *n;
1442         unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1443         int ret;
1444
1445         /* Arbitrarily pick the first domain in the list for lookups */
1446         if (!list_empty(&iommu->domain_list))
1447                 d = list_first_entry(&iommu->domain_list,
1448                                      struct vfio_domain, next);
1449
1450         n = rb_first(&iommu->dma_list);
1451
1452         for (; n; n = rb_next(n)) {
1453                 struct vfio_dma *dma;
1454                 dma_addr_t iova;
1455
1456                 dma = rb_entry(n, struct vfio_dma, node);
1457                 iova = dma->iova;
1458
1459                 while (iova < dma->iova + dma->size) {
1460                         phys_addr_t phys;
1461                         size_t size;
1462
1463                         if (dma->iommu_mapped) {
1464                                 phys_addr_t p;
1465                                 dma_addr_t i;
1466
1467                                 if (WARN_ON(!d)) { /* mapped w/o a domain?! */
1468                                         ret = -EINVAL;
1469                                         goto unwind;
1470                                 }
1471
1472                                 phys = iommu_iova_to_phys(d->domain, iova);
1473
1474                                 if (WARN_ON(!phys)) {
1475                                         iova += PAGE_SIZE;
1476                                         continue;
1477                                 }
1478
1479                                 size = PAGE_SIZE;
1480                                 p = phys + size;
1481                                 i = iova + size;
1482                                 while (i < dma->iova + dma->size &&
1483                                        p == iommu_iova_to_phys(d->domain, i)) {
1484                                         size += PAGE_SIZE;
1485                                         p += PAGE_SIZE;
1486                                         i += PAGE_SIZE;
1487                                 }
1488                         } else {
1489                                 unsigned long pfn;
1490                                 unsigned long vaddr = dma->vaddr +
1491                                                      (iova - dma->iova);
1492                                 size_t n = dma->iova + dma->size - iova;
1493                                 long npage;
1494
1495                                 npage = vfio_pin_pages_remote(dma, vaddr,
1496                                                               n >> PAGE_SHIFT,
1497                                                               &pfn, limit);
1498                                 if (npage <= 0) {
1499                                         WARN_ON(!npage);
1500                                         ret = (int)npage;
1501                                         goto unwind;
1502                                 }
1503
1504                                 phys = pfn << PAGE_SHIFT;
1505                                 size = npage << PAGE_SHIFT;
1506                         }
1507
1508                         ret = iommu_map(domain->domain, iova, phys,
1509                                         size, dma->prot | domain->prot);
1510                         if (ret) {
1511                                 if (!dma->iommu_mapped)
1512                                         vfio_unpin_pages_remote(dma, iova,
1513                                                         phys >> PAGE_SHIFT,
1514                                                         size >> PAGE_SHIFT,
1515                                                         true);
1516                                 goto unwind;
1517                         }
1518
1519                         iova += size;
1520                 }
1521         }
1522
1523         /* All dmas are now mapped, defer to second tree walk for unwind */
1524         for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1525                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1526
1527                 dma->iommu_mapped = true;
1528         }
1529
1530         return 0;
1531
1532 unwind:
1533         for (; n; n = rb_prev(n)) {
1534                 struct vfio_dma *dma = rb_entry(n, struct vfio_dma, node);
1535                 dma_addr_t iova;
1536
1537                 if (dma->iommu_mapped) {
1538                         iommu_unmap(domain->domain, dma->iova, dma->size);
1539                         continue;
1540                 }
1541
1542                 iova = dma->iova;
1543                 while (iova < dma->iova + dma->size) {
1544                         phys_addr_t phys, p;
1545                         size_t size;
1546                         dma_addr_t i;
1547
1548                         phys = iommu_iova_to_phys(domain->domain, iova);
1549                         if (!phys) {
1550                                 iova += PAGE_SIZE;
1551                                 continue;
1552                         }
1553
1554                         size = PAGE_SIZE;
1555                         p = phys + size;
1556                         i = iova + size;
1557                         while (i < dma->iova + dma->size &&
1558                                p == iommu_iova_to_phys(domain->domain, i)) {
1559                                 size += PAGE_SIZE;
1560                                 p += PAGE_SIZE;
1561                                 i += PAGE_SIZE;
1562                         }
1563
1564                         iommu_unmap(domain->domain, iova, size);
1565                         vfio_unpin_pages_remote(dma, iova, phys >> PAGE_SHIFT,
1566                                                 size >> PAGE_SHIFT, true);
1567                 }
1568         }
1569
1570         return ret;
1571 }
1572
1573 /*
1574  * We change our unmap behavior slightly depending on whether the IOMMU
1575  * supports fine-grained superpages.  IOMMUs like AMD-Vi will use a superpage
1576  * for practically any contiguous power-of-two mapping we give it.  This means
1577  * we don't need to look for contiguous chunks ourselves to make unmapping
1578  * more efficient.  On IOMMUs with coarse-grained super pages, like Intel VT-d
1579  * with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1580  * significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1581  * hugetlbfs is in use.
1582  */
1583 static void vfio_test_domain_fgsp(struct vfio_domain *domain)
1584 {
1585         struct page *pages;
1586         int ret, order = get_order(PAGE_SIZE * 2);
1587
1588         pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
1589         if (!pages)
1590                 return;
1591
1592         ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
1593                         IOMMU_READ | IOMMU_WRITE | domain->prot);
1594         if (!ret) {
1595                 size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
1596
1597                 if (unmapped == PAGE_SIZE)
1598                         iommu_unmap(domain->domain, PAGE_SIZE, PAGE_SIZE);
1599                 else
1600                         domain->fgsp = true;
1601         }
1602
1603         __free_pages(pages, order);
1604 }
1605
1606 static struct vfio_group *find_iommu_group(struct vfio_domain *domain,
1607                                            struct iommu_group *iommu_group)
1608 {
1609         struct vfio_group *g;
1610
1611         list_for_each_entry(g, &domain->group_list, next) {
1612                 if (g->iommu_group == iommu_group)
1613                         return g;
1614         }
1615
1616         return NULL;
1617 }
1618
1619 static struct vfio_group *vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1620                                                struct iommu_group *iommu_group)
1621 {
1622         struct vfio_domain *domain;
1623         struct vfio_group *group = NULL;
1624
1625         list_for_each_entry(domain, &iommu->domain_list, next) {
1626                 group = find_iommu_group(domain, iommu_group);
1627                 if (group)
1628                         return group;
1629         }
1630
1631         if (iommu->external_domain)
1632                 group = find_iommu_group(iommu->external_domain, iommu_group);
1633
1634         return group;
1635 }
1636
1637 static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1638                                   phys_addr_t *base)
1639 {
1640         struct iommu_resv_region *region;
1641         bool ret = false;
1642
1643         list_for_each_entry(region, group_resv_regions, list) {
1644                 /*
1645                  * The presence of any 'real' MSI regions should take
1646                  * precedence over the software-managed one if the
1647                  * IOMMU driver happens to advertise both types.
1648                  */
1649                 if (region->type == IOMMU_RESV_MSI) {
1650                         ret = false;
1651                         break;
1652                 }
1653
1654                 if (region->type == IOMMU_RESV_SW_MSI) {
1655                         *base = region->start;
1656                         ret = true;
1657                 }
1658         }
1659
1660         return ret;
1661 }
1662
1663 static struct device *vfio_mdev_get_iommu_device(struct device *dev)
1664 {
1665         struct device *(*fn)(struct device *dev);
1666         struct device *iommu_device;
1667
1668         fn = symbol_get(mdev_get_iommu_device);
1669         if (fn) {
1670                 iommu_device = fn(dev);
1671                 symbol_put(mdev_get_iommu_device);
1672
1673                 return iommu_device;
1674         }
1675
1676         return NULL;
1677 }
1678
1679 static int vfio_mdev_attach_domain(struct device *dev, void *data)
1680 {
1681         struct iommu_domain *domain = data;
1682         struct device *iommu_device;
1683
1684         iommu_device = vfio_mdev_get_iommu_device(dev);
1685         if (iommu_device) {
1686                 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1687                         return iommu_aux_attach_device(domain, iommu_device);
1688                 else
1689                         return iommu_attach_device(domain, iommu_device);
1690         }
1691
1692         return -EINVAL;
1693 }
1694
1695 static int vfio_mdev_detach_domain(struct device *dev, void *data)
1696 {
1697         struct iommu_domain *domain = data;
1698         struct device *iommu_device;
1699
1700         iommu_device = vfio_mdev_get_iommu_device(dev);
1701         if (iommu_device) {
1702                 if (iommu_dev_feature_enabled(iommu_device, IOMMU_DEV_FEAT_AUX))
1703                         iommu_aux_detach_device(domain, iommu_device);
1704                 else
1705                         iommu_detach_device(domain, iommu_device);
1706         }
1707
1708         return 0;
1709 }
1710
1711 static int vfio_iommu_attach_group(struct vfio_domain *domain,
1712                                    struct vfio_group *group)
1713 {
1714         if (group->mdev_group)
1715                 return iommu_group_for_each_dev(group->iommu_group,
1716                                                 domain->domain,
1717                                                 vfio_mdev_attach_domain);
1718         else
1719                 return iommu_attach_group(domain->domain, group->iommu_group);
1720 }
1721
1722 static void vfio_iommu_detach_group(struct vfio_domain *domain,
1723                                     struct vfio_group *group)
1724 {
1725         if (group->mdev_group)
1726                 iommu_group_for_each_dev(group->iommu_group, domain->domain,
1727                                          vfio_mdev_detach_domain);
1728         else
1729                 iommu_detach_group(domain->domain, group->iommu_group);
1730 }
1731
1732 static bool vfio_bus_is_mdev(struct bus_type *bus)
1733 {
1734         struct bus_type *mdev_bus;
1735         bool ret = false;
1736
1737         mdev_bus = symbol_get(mdev_bus_type);
1738         if (mdev_bus) {
1739                 ret = (bus == mdev_bus);
1740                 symbol_put(mdev_bus_type);
1741         }
1742
1743         return ret;
1744 }
1745
1746 static int vfio_mdev_iommu_device(struct device *dev, void *data)
1747 {
1748         struct device **old = data, *new;
1749
1750         new = vfio_mdev_get_iommu_device(dev);
1751         if (!new || (*old && *old != new))
1752                 return -EINVAL;
1753
1754         *old = new;
1755
1756         return 0;
1757 }
1758
1759 /*
1760  * This is a helper function to insert an address range to iova list.
1761  * The list is initially created with a single entry corresponding to
1762  * the IOMMU domain geometry to which the device group is attached.
1763  * The list aperture gets modified when a new domain is added to the
1764  * container if the new aperture doesn't conflict with the current one
1765  * or with any existing dma mappings. The list is also modified to
1766  * exclude any reserved regions associated with the device group.
1767  */
1768 static int vfio_iommu_iova_insert(struct list_head *head,
1769                                   dma_addr_t start, dma_addr_t end)
1770 {
1771         struct vfio_iova *region;
1772
1773         region = kmalloc(sizeof(*region), GFP_KERNEL);
1774         if (!region)
1775                 return -ENOMEM;
1776
1777         INIT_LIST_HEAD(&region->list);
1778         region->start = start;
1779         region->end = end;
1780
1781         list_add_tail(&region->list, head);
1782         return 0;
1783 }
1784
1785 /*
1786  * Check the new iommu aperture conflicts with existing aper or with any
1787  * existing dma mappings.
1788  */
1789 static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1790                                      dma_addr_t start, dma_addr_t end)
1791 {
1792         struct vfio_iova *first, *last;
1793         struct list_head *iova = &iommu->iova_list;
1794
1795         if (list_empty(iova))
1796                 return false;
1797
1798         /* Disjoint sets, return conflict */
1799         first = list_first_entry(iova, struct vfio_iova, list);
1800         last = list_last_entry(iova, struct vfio_iova, list);
1801         if (start > last->end || end < first->start)
1802                 return true;
1803
1804         /* Check for any existing dma mappings below the new start */
1805         if (start > first->start) {
1806                 if (vfio_find_dma(iommu, first->start, start - first->start))
1807                         return true;
1808         }
1809
1810         /* Check for any existing dma mappings beyond the new end */
1811         if (end < last->end) {
1812                 if (vfio_find_dma(iommu, end + 1, last->end - end))
1813                         return true;
1814         }
1815
1816         return false;
1817 }
1818
1819 /*
1820  * Resize iommu iova aperture window. This is called only if the new
1821  * aperture has no conflict with existing aperture and dma mappings.
1822  */
1823 static int vfio_iommu_aper_resize(struct list_head *iova,
1824                                   dma_addr_t start, dma_addr_t end)
1825 {
1826         struct vfio_iova *node, *next;
1827
1828         if (list_empty(iova))
1829                 return vfio_iommu_iova_insert(iova, start, end);
1830
1831         /* Adjust iova list start */
1832         list_for_each_entry_safe(node, next, iova, list) {
1833                 if (start < node->start)
1834                         break;
1835                 if (start >= node->start && start < node->end) {
1836                         node->start = start;
1837                         break;
1838                 }
1839                 /* Delete nodes before new start */
1840                 list_del(&node->list);
1841                 kfree(node);
1842         }
1843
1844         /* Adjust iova list end */
1845         list_for_each_entry_safe(node, next, iova, list) {
1846                 if (end > node->end)
1847                         continue;
1848                 if (end > node->start && end <= node->end) {
1849                         node->end = end;
1850                         continue;
1851                 }
1852                 /* Delete nodes after new end */
1853                 list_del(&node->list);
1854                 kfree(node);
1855         }
1856
1857         return 0;
1858 }
1859
1860 /*
1861  * Check reserved region conflicts with existing dma mappings
1862  */
1863 static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
1864                                      struct list_head *resv_regions)
1865 {
1866         struct iommu_resv_region *region;
1867
1868         /* Check for conflict with existing dma mappings */
1869         list_for_each_entry(region, resv_regions, list) {
1870                 if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
1871                         continue;
1872
1873                 if (vfio_find_dma(iommu, region->start, region->length))
1874                         return true;
1875         }
1876
1877         return false;
1878 }
1879
/*
 * Check the iova list for overlap with reserved regions and exclude the
 * reserved ranges from the valid iommu iova range.
 *
 * @iova: list of struct vfio_iova, modified in place
 * @resv_regions: list of struct iommu_resv_region to carve out
 *
 * Regions of type IOMMU_RESV_DIRECT_RELAXABLE are skipped.
 *
 * Returns 0 on success, the error from vfio_iommu_iova_insert() if a node
 * cannot be allocated (the list may then be left partially updated), or
 * -EINVAL if no valid iova range is left once all regions are excluded.
 */
static int vfio_iommu_resv_exclude(struct list_head *iova,
                                   struct list_head *resv_regions)
{
        struct iommu_resv_region *resv;
        struct vfio_iova *n, *next;

        list_for_each_entry(resv, resv_regions, list) {
                phys_addr_t start, end;

                if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
                        continue;

                /* Inclusive bounds of the reserved range */
                start = resv->start;
                end = resv->start + resv->length - 1;

                list_for_each_entry_safe(n, next, iova, list) {
                        int ret = 0;

                        /* No overlap */
                        if (start > n->end || end < n->start)
                                continue;
                        /*
                         * Insert a new node if current node overlaps with the
                         * reserve region to exclude that from valid iova range.
                         * Note that, new node is inserted before the current
                         * node and finally the current node is deleted keeping
                         * the list updated and sorted.
                         */
                        /* Keep the part of the node below the reserved range */
                        if (start > n->start)
                                ret = vfio_iommu_iova_insert(&n->list, n->start,
                                                             start - 1);
                        /* Keep the part of the node above the reserved range */
                        if (!ret && end < n->end)
                                ret = vfio_iommu_iova_insert(&n->list, end + 1,
                                                             n->end);
                        if (ret)
                                return ret;

                        list_del(&n->list);
                        kfree(n);
                }
        }

        /* Everything was reserved, nothing usable remains */
        if (list_empty(iova))
                return -EINVAL;

        return 0;
}
1931
1932 static void vfio_iommu_resv_free(struct list_head *resv_regions)
1933 {
1934         struct iommu_resv_region *n, *next;
1935
1936         list_for_each_entry_safe(n, next, resv_regions, list) {
1937                 list_del(&n->list);
1938                 kfree(n);
1939         }
1940 }
1941
1942 static void vfio_iommu_iova_free(struct list_head *iova)
1943 {
1944         struct vfio_iova *n, *next;
1945
1946         list_for_each_entry_safe(n, next, iova, list) {
1947                 list_del(&n->list);
1948                 kfree(n);
1949         }
1950 }
1951
1952 static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
1953                                     struct list_head *iova_copy)
1954 {
1955         struct list_head *iova = &iommu->iova_list;
1956         struct vfio_iova *n;
1957         int ret;
1958
1959         list_for_each_entry(n, iova, list) {
1960                 ret = vfio_iommu_iova_insert(iova_copy, n->start, n->end);
1961                 if (ret)
1962                         goto out_free;
1963         }
1964
1965         return 0;
1966
1967 out_free:
1968         vfio_iommu_iova_free(iova_copy);
1969         return ret;
1970 }
1971
1972 static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
1973                                         struct list_head *iova_copy)
1974 {
1975         struct list_head *iova = &iommu->iova_list;
1976
1977         vfio_iommu_iova_free(iova);
1978
1979         list_splice_tail(iova_copy, iova);
1980 }
1981
/*
 * Attach @iommu_group to the container described by @iommu_data.
 *
 * @iommu_data: the struct vfio_iommu of the container
 * @iommu_group: the group to add
 *
 * iommu->lock is held for the whole operation.  A new vfio_group and
 * vfio_domain are allocated up front.  mdev groups for which no single
 * backing IOMMU device can be determined are tracked on
 * iommu->external_domain and the function returns early.  For all other
 * groups an IOMMU domain is allocated and attached, the aperture and
 * reserved regions are validated against the current iova list and dma
 * mappings, and the group is then either moved onto an existing compatible
 * domain or the new domain is added to iommu->domain_list after replaying
 * the current mappings into it.  The container's iova list is only
 * replaced once everything has succeeded.
 *
 * Returns 0 on success.  Errors returned directly from here are -EINVAL
 * (group already attached, aperture or reserved region conflict), -ENOMEM,
 * -EIO (domain allocation failed) and -EPERM (no interrupt remapping and
 * allow_unsafe_interrupts not set); errors from the called helpers are
 * passed through.
 */
static int vfio_iommu_type1_attach_group(void *iommu_data,
                                         struct iommu_group *iommu_group)
{
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_group *group;
        struct vfio_domain *domain, *d;
        struct bus_type *bus = NULL;
        int ret;
        bool resv_msi, msi_remap;
        phys_addr_t resv_msi_base = 0;
        struct iommu_domain_geometry geo;
        LIST_HEAD(iova_copy);
        LIST_HEAD(group_resv_regions);

        mutex_lock(&iommu->lock);

        /* Check for duplicates */
        if (vfio_iommu_find_iommu_group(iommu, iommu_group)) {
                mutex_unlock(&iommu->lock);
                return -EINVAL;
        }

        /* Both are freed at out_free; kfree() copes with a NULL pointer */
        group = kzalloc(sizeof(*group), GFP_KERNEL);
        domain = kzalloc(sizeof(*domain), GFP_KERNEL);
        if (!group || !domain) {
                ret = -ENOMEM;
                goto out_free;
        }

        group->iommu_group = iommu_group;

        /* Determine bus_type in order to allocate a domain */
        ret = iommu_group_for_each_dev(iommu_group, &bus, vfio_bus_type);
        if (ret)
                goto out_free;

        if (vfio_bus_is_mdev(bus)) {
                struct device *iommu_device = NULL;

                group->mdev_group = true;

                /* Determine the isolation type */
                ret = iommu_group_for_each_dev(iommu_group, &iommu_device,
                                               vfio_mdev_iommu_device);
                if (ret || !iommu_device) {
                        /*
                         * No single backing IOMMU device: track the group on
                         * the external domain.  The freshly allocated domain
                         * becomes the external domain if there is none yet,
                         * otherwise it is not needed.
                         */
                        if (!iommu->external_domain) {
                                INIT_LIST_HEAD(&domain->group_list);
                                iommu->external_domain = domain;
                                vfio_update_pgsize_bitmap(iommu);
                        } else {
                                kfree(domain);
                        }

                        list_add(&group->next,
                                 &iommu->external_domain->group_list);
                        /*
                         * Non-iommu backed group cannot dirty memory directly,
                         * it can only use interfaces that provide dirty
                         * tracking.
                         * The iommu scope can only be promoted with the
                         * addition of a dirty tracking group.
                         */
                        group->pinned_page_dirty_scope = true;
                        mutex_unlock(&iommu->lock);

                        return 0;
                }

                /* IOMMU backed mdev: use the bus of the backing device */
                bus = iommu_device->bus;
        }

        domain->domain = iommu_domain_alloc(bus);
        if (!domain->domain) {
                ret = -EIO;
                goto out_free;
        }

        if (iommu->nesting) {
                int attr = 1;

                ret = iommu_domain_set_attr(domain->domain, DOMAIN_ATTR_NESTING,
                                            &attr);
                if (ret)
                        goto out_domain;
        }

        ret = vfio_iommu_attach_group(domain, group);
        if (ret)
                goto out_domain;

        /* Get aperture info */
        iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY, &geo);

        if (vfio_iommu_aper_conflict(iommu, geo.aperture_start,
                                     geo.aperture_end)) {
                ret = -EINVAL;
                goto out_detach;
        }

        ret = iommu_get_group_resv_regions(iommu_group, &group_resv_regions);
        if (ret)
                goto out_detach;

        if (vfio_iommu_resv_conflict(iommu, &group_resv_regions)) {
                ret = -EINVAL;
                goto out_detach;
        }

        /*
         * We don't want to work on the original iova list as the list
         * gets modified and in case of failure we have to retain the
         * original list. Get a copy here.
         */
        ret = vfio_iommu_iova_get_copy(iommu, &iova_copy);
        if (ret)
                goto out_detach;

        ret = vfio_iommu_aper_resize(&iova_copy, geo.aperture_start,
                                     geo.aperture_end);
        if (ret)
                goto out_detach;

        ret = vfio_iommu_resv_exclude(&iova_copy, &group_resv_regions);
        if (ret)
                goto out_detach;

        resv_msi = vfio_iommu_has_sw_msi(&group_resv_regions, &resv_msi_base);

        INIT_LIST_HEAD(&domain->group_list);
        list_add(&group->next, &domain->group_list);

        msi_remap = irq_domain_check_msi_remap() ||
                    iommu_capable(bus, IOMMU_CAP_INTR_REMAP);

        if (!allow_unsafe_interrupts && !msi_remap) {
                pr_warn("%s: No interrupt remapping support.  Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
                       __func__);
                ret = -EPERM;
                goto out_detach;
        }

        if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
                domain->prot |= IOMMU_CACHE;

        /*
         * Try to match an existing compatible domain.  We don't want to
         * preclude an IOMMU driver supporting multiple bus_types and being
         * able to include different bus_types in the same IOMMU domain, so
         * we test whether the domains use the same iommu_ops rather than
         * testing if they're on the same bus_type.
         */
        list_for_each_entry(d, &iommu->domain_list, next) {
                if (d->domain->ops == domain->domain->ops &&
                    d->prot == domain->prot) {
                        vfio_iommu_detach_group(domain, group);
                        if (!vfio_iommu_attach_group(d, group)) {
                                /*
                                 * The group now lives on the existing domain;
                                 * the new domain, whose group_list still
                                 * referenced the group, is discarded.
                                 */
                                list_add(&group->next, &d->group_list);
                                iommu_domain_free(domain->domain);
                                kfree(domain);
                                goto done;
                        }

                        /* Could not move over, go back to the new domain */
                        ret = vfio_iommu_attach_group(domain, group);
                        if (ret)
                                goto out_domain;
                }
        }

        vfio_test_domain_fgsp(domain);

        /* replay mappings on new domains */
        ret = vfio_iommu_replay(iommu, domain);
        if (ret)
                goto out_detach;

        if (resv_msi) {
                /* -ENODEV from the cookie setup is tolerated */
                ret = iommu_get_msi_cookie(domain->domain, resv_msi_base);
                if (ret && ret != -ENODEV)
                        goto out_detach;
        }

        list_add(&domain->next, &iommu->domain_list);
        vfio_update_pgsize_bitmap(iommu);
done:
        /* Delete the old one and insert new iova list */
        vfio_iommu_iova_insert_copy(iommu, &iova_copy);

        /*
         * An iommu backed group can dirty memory directly and therefore
         * demotes the iommu scope until it declares itself dirty tracking
         * capable via the page pinning interface.
         */
        iommu->num_non_pinned_groups++;
        mutex_unlock(&iommu->lock);
        vfio_iommu_resv_free(&group_resv_regions);

        return 0;

        /*
         * Error unwinding, each label falling through to the next:
         * out_detach undoes the group attach, out_domain frees the IOMMU
         * domain plus the iova copy and reserved region list, out_free
         * releases the group and domain allocations and drops the lock.
         */
out_detach:
        vfio_iommu_detach_group(domain, group);
out_domain:
        iommu_domain_free(domain->domain);
        vfio_iommu_iova_free(&iova_copy);
        vfio_iommu_resv_free(&group_resv_regions);
out_free:
        kfree(domain);
        kfree(group);
        mutex_unlock(&iommu->lock);
        return ret;
}
2192
2193 static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2194 {
2195         struct rb_node *node;
2196
2197         while ((node = rb_first(&iommu->dma_list)))
2198                 vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2199 }
2200
2201 static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2202 {
2203         struct rb_node *n, *p;
2204
2205         n = rb_first(&iommu->dma_list);
2206         for (; n; n = rb_next(n)) {
2207                 struct vfio_dma *dma;
2208                 long locked = 0, unlocked = 0;
2209
2210                 dma = rb_entry(n, struct vfio_dma, node);
2211                 unlocked += vfio_unmap_unpin(iommu, dma, false);
2212                 p = rb_first(&dma->pfn_list);
2213                 for (; p; p = rb_next(p)) {
2214                         struct vfio_pfn *vpfn = rb_entry(p, struct vfio_pfn,
2215                                                          node);
2216
2217                         if (!is_invalid_reserved_pfn(vpfn->pfn))
2218                                 locked++;
2219                 }
2220                 vfio_lock_acct(dma, locked - unlocked, true);
2221         }
2222 }
2223
2224 /*
2225  * Called when a domain is removed in detach. It is possible that
2226  * the removed domain decided the iova aperture window. Modify the
2227  * iova aperture with the smallest window among existing domains.
2228  */
2229 static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2230                                    struct list_head *iova_copy)
2231 {
2232         struct vfio_domain *domain;
2233         struct iommu_domain_geometry geo;
2234         struct vfio_iova *node;
2235         dma_addr_t start = 0;
2236         dma_addr_t end = (dma_addr_t)~0;
2237
2238         if (list_empty(iova_copy))
2239                 return;
2240
2241         list_for_each_entry(domain, &iommu->domain_list, next) {
2242                 iommu_domain_get_attr(domain->domain, DOMAIN_ATTR_GEOMETRY,
2243                                       &geo);
2244                 if (geo.aperture_start > start)
2245                         start = geo.aperture_start;
2246                 if (geo.aperture_end < end)
2247                         end = geo.aperture_end;
2248         }
2249
2250         /* Modify aperture limits. The new aper is either same or bigger */
2251         node = list_first_entry(iova_copy, struct vfio_iova, list);
2252         node->start = start;
2253         node = list_last_entry(iova_copy, struct vfio_iova, list);
2254         node->end = end;
2255 }
2256
2257 /*
2258  * Called when a group is detached. The reserved regions for that
2259  * group can be part of valid iova now. But since reserved regions
2260  * may be duplicated among groups, populate the iova valid regions
2261  * list again.
2262  */
2263 static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2264                                    struct list_head *iova_copy)
2265 {
2266         struct vfio_domain *d;
2267         struct vfio_group *g;
2268         struct vfio_iova *node;
2269         dma_addr_t start, end;
2270         LIST_HEAD(resv_regions);
2271         int ret;
2272
2273         if (list_empty(iova_copy))
2274                 return -EINVAL;
2275
2276         list_for_each_entry(d, &iommu->domain_list, next) {
2277                 list_for_each_entry(g, &d->group_list, next) {
2278                         ret = iommu_get_group_resv_regions(g->iommu_group,
2279                                                            &resv_regions);
2280                         if (ret)
2281                                 goto done;
2282                 }
2283         }
2284
2285         node = list_first_entry(iova_copy, struct vfio_iova, list);
2286         start = node->start;
2287         node = list_last_entry(iova_copy, struct vfio_iova, list);
2288         end = node->end;
2289
2290         /* purge the iova list and create new one */
2291         vfio_iommu_iova_free(iova_copy);
2292
2293         ret = vfio_iommu_aper_resize(iova_copy, start, end);
2294         if (ret)
2295                 goto done;
2296
2297         /* Exclude current reserved regions from iova ranges */
2298         ret = vfio_iommu_resv_exclude(iova_copy, &resv_regions);
2299 done:
2300         vfio_iommu_resv_free(&resv_regions);
2301         return ret;
2302 }
2303
/*
 * Detach @iommu_group from the container described by @iommu_data.
 *
 * @iommu_data: the struct vfio_iommu of the container
 * @iommu_group: the group to remove
 *
 * iommu->lock is held for the whole operation.  The group is looked up on
 * the external domain first and then on each domain of iommu->domain_list.
 * A domain whose last group goes away is freed as well, and the valid iova
 * list is recomputed for groups that were attached to an IOMMU domain.
 * Nothing is reported to the caller; a group that is not found results in
 * no group being removed.
 */
static void vfio_iommu_type1_detach_group(void *iommu_data,
                                          struct iommu_group *iommu_group)
{
        struct vfio_iommu *iommu = iommu_data;
        struct vfio_domain *domain;
        struct vfio_group *group;
        bool update_dirty_scope = false;
        LIST_HEAD(iova_copy);

        mutex_lock(&iommu->lock);

        if (iommu->external_domain) {
                group = find_iommu_group(iommu->external_domain, iommu_group);
                if (group) {
                        /* Sample the flag before the group is freed */
                        update_dirty_scope = !group->pinned_page_dirty_scope;
                        list_del(&group->next);
                        kfree(group);

                        if (list_empty(&iommu->external_domain->group_list)) {
                                /*
                                 * Last external group gone: drop all
                                 * mappings when the container has no IOMMU
                                 * capable domain.
                                 */
                                if (!IS_IOMMU_CAP_DOMAIN_IN_CONTAINER(iommu)) {
                                        WARN_ON(iommu->notifier.head);
                                        vfio_iommu_unmap_unpin_all(iommu);
                                }

                                kfree(iommu->external_domain);
                                iommu->external_domain = NULL;
                        }
                        /* The iova list is not touched on this path */
                        goto detach_group_done;
                }
        }

        /*
         * Get a copy of iova list. This will be used to update
         * and to replace the current one later. Please note that
         * we will leave the original list as it is if update fails.
         */
        /*
         * The return value is not checked: after a failed copy iova_copy
         * is empty, which makes vfio_iommu_resv_refresh() below fail and
         * the original list stay in place.
         */
        vfio_iommu_iova_get_copy(iommu, &iova_copy);

        list_for_each_entry(domain, &iommu->domain_list, next) {
                group = find_iommu_group(domain, iommu_group);
                if (!group)
                        continue;

                vfio_iommu_detach_group(domain, group);
                update_dirty_scope = !group->pinned_page_dirty_scope;
                list_del(&group->next);
                kfree(group);
                /*
                 * Group ownership provides privilege, if the group list is
                 * empty, the domain goes away. If it's the last domain with
                 * iommu and external domain doesn't exist, then all the
                 * mappings go away too. If it's the last domain with iommu and
                 * external domain exist, update accounting
                 */
                if (list_empty(&domain->group_list)) {
                        if (list_is_singular(&iommu->domain_list)) {
                                if (!iommu->external_domain) {
                                        WARN_ON(iommu->notifier.head);
                                        vfio_iommu_unmap_unpin_all(iommu);
                                } else {
                                        vfio_iommu_unmap_unpin_reaccount(iommu);
                                }
                        }
                        iommu_domain_free(domain->domain);
                        list_del(&domain->next);
                        kfree(domain);
                        vfio_iommu_aper_expand(iommu, &iova_copy);
                        vfio_update_pgsize_bitmap(iommu);
                }
                /* domain may have been freed above, stop iterating */
                break;
        }

        /* Install the refreshed iova list, or discard the copy on failure */
        if (!vfio_iommu_resv_refresh(iommu, &iova_copy))
                vfio_iommu_iova_insert_copy(iommu, &iova_copy);
        else
                vfio_iommu_iova_free(&iova_copy);

detach_group_done:
        /*
         * Removal of a group without dirty tracking may allow the iommu scope
         * to be promoted.
         */
        if (update_dirty_scope) {
                iommu->num_non_pinned_groups--;
                if (iommu->dirty_page_tracking)
                        vfio_iommu_populate_bitmap_full(iommu);
        }
        mutex_unlock(&iommu->lock);
}
2393
2394 static void *vfio_iommu_type1_open(unsigned long arg)
2395 {
2396         struct vfio_iommu *iommu;
2397
2398         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
2399         if (!iommu)
2400                 return ERR_PTR(-ENOMEM);
2401
2402         switch (arg) {
2403         case VFIO_TYPE1_IOMMU:
2404                 break;
2405         case VFIO_TYPE1_NESTING_IOMMU:
2406                 iommu->nesting = true;
2407                 fallthrough;
2408         case VFIO_TYPE1v2_IOMMU:
2409                 iommu->v2 = true;
2410                 break;
2411         default:
2412                 kfree(iommu);
2413                 return ERR_PTR(-EINVAL);
2414         }
2415
2416         INIT_LIST_HEAD(&iommu->domain_list);
2417         INIT_LIST_HEAD(&iommu->iova_list);
2418         iommu->dma_list = RB_ROOT;
2419         iommu->dma_avail = dma_entry_limit;
2420         mutex_init(&iommu->lock);
2421         BLOCKING_INIT_NOTIFIER_HEAD(&iommu->notifier);
2422
2423         return iommu;
2424 }
2425
2426 static void vfio_release_domain(struct vfio_domain *domain, bool external)
2427 {
2428         struct vfio_group *group, *group_tmp;
2429
2430         list_for_each_entry_safe(group, group_tmp,
2431                                  &domain->group_list, next) {
2432                 if (!external)
2433                         vfio_iommu_detach_group(domain, group);
2434                 list_del(&group->next);
2435                 kfree(group);
2436         }
2437
2438         if (!external)
2439                 iommu_domain_free(domain->domain);
2440 }
2441
2442 static void vfio_iommu_type1_release(void *iommu_data)
2443 {
2444         struct vfio_iommu *iommu = iommu_data;
2445         struct vfio_domain *domain, *domain_tmp;
2446
2447         if (iommu->external_domain) {
2448                 vfio_release_domain(iommu->external_domain, true);
2449                 kfree(iommu->external_domain);
2450         }
2451
2452         vfio_iommu_unmap_unpin_all(iommu);
2453
2454         list_for_each_entry_safe(domain, domain_tmp,
2455                                  &iommu->domain_list, next) {
2456                 vfio_release_domain(domain, false);
2457                 list_del(&domain->next);
2458                 kfree(domain);
2459         }
2460
2461         vfio_iommu_iova_free(&iommu->iova_list);
2462
2463         kfree(iommu);
2464 }
2465
2466 static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
2467 {
2468         struct vfio_domain *domain;
2469         int ret = 1;
2470
2471         mutex_lock(&iommu->lock);
2472         list_for_each_entry(domain, &iommu->domain_list, next) {
2473                 if (!(domain->prot & IOMMU_CACHE)) {
2474                         ret = 0;
2475                         break;
2476                 }
2477         }
2478         mutex_unlock(&iommu->lock);
2479
2480         return ret;
2481 }
2482
2483 static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2484                                             unsigned long arg)
2485 {
2486         switch (arg) {
2487         case VFIO_TYPE1_IOMMU:
2488         case VFIO_TYPE1v2_IOMMU:
2489         case VFIO_TYPE1_NESTING_IOMMU:
2490                 return 1;
2491         case VFIO_DMA_CC_IOMMU:
2492                 if (!iommu)
2493                         return 0;
2494                 return vfio_domains_have_iommu_cache(iommu);
2495         default:
2496                 return 0;
2497         }
2498 }
2499
2500 static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2501                  struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2502                  size_t size)
2503 {
2504         struct vfio_info_cap_header *header;
2505         struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2506
2507         header = vfio_info_cap_add(caps, size,
2508                                    VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, 1);
2509         if (IS_ERR(header))
2510                 return PTR_ERR(header);
2511
2512         iova_cap = container_of(header,
2513                                 struct vfio_iommu_type1_info_cap_iova_range,
2514                                 header);
2515         iova_cap->nr_iovas = cap_iovas->nr_iovas;
2516         memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2517                cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2518         return 0;
2519 }
2520
2521 static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2522                                       struct vfio_info_cap *caps)
2523 {
2524         struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2525         struct vfio_iova *iova;
2526         size_t size;
2527         int iovas = 0, i = 0, ret;
2528
2529         list_for_each_entry(iova, &iommu->iova_list, list)
2530                 iovas++;
2531
2532         if (!iovas) {
2533                 /*
2534                  * Return 0 as a container with a single mdev device
2535                  * will have an empty list
2536                  */
2537                 return 0;
2538         }
2539
2540         size = sizeof(*cap_iovas) + (iovas * sizeof(*cap_iovas->iova_ranges));
2541
2542         cap_iovas = kzalloc(size, GFP_KERNEL);
2543         if (!cap_iovas)
2544                 return -ENOMEM;
2545
2546         cap_iovas->nr_iovas = iovas;
2547
2548         list_for_each_entry(iova, &iommu->iova_list, list) {
2549                 cap_iovas->iova_ranges[i].start = iova->start;
2550                 cap_iovas->iova_ranges[i].end = iova->end;
2551                 i++;
2552         }
2553
2554         ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2555
2556         kfree(cap_iovas);
2557         return ret;
2558 }
2559
2560 static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2561                                            struct vfio_info_cap *caps)
2562 {
2563         struct vfio_iommu_type1_info_cap_migration cap_mig;
2564
2565         cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2566         cap_mig.header.version = 1;
2567
2568         cap_mig.flags = 0;
2569         /* support minimum pgsize */
2570         cap_mig.pgsize_bitmap = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2571         cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2572
2573         return vfio_info_add_capability(caps, &cap_mig.header, sizeof(cap_mig));
2574 }
2575
2576 static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2577                                            struct vfio_info_cap *caps)
2578 {
2579         struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2580
2581         cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2582         cap_dma_avail.header.version = 1;
2583
2584         cap_dma_avail.avail = iommu->dma_avail;
2585
2586         return vfio_info_add_capability(caps, &cap_dma_avail.header,
2587                                         sizeof(cap_dma_avail));
2588 }
2589
2590 static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2591                                      unsigned long arg)
2592 {
2593         struct vfio_iommu_type1_info info;
2594         unsigned long minsz;
2595         struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
2596         unsigned long capsz;
2597         int ret;
2598
2599         minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2600
2601         /* For backward compatibility, cannot require this */
2602         capsz = offsetofend(struct vfio_iommu_type1_info, cap_offset);
2603
2604         if (copy_from_user(&info, (void __user *)arg, minsz))
2605                 return -EFAULT;
2606
2607         if (info.argsz < minsz)
2608                 return -EINVAL;
2609
2610         if (info.argsz >= capsz) {
2611                 minsz = capsz;
2612                 info.cap_offset = 0; /* output, no-recopy necessary */
2613         }
2614
2615         mutex_lock(&iommu->lock);
2616         info.flags = VFIO_IOMMU_INFO_PGSIZES;
2617
2618         info.iova_pgsizes = iommu->pgsize_bitmap;
2619
2620         ret = vfio_iommu_migration_build_caps(iommu, &caps);
2621
2622         if (!ret)
2623                 ret = vfio_iommu_dma_avail_build_caps(iommu, &caps);
2624
2625         if (!ret)
2626                 ret = vfio_iommu_iova_build_caps(iommu, &caps);
2627
2628         mutex_unlock(&iommu->lock);
2629
2630         if (ret)
2631                 return ret;
2632
2633         if (caps.size) {
2634                 info.flags |= VFIO_IOMMU_INFO_CAPS;
2635
2636                 if (info.argsz < sizeof(info) + caps.size) {
2637                         info.argsz = sizeof(info) + caps.size;
2638                 } else {
2639                         vfio_info_cap_shift(&caps, sizeof(info));
2640                         if (copy_to_user((void __user *)arg +
2641                                         sizeof(info), caps.buf,
2642                                         caps.size)) {
2643                                 kfree(caps.buf);
2644                                 return -EFAULT;
2645                         }
2646                         info.cap_offset = sizeof(info);
2647                 }
2648
2649                 kfree(caps.buf);
2650         }
2651
2652         return copy_to_user((void __user *)arg, &info, minsz) ?
2653                         -EFAULT : 0;
2654 }
2655
2656 static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2657                                     unsigned long arg)
2658 {
2659         struct vfio_iommu_type1_dma_map map;
2660         unsigned long minsz;
2661         uint32_t mask = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
2662
2663         minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2664
2665         if (copy_from_user(&map, (void __user *)arg, minsz))
2666                 return -EFAULT;
2667
2668         if (map.argsz < minsz || map.flags & ~mask)
2669                 return -EINVAL;
2670
2671         return vfio_dma_do_map(iommu, &map);
2672 }
2673
2674 static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2675                                       unsigned long arg)
2676 {
2677         struct vfio_iommu_type1_dma_unmap unmap;
2678         struct vfio_bitmap bitmap = { 0 };
2679         unsigned long minsz;
2680         int ret;
2681
2682         minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2683
2684         if (copy_from_user(&unmap, (void __user *)arg, minsz))
2685                 return -EFAULT;
2686
2687         if (unmap.argsz < minsz ||
2688             unmap.flags & ~VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP)
2689                 return -EINVAL;
2690
2691         if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2692                 unsigned long pgshift;
2693
2694                 if (unmap.argsz < (minsz + sizeof(bitmap)))
2695                         return -EINVAL;
2696
2697                 if (copy_from_user(&bitmap,
2698                                    (void __user *)(arg + minsz),
2699                                    sizeof(bitmap)))
2700                         return -EFAULT;
2701
2702                 if (!access_ok((void __user *)bitmap.data, bitmap.size))
2703                         return -EINVAL;
2704
2705                 pgshift = __ffs(bitmap.pgsize);
2706                 ret = verify_bitmap_size(unmap.size >> pgshift,
2707                                          bitmap.size);
2708                 if (ret)
2709                         return ret;
2710         }
2711
2712         ret = vfio_dma_do_unmap(iommu, &unmap, &bitmap);
2713         if (ret)
2714                 return ret;
2715
2716         return copy_to_user((void __user *)arg, &unmap, minsz) ?
2717                         -EFAULT : 0;
2718 }
2719
2720 static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2721                                         unsigned long arg)
2722 {
2723         struct vfio_iommu_type1_dirty_bitmap dirty;
2724         uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START |
2725                         VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP |
2726                         VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2727         unsigned long minsz;
2728         int ret = 0;
2729
2730         if (!iommu->v2)
2731                 return -EACCES;
2732
2733         minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2734
2735         if (copy_from_user(&dirty, (void __user *)arg, minsz))
2736                 return -EFAULT;
2737
2738         if (dirty.argsz < minsz || dirty.flags & ~mask)
2739                 return -EINVAL;
2740
2741         /* only one flag should be set at a time */
2742         if (__ffs(dirty.flags) != __fls(dirty.flags))
2743                 return -EINVAL;
2744
2745         if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2746                 size_t pgsize;
2747
2748                 mutex_lock(&iommu->lock);
2749                 pgsize = 1 << __ffs(iommu->pgsize_bitmap);
2750                 if (!iommu->dirty_page_tracking) {
2751                         ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2752                         if (!ret)
2753                                 iommu->dirty_page_tracking = true;
2754                 }
2755                 mutex_unlock(&iommu->lock);
2756                 return ret;
2757         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2758                 mutex_lock(&iommu->lock);
2759                 if (iommu->dirty_page_tracking) {
2760                         iommu->dirty_page_tracking = false;
2761                         vfio_dma_bitmap_free_all(iommu);
2762                 }
2763                 mutex_unlock(&iommu->lock);
2764                 return 0;
2765         } else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2766                 struct vfio_iommu_type1_dirty_bitmap_get range;
2767                 unsigned long pgshift;
2768                 size_t data_size = dirty.argsz - minsz;
2769                 size_t iommu_pgsize;
2770
2771                 if (!data_size || data_size < sizeof(range))
2772                         return -EINVAL;
2773
2774                 if (copy_from_user(&range, (void __user *)(arg + minsz),
2775                                    sizeof(range)))
2776                         return -EFAULT;
2777
2778                 if (range.iova + range.size < range.iova)
2779                         return -EINVAL;
2780                 if (!access_ok((void __user *)range.bitmap.data,
2781                                range.bitmap.size))
2782                         return -EINVAL;
2783
2784                 pgshift = __ffs(range.bitmap.pgsize);
2785                 ret = verify_bitmap_size(range.size >> pgshift,
2786                                          range.bitmap.size);
2787                 if (ret)
2788                         return ret;
2789
2790                 mutex_lock(&iommu->lock);
2791
2792                 iommu_pgsize = (size_t)1 << __ffs(iommu->pgsize_bitmap);
2793
2794                 /* allow only smallest supported pgsize */
2795                 if (range.bitmap.pgsize != iommu_pgsize) {
2796                         ret = -EINVAL;
2797                         goto out_unlock;
2798                 }
2799                 if (range.iova & (iommu_pgsize - 1)) {
2800                         ret = -EINVAL;
2801                         goto out_unlock;
2802                 }
2803                 if (!range.size || range.size & (iommu_pgsize - 1)) {
2804                         ret = -EINVAL;
2805                         goto out_unlock;
2806                 }
2807
2808                 if (iommu->dirty_page_tracking)
2809                         ret = vfio_iova_dirty_bitmap(range.bitmap.data,
2810                                                      iommu, range.iova,
2811                                                      range.size,
2812                                                      range.bitmap.pgsize);
2813                 else
2814                         ret = -EINVAL;
2815 out_unlock:
2816                 mutex_unlock(&iommu->lock);
2817
2818                 return ret;
2819         }
2820
2821         return -EINVAL;
2822 }
2823
2824 static long vfio_iommu_type1_ioctl(void *iommu_data,
2825                                    unsigned int cmd, unsigned long arg)
2826 {
2827         struct vfio_iommu *iommu = iommu_data;
2828
2829         switch (cmd) {
2830         case VFIO_CHECK_EXTENSION:
2831                 return vfio_iommu_type1_check_extension(iommu, arg);
2832         case VFIO_IOMMU_GET_INFO:
2833                 return vfio_iommu_type1_get_info(iommu, arg);
2834         case VFIO_IOMMU_MAP_DMA:
2835                 return vfio_iommu_type1_map_dma(iommu, arg);
2836         case VFIO_IOMMU_UNMAP_DMA:
2837                 return vfio_iommu_type1_unmap_dma(iommu, arg);
2838         case VFIO_IOMMU_DIRTY_PAGES:
2839                 return vfio_iommu_type1_dirty_pages(iommu, arg);
2840         default:
2841                 return -ENOTTY;
2842         }
2843 }
2844
2845 static int vfio_iommu_type1_register_notifier(void *iommu_data,
2846                                               unsigned long *events,
2847                                               struct notifier_block *nb)
2848 {
2849         struct vfio_iommu *iommu = iommu_data;
2850
2851         /* clear known events */
2852         *events &= ~VFIO_IOMMU_NOTIFY_DMA_UNMAP;
2853
2854         /* refuse to register if still events remaining */
2855         if (*events)
2856                 return -EINVAL;
2857
2858         return blocking_notifier_chain_register(&iommu->notifier, nb);
2859 }
2860
2861 static int vfio_iommu_type1_unregister_notifier(void *iommu_data,
2862                                                 struct notifier_block *nb)
2863 {
2864         struct vfio_iommu *iommu = iommu_data;
2865
2866         return blocking_notifier_chain_unregister(&iommu->notifier, nb);
2867 }
2868
2869 static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
2870                                          dma_addr_t user_iova, void *data,
2871                                          size_t count, bool write,
2872                                          size_t *copied)
2873 {
2874         struct mm_struct *mm;
2875         unsigned long vaddr;
2876         struct vfio_dma *dma;
2877         bool kthread = current->mm == NULL;
2878         size_t offset;
2879
2880         *copied = 0;
2881
2882         dma = vfio_find_dma(iommu, user_iova, 1);
2883         if (!dma)
2884                 return -EINVAL;
2885
2886         if ((write && !(dma->prot & IOMMU_WRITE)) ||
2887                         !(dma->prot & IOMMU_READ))
2888                 return -EPERM;
2889
2890         mm = get_task_mm(dma->task);
2891
2892         if (!mm)
2893                 return -EPERM;
2894
2895         if (kthread)
2896                 kthread_use_mm(mm);
2897         else if (current->mm != mm)
2898                 goto out;
2899
2900         offset = user_iova - dma->iova;
2901
2902         if (count > dma->size - offset)
2903                 count = dma->size - offset;
2904
2905         vaddr = dma->vaddr + offset;
2906
2907         if (write) {
2908                 *copied = copy_to_user((void __user *)vaddr, data,
2909                                          count) ? 0 : count;
2910                 if (*copied && iommu->dirty_page_tracking) {
2911                         unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
2912                         /*
2913                          * Bitmap populated with the smallest supported page
2914                          * size
2915                          */
2916                         bitmap_set(dma->bitmap, offset >> pgshift,
2917                                    ((offset + *copied - 1) >> pgshift) -
2918                                    (offset >> pgshift) + 1);
2919                 }
2920         } else
2921                 *copied = copy_from_user(data, (void __user *)vaddr,
2922                                            count) ? 0 : count;
2923         if (kthread)
2924                 kthread_unuse_mm(mm);
2925 out:
2926         mmput(mm);
2927         return *copied ? 0 : -EFAULT;
2928 }
2929
2930 static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
2931                                    void *data, size_t count, bool write)
2932 {
2933         struct vfio_iommu *iommu = iommu_data;
2934         int ret = 0;
2935         size_t done;
2936
2937         mutex_lock(&iommu->lock);
2938         while (count > 0) {
2939                 ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
2940                                                     count, write, &done);
2941                 if (ret)
2942                         break;
2943
2944                 count -= done;
2945                 data += done;
2946                 user_iova += done;
2947         }
2948
2949         mutex_unlock(&iommu->lock);
2950         return ret;
2951 }
2952
2953 static struct iommu_domain *
2954 vfio_iommu_type1_group_iommu_domain(void *iommu_data,
2955                                     struct iommu_group *iommu_group)
2956 {
2957         struct iommu_domain *domain = ERR_PTR(-ENODEV);
2958         struct vfio_iommu *iommu = iommu_data;
2959         struct vfio_domain *d;
2960
2961         if (!iommu || !iommu_group)
2962                 return ERR_PTR(-EINVAL);
2963
2964         mutex_lock(&iommu->lock);
2965         list_for_each_entry(d, &iommu->domain_list, next) {
2966                 if (find_iommu_group(d, iommu_group)) {
2967                         domain = d->domain;
2968                         break;
2969                 }
2970         }
2971         mutex_unlock(&iommu->lock);
2972
2973         return domain;
2974 }
2975
2976 static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
2977         .name                   = "vfio-iommu-type1",
2978         .owner                  = THIS_MODULE,
2979         .open                   = vfio_iommu_type1_open,
2980         .release                = vfio_iommu_type1_release,
2981         .ioctl                  = vfio_iommu_type1_ioctl,
2982         .attach_group           = vfio_iommu_type1_attach_group,
2983         .detach_group           = vfio_iommu_type1_detach_group,
2984         .pin_pages              = vfio_iommu_type1_pin_pages,
2985         .unpin_pages            = vfio_iommu_type1_unpin_pages,
2986         .register_notifier      = vfio_iommu_type1_register_notifier,
2987         .unregister_notifier    = vfio_iommu_type1_unregister_notifier,
2988         .dma_rw                 = vfio_iommu_type1_dma_rw,
2989         .group_iommu_domain     = vfio_iommu_type1_group_iommu_domain,
2990 };
2991
2992 static int __init vfio_iommu_type1_init(void)
2993 {
2994         return vfio_register_iommu_driver(&vfio_iommu_driver_ops_type1);
2995 }
2996
2997 static void __exit vfio_iommu_type1_cleanup(void)
2998 {
2999         vfio_unregister_iommu_driver(&vfio_iommu_driver_ops_type1);
3000 }
3001
/* Hook the init and exit handlers into module load and unload. */
module_init(vfio_iommu_type1_init);
module_exit(vfio_iommu_type1_cleanup);

/* Module metadata, taken from the DRIVER_* macros at the top of the file. */
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);