iommufd: Add IOMMU_IOAS_CHANGE_PROCESS
author    Steve Sistare <steven.sistare@oracle.com>
          Wed, 13 Nov 2024 19:51:36 +0000 (11:51 -0800)
committer Jason Gunthorpe <jgg@nvidia.com>
          Thu, 14 Nov 2024 16:57:13 +0000 (12:57 -0400)
Add an ioctl that updates all DMA mappings to reflect the current process:
change the mm and transfer locked memory accounting from the old mm to the
current one.
This will be used for live update, allowing an old process to hand the
iommufd device descriptor to a new process.  The new process calls the
ioctl.

IOMMU_IOAS_CHANGE_PROCESS only supports DMA mappings created with
IOMMU_IOAS_MAP_FILE, because the kernel metadata for such mappings does
not depend on the userland VA of the pages (which is different in the new
process).
IOMMU_IOAS_CHANGE_PROCESS fails if other types of mappings are present.
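
For illustration, a minimal sketch of the new process's side of the handoff
(not part of this patch; it assumes the iommufd descriptor has already been
received over a UNIX socket via SCM_RIGHTS):

  #include <sys/ioctl.h>
  #include <linux/iommufd.h>

  /* New process: take over accounting for an inherited iommufd */
  int take_over_iommufd(int iommufd)
  {
          struct iommu_ioas_change_process cmd = {
                  .size = sizeof(cmd),
          };

          /* Moves all pinned-page accounting to this process's mm */
          return ioctl(iommufd, IOMMU_IOAS_CHANGE_PROCESS, &cmd);
  }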

This is a revised version of code originally provided by Jason.

Link: https://patch.msgid.link/r/1731527497-16091-4-git-send-email-steven.sistare@oracle.com
Suggested-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Steve Sistare <steven.sistare@oracle.com>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Reviewed-by: Kevin Tian <kevin.tian@intel.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
drivers/iommu/iommufd/io_pagetable.h
drivers/iommu/iommufd/ioas.c
drivers/iommu/iommufd/iommufd_private.h
drivers/iommu/iommufd/main.c
include/uapi/linux/iommufd.h

diff --git a/drivers/iommu/iommufd/io_pagetable.h b/drivers/iommu/iommufd/io_pagetable.h
index f5f20fa..10c928a 100644
--- a/drivers/iommu/iommufd/io_pagetable.h
+++ b/drivers/iommu/iommufd/io_pagetable.h
@@ -173,6 +173,7 @@ enum {
        IOPT_PAGES_ACCOUNT_NONE = 0,
        IOPT_PAGES_ACCOUNT_USER = 1,
        IOPT_PAGES_ACCOUNT_MM = 2,
+       IOPT_PAGES_ACCOUNT_MODE_NUM = 3,
 };
 
 enum iopt_address_type {
diff --git a/drivers/iommu/iommufd/ioas.c b/drivers/iommu/iommufd/ioas.c
index c82ed5a..1542c5f 100644
--- a/drivers/iommu/iommufd/ioas.c
+++ b/drivers/iommu/iommufd/ioas.c
@@ -439,6 +439,153 @@ static int iommufd_take_all_iova_rwsem(struct iommufd_ctx *ictx,
        return 0;
 }
 
+static bool need_charge_update(struct iopt_pages *pages)
+{
+       switch (pages->account_mode) {
+       case IOPT_PAGES_ACCOUNT_NONE:
+               return false;
+       case IOPT_PAGES_ACCOUNT_MM:
+               return pages->source_mm != current->mm;
+       case IOPT_PAGES_ACCOUNT_USER:
+               /*
+                * Update when mm changes because it also accounts
+                * in mm->pinned_vm.
+                */
+               return (pages->source_user != current_user()) ||
+                      (pages->source_mm != current->mm);
+       }
+       return true;
+}
+
+static int charge_current(unsigned long *npinned)
+{
+       struct iopt_pages tmp = {
+               .source_mm = current->mm,
+               .source_task = current->group_leader,
+               .source_user = current_user(),
+       };
+       unsigned int account_mode;
+       int rc;
+
+       for (account_mode = 0; account_mode != IOPT_PAGES_ACCOUNT_MODE_NUM;
+            account_mode++) {
+               if (!npinned[account_mode])
+                       continue;
+
+               tmp.account_mode = account_mode;
+               rc = iopt_pages_update_pinned(&tmp, npinned[account_mode], true,
+                                             NULL);
+               if (rc)
+                       goto err_undo;
+       }
+       return 0;
+
+err_undo:
+       while (account_mode != 0) {
+               account_mode--;
+               if (!npinned[account_mode])
+                       continue;
+               tmp.account_mode = account_mode;
+               iopt_pages_update_pinned(&tmp, npinned[account_mode], false,
+                                        NULL);
+       }
+       return rc;
+}
+
+static void change_mm(struct iopt_pages *pages)
+{
+       struct task_struct *old_task = pages->source_task;
+       struct user_struct *old_user = pages->source_user;
+       struct mm_struct *old_mm = pages->source_mm;
+
+       pages->source_mm = current->mm;
+       mmgrab(pages->source_mm);
+       mmdrop(old_mm);
+
+       pages->source_task = current->group_leader;
+       get_task_struct(pages->source_task);
+       put_task_struct(old_task);
+
+       pages->source_user = get_uid(current_user());
+       free_uid(old_user);
+}
+
+#define for_each_ioas_area(_xa, _index, _ioas, _area) \
+       xa_for_each((_xa), (_index), (_ioas)) \
+               for (_area = iopt_area_iter_first(&_ioas->iopt, 0, ULONG_MAX); \
+                    _area; \
+                    _area = iopt_area_iter_next(_area, 0, ULONG_MAX))
+
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd)
+{
+       struct iommu_ioas_change_process *cmd = ucmd->cmd;
+       struct iommufd_ctx *ictx = ucmd->ictx;
+       unsigned long all_npinned[IOPT_PAGES_ACCOUNT_MODE_NUM] = {};
+       struct iommufd_ioas *ioas;
+       struct iopt_area *area;
+       struct iopt_pages *pages;
+       struct xarray ioas_list;
+       unsigned long index;
+       int rc;
+
+       if (cmd->__reserved)
+               return -EOPNOTSUPP;
+
+       xa_init(&ioas_list);
+       rc = iommufd_take_all_iova_rwsem(ictx, &ioas_list);
+       if (rc)
+               return rc;
+
+       for_each_ioas_area(&ioas_list, index, ioas, area) {
+               if (area->pages->type != IOPT_ADDRESS_FILE) {
+                       rc = -EINVAL;
+                       goto out;
+               }
+       }
+
+       /*
+        * Count last_pinned pages, then clear it to avoid double counting
+        * if the same iopt_pages is visited multiple times in this loop.
+        * Since we are under all the locks, npinned == last_npinned, so we
+        * can easily restore last_npinned before we return.
+        */
+       for_each_ioas_area(&ioas_list, index, ioas, area) {
+               pages = area->pages;
+
+               if (need_charge_update(pages)) {
+                       all_npinned[pages->account_mode] += pages->last_npinned;
+                       pages->last_npinned = 0;
+               }
+       }
+
+       rc = charge_current(all_npinned);
+
+       if (rc) {
+               /* Charge failed.  Fix last_npinned and bail. */
+               for_each_ioas_area(&ioas_list, index, ioas, area)
+                       area->pages->last_npinned = area->pages->npinned;
+               goto out;
+       }
+
+       for_each_ioas_area(&ioas_list, index, ioas, area) {
+               pages = area->pages;
+
+               /* Uncharge the old one (which also restores last_npinned) */
+               if (need_charge_update(pages)) {
+                       int r = iopt_pages_update_pinned(pages, pages->npinned,
+                                                        false, NULL);
+
+                       if (WARN_ON(r))
+                               rc = r;
+               }
+               change_mm(pages);
+       }
+
+out:
+       iommufd_release_all_iova_rwsem(ictx, &ioas_list);
+       return rc;
+}
+
 int iommufd_option_rlimit_mode(struct iommu_option *cmd,
                               struct iommufd_ctx *ictx)
 {
diff --git a/drivers/iommu/iommufd/iommufd_private.h b/drivers/iommu/iommufd/iommufd_private.h
index 57c0c8f..b6d706c 100644
--- a/drivers/iommu/iommufd/iommufd_private.h
+++ b/drivers/iommu/iommufd/iommufd_private.h
@@ -255,6 +255,7 @@ int iommufd_ioas_iova_ranges(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_allow_iovas(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_map(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_map_file(struct iommufd_ucmd *ucmd);
+int iommufd_ioas_change_process(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_copy(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_unmap(struct iommufd_ucmd *ucmd);
 int iommufd_ioas_option(struct iommufd_ucmd *ucmd);
diff --git a/drivers/iommu/iommufd/main.c b/drivers/iommu/iommufd/main.c
index 13ac228..0a96cc8 100644
--- a/drivers/iommu/iommufd/main.c
+++ b/drivers/iommu/iommufd/main.c
@@ -349,6 +349,8 @@ static const struct iommufd_ioctl_op iommufd_ioctl_ops[] = {
                 struct iommu_ioas_alloc, out_ioas_id),
        IOCTL_OP(IOMMU_IOAS_ALLOW_IOVAS, iommufd_ioas_allow_iovas,
                 struct iommu_ioas_allow_iovas, allowed_iovas),
+       IOCTL_OP(IOMMU_IOAS_CHANGE_PROCESS, iommufd_ioas_change_process,
+                struct iommu_ioas_change_process, __reserved),
        IOCTL_OP(IOMMU_IOAS_COPY, iommufd_ioas_copy, struct iommu_ioas_copy,
                 src_iova),
        IOCTL_OP(IOMMU_IOAS_IOVA_RANGES, iommufd_ioas_iova_ranges,
diff --git a/include/uapi/linux/iommufd.h b/include/uapi/linux/iommufd.h
index 747d3d9..4ae8b1e 100644
--- a/include/uapi/linux/iommufd.h
+++ b/include/uapi/linux/iommufd.h
@@ -54,6 +54,7 @@ enum {
        IOMMUFD_CMD_IOAS_MAP_FILE = 0x8f,
        IOMMUFD_CMD_VIOMMU_ALLOC = 0x90,
        IOMMUFD_CMD_VDEVICE_ALLOC = 0x91,
+       IOMMUFD_CMD_IOAS_CHANGE_PROCESS = 0x92,
 };
 
 /**
@@ -972,4 +973,26 @@ struct iommu_vdevice_alloc {
        __aligned_u64 virt_id;
 };
 #define IOMMU_VDEVICE_ALLOC _IO(IOMMUFD_TYPE, IOMMUFD_CMD_VDEVICE_ALLOC)
+
+/**
+ * struct iommu_ioas_change_process - ioctl(IOMMU_IOAS_CHANGE_PROCESS)
+ * @size: sizeof(struct iommu_ioas_change_process)
+ * @__reserved: Must be 0
+ *
+ * This transfers pinned memory counts for every memory map in every IOAS
+ * in the context to the current process.  This only supports maps created
+ * with IOMMU_IOAS_MAP_FILE, and returns EINVAL if other maps are present.
+ * If the ioctl returns a failure status, then nothing is changed.
+ *
+ * This API is useful for transferring operation of a device from one process
+ * to another, such as during userland live update.
+ */
+struct iommu_ioas_change_process {
+       __u32 size;
+       __u32 __reserved;
+};
+
+#define IOMMU_IOAS_CHANGE_PROCESS \
+       _IO(IOMMUFD_TYPE, IOMMUFD_CMD_IOAS_CHANGE_PROCESS)
+
 #endif
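
As a complement to the kernel-doc above, a hedged sketch of the old process's
side: only file-backed mappings created with IOMMU_IOAS_MAP_FILE (added earlier
in this series) survive the process change, so a handoff-capable mapping would
look roughly like this (the iommufd, ioas_id, memfd, iova, and len values are
assumed to be set up elsewhere):

  #include <sys/ioctl.h>
  #include <linux/iommufd.h>

  /* Old process: create a mapping that CHANGE_PROCESS can re-account */
  static int map_for_handoff(int iommufd, __u32 ioas_id, int memfd,
                             __u64 iova, __u64 len)
  {
          struct iommu_ioas_map_file map = {
                  .size = sizeof(map),
                  .flags = IOMMU_IOAS_MAP_FIXED_IOVA |
                           IOMMU_IOAS_MAP_READABLE |
                           IOMMU_IOAS_MAP_WRITEABLE,
                  .ioas_id = ioas_id,
                  .fd = memfd,            /* e.g. from memfd_create() */
                  .start = 0,
                  .length = len,
                  .iova = iova,
          };

          return ioctl(iommufd, IOMMU_IOAS_MAP_FILE, &map);
  }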