vduse: Support registering userspace memory for IOVA regions
author Xie Yongji <xieyongji@bytedance.com>
Wed, 3 Aug 2022 04:55:22 +0000 (12:55 +0800)
committer Michael S. Tsirkin <mst@redhat.com>
Thu, 11 Aug 2022 08:26:08 +0000 (04:26 -0400)
Introduce two ioctls: VDUSE_IOTLB_REG_UMEM and
VDUSE_IOTLB_DEREG_UMEM to support registering
and de-registering userspace memory for IOVA
regions.

For now, it only supports registering userspace
memory for the bounce buffer region in the
virtio-vdpa case.
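
As an illustration only (not part of the patch), a minimal userspace
sketch of the intended flow follows. The device name, BOUNCE_SIZE and
error handling here are assumptions: the size must match the device's
actual bounce buffer size (64 MiB is the current VDUSE default),
registration only succeeds while the bounce map is in use (i.e. the
virtio-vdpa case), and pinning the pages is accounted against
RLIMIT_MEMLOCK. Explicit teardown is sketched after the first diff
below.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vduse.h>	/* assumes a uapi header with these ioctls */

#define BOUNCE_SIZE (64UL << 20)	/* assumption: the device's bounce size */

int main(void)
{
	struct vduse_iova_umem umem;
	void *buf;
	int fd;

	/* hypothetical device name; created via /dev/vduse/control beforehand */
	fd = open("/dev/vduse/vduse-test", O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* uaddr must be page-aligned, so allocate with mmap() */
	buf = mmap(NULL, BOUNCE_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	memset(&umem, 0, sizeof(umem));	/* reserved[] must be zero */
	umem.uaddr = (uintptr_t)buf;
	umem.iova = 0;			/* only the bounce region (iova 0) is supported */
	umem.size = BOUNCE_SIZE;

	/* fails with ENOMEM if it would exceed RLIMIT_MEMLOCK */
	if (ioctl(fd, VDUSE_IOTLB_REG_UMEM, &umem)) {
		perror("VDUSE_IOTLB_REG_UMEM");
		return 1;
	}

	/* ... bounce pages are now backed by buf; serve I/O as usual ... */

	close(fd);	/* releasing the fd also deregisters the memory */
	return 0;
}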

Signed-off-by: Xie Yongji <xieyongji@bytedance.com>
Acked-by: Jason Wang <jasowang@redhat.com>
Message-Id: <20220803045523.23851-5-xieyongji@bytedance.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
drivers/vdpa/vdpa_user/vduse_dev.c
include/uapi/linux/vduse.h

diff --git a/drivers/vdpa/vdpa_user/vduse_dev.c b/drivers/vdpa/vdpa_user/vduse_dev.c
index 3bc27de..eedff0a 100644
--- a/drivers/vdpa/vdpa_user/vduse_dev.c
+++ b/drivers/vdpa/vdpa_user/vduse_dev.c
@@ -21,6 +21,8 @@
 #include <linux/uio.h>
 #include <linux/vdpa.h>
 #include <linux/nospec.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
 #include <uapi/linux/vduse.h>
 #include <uapi/linux/vdpa.h>
 #include <uapi/linux/virtio_config.h>
@@ -64,6 +66,13 @@ struct vduse_vdpa {
        struct vduse_dev *dev;
 };
 
+struct vduse_umem {
+       unsigned long iova;
+       unsigned long npages;
+       struct page **pages;
+       struct mm_struct *mm;
+};
+
 struct vduse_dev {
        struct vduse_vdpa *vdev;
        struct device *dev;
@@ -95,6 +104,8 @@ struct vduse_dev {
        u8 status;
        u32 vq_num;
        u32 vq_align;
+       struct vduse_umem *umem;
+       struct mutex mem_lock;
 };
 
 struct vduse_dev_msg {
@@ -917,6 +928,102 @@ unlock:
        return ret;
 }
 
+static int vduse_dev_dereg_umem(struct vduse_dev *dev,
+                               u64 iova, u64 size)
+{
+       int ret;
+
+       mutex_lock(&dev->mem_lock);
+       ret = -ENOENT;
+       if (!dev->umem)
+               goto unlock;
+
+       ret = -EINVAL;
+       if (dev->umem->iova != iova || size != dev->domain->bounce_size)
+               goto unlock;
+
+       vduse_domain_remove_user_bounce_pages(dev->domain);
+       unpin_user_pages_dirty_lock(dev->umem->pages,
+                                   dev->umem->npages, true);
+       atomic64_sub(dev->umem->npages, &dev->umem->mm->pinned_vm);
+       mmdrop(dev->umem->mm);
+       vfree(dev->umem->pages);
+       kfree(dev->umem);
+       dev->umem = NULL;
+       ret = 0;
+unlock:
+       mutex_unlock(&dev->mem_lock);
+       return ret;
+}
+
+static int vduse_dev_reg_umem(struct vduse_dev *dev,
+                             u64 iova, u64 uaddr, u64 size)
+{
+       struct page **page_list = NULL;
+       struct vduse_umem *umem = NULL;
+       long pinned = 0;
+       unsigned long npages, lock_limit;
+       int ret;
+
+       if (!dev->domain->bounce_map ||
+           size != dev->domain->bounce_size ||
+           iova != 0 || uaddr & ~PAGE_MASK)
+               return -EINVAL;
+
+       mutex_lock(&dev->mem_lock);
+       ret = -EEXIST;
+       if (dev->umem)
+               goto unlock;
+
+       ret = -ENOMEM;
+       npages = size >> PAGE_SHIFT;
+       page_list = __vmalloc(array_size(npages, sizeof(struct page *)),
+                             GFP_KERNEL_ACCOUNT);
+       umem = kzalloc(sizeof(*umem), GFP_KERNEL);
+       if (!page_list || !umem)
+               goto unlock;
+
+       mmap_read_lock(current->mm);
+
+       lock_limit = PFN_DOWN(rlimit(RLIMIT_MEMLOCK));
+       if (npages + atomic64_read(&current->mm->pinned_vm) > lock_limit)
+               goto out;
+
+       pinned = pin_user_pages(uaddr, npages, FOLL_LONGTERM | FOLL_WRITE,
+                               page_list, NULL);
+       if (pinned != npages) {
+               ret = pinned < 0 ? pinned : -ENOMEM;
+               goto out;
+       }
+
+       ret = vduse_domain_add_user_bounce_pages(dev->domain,
+                                                page_list, pinned);
+       if (ret)
+               goto out;
+
+       atomic64_add(npages, &current->mm->pinned_vm);
+
+       umem->pages = page_list;
+       umem->npages = pinned;
+       umem->iova = iova;
+       umem->mm = current->mm;
+       mmgrab(current->mm);
+
+       dev->umem = umem;
+out:
+       if (ret && pinned > 0)
+               unpin_user_pages(page_list, pinned);
+
+       mmap_read_unlock(current->mm);
+unlock:
+       if (ret) {
+               vfree(page_list);
+               kfree(umem);
+       }
+       mutex_unlock(&dev->mem_lock);
+       return ret;
+}
+
 static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
@@ -1089,6 +1196,38 @@ static long vduse_dev_ioctl(struct file *file, unsigned int cmd,
                ret = vduse_dev_queue_irq_work(dev, &dev->vqs[index].inject);
                break;
        }
+       case VDUSE_IOTLB_REG_UMEM: {
+               struct vduse_iova_umem umem;
+
+               ret = -EFAULT;
+               if (copy_from_user(&umem, argp, sizeof(umem)))
+                       break;
+
+               ret = -EINVAL;
+               if (!is_mem_zero((const char *)umem.reserved,
+                                sizeof(umem.reserved)))
+                       break;
+
+               ret = vduse_dev_reg_umem(dev, umem.iova,
+                                        umem.uaddr, umem.size);
+               break;
+       }
+       case VDUSE_IOTLB_DEREG_UMEM: {
+               struct vduse_iova_umem umem;
+
+               ret = -EFAULT;
+               if (copy_from_user(&umem, argp, sizeof(umem)))
+                       break;
+
+               ret = -EINVAL;
+               if (!is_mem_zero((const char *)umem.reserved,
+                                sizeof(umem.reserved)))
+                       break;
+
+               ret = vduse_dev_dereg_umem(dev, umem.iova,
+                                          umem.size);
+               break;
+       }
        default:
                ret = -ENOIOCTLCMD;
                break;
@@ -1101,6 +1240,7 @@ static int vduse_dev_release(struct inode *inode, struct file *file)
 {
        struct vduse_dev *dev = file->private_data;
 
+       vduse_dev_dereg_umem(dev, 0, dev->domain->bounce_size);
        spin_lock(&dev->msg_lock);
        /* Make sure the inflight messages can be processed after reconnection */
        list_splice_init(&dev->recv_list, &dev->send_list);
@@ -1163,6 +1303,7 @@ static struct vduse_dev *vduse_dev_create(void)
                return NULL;
 
        mutex_init(&dev->lock);
+       mutex_init(&dev->mem_lock);
        spin_lock_init(&dev->msg_lock);
        INIT_LIST_HEAD(&dev->send_list);
        INIT_LIST_HEAD(&dev->recv_list);
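
Illustrative teardown, continuing the sketch from the commit message
above (fd and BOUNCE_SIZE are the same assumed values): as the
vduse_dev_release() hunk shows, the kernel deregisters the memory
implicitly when the device fd is released, but userspace can also drop
it explicitly. Only iova and size are validated on deregistration
(uaddr is ignored), and reserved[] must still be zero:

	struct vduse_iova_umem umem = {
		.iova = 0,		/* must match the registered region */
		.size = BOUNCE_SIZE,	/* must equal the bounce buffer size */
	};	/* designated initializer leaves reserved[] zeroed */

	if (ioctl(fd, VDUSE_IOTLB_DEREG_UMEM, &umem))
		perror("VDUSE_IOTLB_DEREG_UMEM");
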
diff --git a/include/uapi/linux/vduse.h b/include/uapi/linux/vduse.h
index 7cfe1c1..9885e05 100644
--- a/include/uapi/linux/vduse.h
+++ b/include/uapi/linux/vduse.h
@@ -210,6 +210,29 @@ struct vduse_vq_eventfd {
  */
 #define VDUSE_VQ_INJECT_IRQ    _IOW(VDUSE_BASE, 0x17, __u32)
 
+/**
+ * struct vduse_iova_umem - userspace memory configuration for one IOVA region
+ * @uaddr: start address of userspace memory, which must be page-aligned
+ * @iova: start of the IOVA region
+ * @size: size of the IOVA region
+ * @reserved: for future use, needs to be initialized to zero
+ *
+ * Structure used by VDUSE_IOTLB_REG_UMEM and VDUSE_IOTLB_DEREG_UMEM
+ * ioctls to register/de-register userspace memory for IOVA regions
+ */
+struct vduse_iova_umem {
+       __u64 uaddr;
+       __u64 iova;
+       __u64 size;
+       __u64 reserved[3];
+};
+
+/* Register userspace memory for IOVA regions */
+#define VDUSE_IOTLB_REG_UMEM   _IOW(VDUSE_BASE, 0x18, struct vduse_iova_umem)
+
+/* De-register the userspace memory. Caller should set the iova and size fields. */
+#define VDUSE_IOTLB_DEREG_UMEM _IOW(VDUSE_BASE, 0x19, struct vduse_iova_umem)
+
 /* The control messages definition for read(2)/write(2) on /dev/vduse/$NAME */
 
 /**