Merge tag 'drm-next-2022-10-14' of git://anongit.freedesktop.org/drm/drm
[linux-2.6-microblaze.git] / fs / userfaultfd.c
index 0c1d33c..07c81ab 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/security.h>
 #include <linux/hugetlb.h>
 #include <linux/swapops.h>
+#include <linux/miscdevice.h>
 
 int sysctl_unprivileged_userfaultfd __read_mostly;
 
@@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
        if (ctx->features & UFFD_FEATURE_SIGBUS)
                goto out;
-       if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
-           ctx->flags & UFFD_USER_MODE_ONLY) {
-               printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
-                       "sysctl knob to 1 if kernel faults must be handled "
-                       "without obtaining CAP_SYS_PTRACE capability\n");
+       if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
                goto out;
-       }
 
        /*
         * If it's already released don't get it. This avoids to loop
@@ -615,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx,
        if (release_new_ctx) {
                struct vm_area_struct *vma;
                struct mm_struct *mm = release_new_ctx->mm;
+               VMA_ITERATOR(vmi, mm, 0);
 
                /* the various vma->vm_userfaultfd_ctx still points to it */
                mmap_write_lock(mm);
-               for (vma = mm->mmap; vma; vma = vma->vm_next)
+               for_each_vma(vmi, vma) {
                        if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
                                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
                                vma->vm_flags &= ~__VM_UFFD_FLAGS;
                        }
+               }
                mmap_write_unlock(mm);
 
                userfaultfd_ctx_put(release_new_ctx);
@@ -803,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps,
        return false;
 }
 
-int userfaultfd_unmap_prep(struct vm_area_struct *vma,
-                          unsigned long start, unsigned long end,
-                          struct list_head *unmaps)
+int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start,
+                          unsigned long end, struct list_head *unmaps)
 {
-       for ( ; vma && vma->vm_start < end; vma = vma->vm_next) {
+       VMA_ITERATOR(vmi, mm, start);
+       struct vm_area_struct *vma;
+
+       for_each_vma_range(vmi, vma, end) {
                struct userfaultfd_unmap_ctx *unmap_ctx;
                struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx;
 
@@ -857,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
        /* len == 0 means wake all */
        struct userfaultfd_wake_range range = { .len = 0, };
        unsigned long new_flags;
+       MA_STATE(mas, &mm->mm_mt, 0, 0);
 
        WRITE_ONCE(ctx->released, true);
 
@@ -873,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
         */
        mmap_write_lock(mm);
        prev = NULL;
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
+       mas_for_each(&mas, vma, ULONG_MAX) {
                cond_resched();
                BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
                       !!(vma->vm_flags & __VM_UFFD_FLAGS));
@@ -887,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
                                 vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
                                 NULL_VM_UFFD_CTX, anon_vma_name(vma));
-               if (prev)
+               if (prev) {
+                       mas_pause(&mas);
                        vma = prev;
-               else
+               } else {
                        prev = vma;
+               }
+
                vma->vm_flags = new_flags;
                vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
        }
@@ -1272,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        bool found;
        bool basic_ioctls;
        unsigned long start, end, vma_end;
+       MA_STATE(mas, &mm->mm_mt, 0, 0);
 
        user_uffdio_register = (struct uffdio_register __user *) arg;
 
@@ -1314,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                goto out;
 
        mmap_write_lock(mm);
-       vma = find_vma_prev(mm, start, &prev);
+       mas_set(&mas, start);
+       vma = mas_find(&mas, ULONG_MAX);
        if (!vma)
                goto out_unlock;
 
@@ -1339,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
         */
        found = false;
        basic_ioctls = false;
-       for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+       for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1399,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        }
        BUG_ON(!found);
 
-       if (vma->vm_start < start)
-               prev = vma;
+       mas_set(&mas, start);
+       prev = mas_prev(&mas, 0);
+       if (prev != vma)
+               mas_next(&mas, ULONG_MAX);
 
        ret = 0;
        do {
@@ -1430,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                                 ((struct vm_userfaultfd_ctx){ ctx }),
                                 anon_vma_name(vma));
                if (prev) {
+                       /* vma_merge() invalidated the mas */
+                       mas_pause(&mas);
                        vma = prev;
                        goto next;
                }
@@ -1437,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                        ret = split_vma(mm, vma, start, 1);
                        if (ret)
                                break;
+                       /* split_vma() invalidated the mas */
+                       mas_pause(&mas);
                }
                if (vma->vm_end > end) {
                        ret = split_vma(mm, vma, end, 0);
                        if (ret)
                                break;
+                       /* split_vma() invalidated the mas */
+                       mas_pause(&mas);
                }
        next:
                /*
@@ -1458,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
        skip:
                prev = vma;
                start = vma->vm_end;
-               vma = vma->vm_next;
-       } while (vma && vma->vm_start < end);
+               vma = mas_next(&mas, end - 1);
+       } while (vma);
 out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
@@ -1503,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        bool found;
        unsigned long start, end, vma_end;
        const void __user *buf = (void __user *)arg;
+       MA_STATE(mas, &mm->mm_mt, 0, 0);
 
        ret = -EFAULT;
        if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1521,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                goto out;
 
        mmap_write_lock(mm);
-       vma = find_vma_prev(mm, start, &prev);
+       mas_set(&mas, start);
+       vma = mas_find(&mas, ULONG_MAX);
        if (!vma)
                goto out_unlock;
 
@@ -1546,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
         */
        found = false;
        ret = -EINVAL;
-       for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) {
+       for (cur = vma; cur; cur = mas_next(&mas, end - 1)) {
                cond_resched();
 
                BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^
@@ -1566,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        }
        BUG_ON(!found);
 
-       if (vma->vm_start < start)
-               prev = vma;
+       mas_set(&mas, start);
+       prev = mas_prev(&mas, 0);
+       if (prev != vma)
+               mas_next(&mas, ULONG_MAX);
 
        ret = 0;
        do {
@@ -1636,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
        skip:
                prev = vma;
                start = vma->vm_end;
-               vma = vma->vm_next;
-       } while (vma && vma->vm_start < end);
+               vma = mas_next(&mas, end - 1);
+       } while (vma);
 out_unlock:
        mmap_write_unlock(mm);
        mmput(mm);
@@ -2056,20 +2074,11 @@ static void init_once_userfaultfd_ctx(void *mem)
        seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
 }
 
-SYSCALL_DEFINE1(userfaultfd, int, flags)
+static int new_userfaultfd(int flags)
 {
        struct userfaultfd_ctx *ctx;
        int fd;
 
-       if (!sysctl_unprivileged_userfaultfd &&
-           (flags & UFFD_USER_MODE_ONLY) == 0 &&
-           !capable(CAP_SYS_PTRACE)) {
-               printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
-                       "sysctl knob to 1 if kernel faults must be handled "
-                       "without obtaining CAP_SYS_PTRACE capability\n");
-               return -EPERM;
-       }
-
        BUG_ON(!current->mm);
 
        /* Check the UFFD_* constants for consistency.  */
@@ -2102,8 +2111,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
        return fd;
 }
 
+static inline bool userfaultfd_syscall_allowed(int flags)
+{
+       /* Userspace-only page faults are always allowed */
+       if (flags & UFFD_USER_MODE_ONLY)
+               return true;
+
+       /*
+        * The user is requesting a userfaultfd which can handle kernel faults.
+        * Privileged users are always allowed to do this.
+        */
+       if (capable(CAP_SYS_PTRACE))
+               return true;
+
+       /* Otherwise, access to kernel fault handling is sysctl controlled. */
+       return sysctl_unprivileged_userfaultfd;
+}
+
+SYSCALL_DEFINE1(userfaultfd, int, flags)
+{
+       if (!userfaultfd_syscall_allowed(flags))
+               return -EPERM;
+
+       return new_userfaultfd(flags);
+}
+
+static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
+{
+       if (cmd != USERFAULTFD_IOC_NEW)
+               return -EINVAL;
+
+       return new_userfaultfd(flags);
+}
+
+static const struct file_operations userfaultfd_dev_fops = {
+       .unlocked_ioctl = userfaultfd_dev_ioctl,
+       .compat_ioctl = userfaultfd_dev_ioctl,
+       .owner = THIS_MODULE,
+       .llseek = noop_llseek,
+};
+
+static struct miscdevice userfaultfd_misc = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "userfaultfd",
+       .fops = &userfaultfd_dev_fops
+};
+
 static int __init userfaultfd_init(void)
 {
+       int ret;
+
+       ret = misc_register(&userfaultfd_misc);
+       if (ret)
+               return ret;
+
        userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
                                                sizeof(struct userfaultfd_ctx),
                                                0,