userfaultfd: prevent concurrent API initialization
author    Nadav Amit <namit@vmware.com>
Thu, 2 Sep 2021 21:58:59 +0000 (14:58 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 3 Sep 2021 16:58:16 +0000 (09:58 -0700)
userfaultfd assumes that the enabled features are set once and never
changed after the UFFDIO_API ioctl has succeeded.

However, UFFDIO_API can currently be called concurrently from two
different threads, succeed on both threads, and leave userfaultfd's
features in a non-deterministic state.  Theoretically, other uffd
operations (ioctls and page faults) can be dispatched while being
adversely affected by such feature changes.
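
For illustration only, here is a minimal user-space sketch of the racy
scenario described above.  The particular feature flags
(UFFD_FEATURE_EVENT_FORK vs. UFFD_FEATURE_THREAD_ID) are arbitrary
choices for this example; on a kernel without this fix both UFFDIO_API
calls may succeed, each with a different feature set:

/*
 * Hypothetical reproducer sketch: two threads racing on UFFDIO_API
 * against the same userfaultfd descriptor.
 */
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <pthread.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

static int uffd;

static void *api_thread(void *arg)
{
	struct uffdio_api api = {
		.api = UFFD_API,
		/* each thread requests a different feature set */
		.features = (unsigned long)arg,
	};
	int ret = ioctl(uffd, UFFDIO_API, &api);

	printf("UFFDIO_API(features=%#lx) -> %d\n", (unsigned long)arg, ret);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("userfaultfd");
		return 1;
	}

	/* Race two UFFDIO_API calls with different requested features. */
	pthread_create(&t1, NULL, api_thread,
		       (void *)(unsigned long)UFFD_FEATURE_EVENT_FORK);
	pthread_create(&t2, NULL, api_thread,
		       (void *)(unsigned long)UFFD_FEATURE_THREAD_ID);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	return 0;
}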

Moreover, the writes to ctx->state and ctx->features are not ordered,
which can, again theoretically, let userfaultfd_ioctl() think that the
UFFDIO_API ioctl has completed while the features are still not
initialized.
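
To make the ordering hazard concrete, the pre-fix completion path of
userfaultfd_api() was essentially the following two plain stores (see
the removed lines in the diff below), with nothing preventing another
CPU in userfaultfd_ioctl() from observing the first store but not the
second:

	ctx->state = UFFD_STATE_RUNNING;             /* other ioctls now pass the state check... */
	ctx->features = uffd_ctx_features(features); /* ...while possibly still seeing features == 0 */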

To avoid races, it is arguably best to get rid of ctx->state.  Since there
are only 2 states, record the API initialization in ctx->features as the
uppermost bit and remove ctx->state.
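
Conceptually, ctx->features then doubles as a one-shot latch: the first
UFFDIO_API caller atomically moves it from 0 to the requested features
with UFFD_FEATURE_INITIALIZED set, and every later caller fails.  A
minimal user-space analog of that pattern, using C11 atomics in place of
the kernel's cmpxchg() and with made-up names, might look like:

#include <stdatomic.h>
#include <stdbool.h>

#define FEATURE_INITIALIZED	(1u << 31)	/* mirrors UFFD_FEATURE_INITIALIZED */

static _Atomic unsigned int ctx_features;	/* stands in for ctx->features */

static bool ctx_is_initialized(void)
{
	return atomic_load(&ctx_features) & FEATURE_INITIALIZED;
}

/* First caller wins; any later call fails, like UFFDIO_API returning -EINVAL. */
static int ctx_api_init(unsigned int requested_features)
{
	unsigned int expected = 0;

	return atomic_compare_exchange_strong(&ctx_features, &expected,
			requested_features | FEATURE_INITIALIZED) ? 0 : -1;
}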

Link: https://lkml.kernel.org/r/20210808020724.1022515-3-namit@vmware.com
Fixes: 9cd75c3cd4c3d ("userfaultfd: non-cooperative: add ability to report non-PF events from uffd descriptor")
Signed-off-by: Nadav Amit <namit@vmware.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Axel Rasmussen <axelrasmussen@google.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: Peter Xu <peterx@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/userfaultfd.c

index 29a3016..003f0d3 100644
@@ -33,11 +33,6 @@ int sysctl_unprivileged_userfaultfd __read_mostly;
 
 static struct kmem_cache *userfaultfd_ctx_cachep __read_mostly;
 
-enum userfaultfd_state {
-       UFFD_STATE_WAIT_API,
-       UFFD_STATE_RUNNING,
-};
-
 /*
  * Start with fault_pending_wqh and fault_wqh so they're more likely
  * to be in the same cacheline.
@@ -69,8 +64,6 @@ struct userfaultfd_ctx {
        unsigned int flags;
        /* features requested from the userspace */
        unsigned int features;
-       /* state machine */
-       enum userfaultfd_state state;
        /* released */
        bool released;
        /* memory mappings are changing because of non-cooperative event */
@@ -104,6 +97,14 @@ struct userfaultfd_wake_range {
        unsigned long len;
 };
 
+/* internal indication that UFFD_API ioctl was successfully executed */
+#define UFFD_FEATURE_INITIALIZED               (1u << 31)
+
+static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
+{
+       return ctx->features & UFFD_FEATURE_INITIALIZED;
+}
+
 static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
                                     int wake_flags, void *key)
 {
@@ -667,7 +668,6 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
 
                refcount_set(&ctx->refcount, 1);
                ctx->flags = octx->flags;
-               ctx->state = UFFD_STATE_RUNNING;
                ctx->features = octx->features;
                ctx->released = false;
                atomic_set(&ctx->mmap_changing, 0);
@@ -944,38 +944,33 @@ static __poll_t userfaultfd_poll(struct file *file, poll_table *wait)
 
        poll_wait(file, &ctx->fd_wqh, wait);
 
-       switch (ctx->state) {
-       case UFFD_STATE_WAIT_API:
+       if (!userfaultfd_is_initialized(ctx))
                return EPOLLERR;
-       case UFFD_STATE_RUNNING:
-               /*
-                * poll() never guarantees that read won't block.
-                * userfaults can be waken before they're read().
-                */
-               if (unlikely(!(file->f_flags & O_NONBLOCK)))
-                       return EPOLLERR;
-               /*
-                * lockless access to see if there are pending faults
-                * __pollwait last action is the add_wait_queue but
-                * the spin_unlock would allow the waitqueue_active to
-                * pass above the actual list_add inside
-                * add_wait_queue critical section. So use a full
-                * memory barrier to serialize the list_add write of
-                * add_wait_queue() with the waitqueue_active read
-                * below.
-                */
-               ret = 0;
-               smp_mb();
-               if (waitqueue_active(&ctx->fault_pending_wqh))
-                       ret = EPOLLIN;
-               else if (waitqueue_active(&ctx->event_wqh))
-                       ret = EPOLLIN;
 
-               return ret;
-       default:
-               WARN_ON_ONCE(1);
+       /*
+        * poll() never guarantees that read won't block.
+        * userfaults can be waken before they're read().
+        */
+       if (unlikely(!(file->f_flags & O_NONBLOCK)))
                return EPOLLERR;
-       }
+       /*
+        * lockless access to see if there are pending faults
+        * __pollwait last action is the add_wait_queue but
+        * the spin_unlock would allow the waitqueue_active to
+        * pass above the actual list_add inside
+        * add_wait_queue critical section. So use a full
+        * memory barrier to serialize the list_add write of
+        * add_wait_queue() with the waitqueue_active read
+        * below.
+        */
+       ret = 0;
+       smp_mb();
+       if (waitqueue_active(&ctx->fault_pending_wqh))
+               ret = EPOLLIN;
+       else if (waitqueue_active(&ctx->event_wqh))
+               ret = EPOLLIN;
+
+       return ret;
 }
 
 static const struct file_operations userfaultfd_fops;
@@ -1170,7 +1165,7 @@ static ssize_t userfaultfd_read(struct file *file, char __user *buf,
        int no_wait = file->f_flags & O_NONBLOCK;
        struct inode *inode = file_inode(file);
 
-       if (ctx->state == UFFD_STATE_WAIT_API)
+       if (!userfaultfd_is_initialized(ctx))
                return -EINVAL;
 
        for (;;) {
@@ -1909,9 +1904,10 @@ out:
 static inline unsigned int uffd_ctx_features(__u64 user_features)
 {
        /*
-        * For the current set of features the bits just coincide
+        * For the current set of features the bits just coincide. Set
+        * UFFD_FEATURE_INITIALIZED to mark the features as enabled.
         */
-       return (unsigned int)user_features;
+       return (unsigned int)user_features | UFFD_FEATURE_INITIALIZED;
 }
 
 /*
@@ -1924,12 +1920,10 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
 {
        struct uffdio_api uffdio_api;
        void __user *buf = (void __user *)arg;
+       unsigned int ctx_features;
        int ret;
        __u64 features;
 
-       ret = -EINVAL;
-       if (ctx->state != UFFD_STATE_WAIT_API)
-               goto out;
        ret = -EFAULT;
        if (copy_from_user(&uffdio_api, buf, sizeof(uffdio_api)))
                goto out;
@@ -1953,9 +1947,13 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
        ret = -EFAULT;
        if (copy_to_user(buf, &uffdio_api, sizeof(uffdio_api)))
                goto out;
-       ctx->state = UFFD_STATE_RUNNING;
+
        /* only enable the requested features for this uffd context */
-       ctx->features = uffd_ctx_features(features);
+       ctx_features = uffd_ctx_features(features);
+       ret = -EINVAL;
+       if (cmpxchg(&ctx->features, 0, ctx_features) != 0)
+               goto err_out;
+
        ret = 0;
 out:
        return ret;
@@ -1972,7 +1970,7 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
        int ret = -EINVAL;
        struct userfaultfd_ctx *ctx = file->private_data;
 
-       if (cmd != UFFDIO_API && ctx->state == UFFD_STATE_WAIT_API)
+       if (cmd != UFFDIO_API && !userfaultfd_is_initialized(ctx))
                return -EINVAL;
 
        switch(cmd) {
@@ -2086,7 +2084,6 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
        refcount_set(&ctx->refcount, 1);
        ctx->flags = flags;
        ctx->features = 0;
-       ctx->state = UFFD_STATE_WAIT_API;
        ctx->released = false;
        atomic_set(&ctx->mmap_changing, 0);
        ctx->mm = current->mm;