aio: keep poll requests on waitqueue until completed

author Eric Biggers <ebiggers@google.com>

Thu, 9 Dec 2021 01:04:54 +0000 (17:04 -0800)

committer Eric Biggers <ebiggers@google.com>

Thu, 9 Dec 2021 18:49:56 +0000 (10:49 -0800)
author Eric Biggers <ebiggers@google.com>
Thu, 9 Dec 2021 01:04:54 +0000 (17:04 -0800)
committer Eric Biggers <ebiggers@google.com>
Thu, 9 Dec 2021 18:49:56 +0000 (10:49 -0800)
diff --git a/fs/aio.c b/fs/aio.c

index 9c81cf6..2bc1352 100644 (file)
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -181,8 +181,9 @@ struct poll_iocb {
         struct file             *file;
         struct wait_queue_head  *head;
         __poll_t                events;
-       bool                    done;
         bool                    cancelled;
+       bool                    work_scheduled;
+       bool                    work_need_resched;
         struct wait_queue_entry wait;
         struct work_struct      work;
  };
@@ -1638,14 +1639,26 @@ static void aio_poll_complete_work(struct work_struct *work)
          * avoid further branches in the fast path.
          */
         spin_lock_irq(&ctx->ctx_lock);
+       spin_lock(&req->head->lock);
         if (!mask && !READ_ONCE(req->cancelled)) {
-               add_wait_queue(req->head, &req->wait);
+               /*
+                * The request isn't actually ready to be completed yet.
+                * Reschedule completion if another wakeup came in.
+                */
+               if (req->work_need_resched) {
+                       schedule_work(&req->work);
+                       req->work_need_resched = false;
+               } else {
+                       req->work_scheduled = false;
+               }
+               spin_unlock(&req->head->lock);
                 spin_unlock_irq(&ctx->ctx_lock);
                 return;
         }
+       list_del_init(&req->wait.entry);
+       spin_unlock(&req->head->lock);
         list_del_init(&iocb->ki_list);
         iocb->ki_res.res = mangle_poll(mask);
-       req->done = true;
         spin_unlock_irq(&ctx->ctx_lock);
  
         iocb_put(iocb);
@@ -1659,9 +1672,9 @@ static int aio_poll_cancel(struct kiocb *iocb)
  
         spin_lock(&req->head->lock);
         WRITE_ONCE(req->cancelled, true);
-       if (!list_empty(&req->wait.entry)) {
-               list_del_init(&req->wait.entry);
+       if (!req->work_scheduled) {
                 schedule_work(&aiocb->poll.work);
+               req->work_scheduled = true;
         }
         spin_unlock(&req->head->lock);
  
@@ -1680,20 +1693,26 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
         if (mask && !(mask & req->events))
                 return 0;
  
-       list_del_init(&req->wait.entry);
-
-       if (mask && spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
+       /*
+        * Complete the request inline if possible.  This requires that three
+        * conditions be met:
+        *   1. An event mask must have been passed.  If a plain wakeup was done
+        *      instead, then mask == 0 and we have to call vfs_poll() to get
+        *      the events, so inline completion isn't possible.
+        *   2. The completion work must not have already been scheduled.
+        *   3. ctx_lock must not be busy.  We have to use trylock because we
+        *      already hold the waitqueue lock, so this inverts the normal
+        *      locking order.  Use irqsave/irqrestore because not all
+        *      filesystems (e.g. fuse) call this function with IRQs disabled,
+        *      yet IRQs have to be disabled before ctx_lock is obtained.
+        */
+       if (mask && !req->work_scheduled &&
+           spin_trylock_irqsave(&iocb->ki_ctx->ctx_lock, flags)) {
                 struct kioctx *ctx = iocb->ki_ctx;
  
-               /*
-                * Try to complete the iocb inline if we can. Use
-                * irqsave/irqrestore because not all filesystems (e.g. fuse)
-                * call this function with IRQs disabled and because IRQs
-                * have to be disabled before ctx_lock is obtained.
-                */
+               list_del_init(&req->wait.entry);
                 list_del(&iocb->ki_list);
                 iocb->ki_res.res = mangle_poll(mask);
-               req->done = true;
                 if (iocb->ki_eventfd && eventfd_signal_allowed()) {
                         iocb = NULL;
                         INIT_WORK(&req->work, aio_poll_put_work);
@@ -1703,7 +1722,20 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync,
                 if (iocb)
                         iocb_put(iocb);
         } else {
-               schedule_work(&req->work);
+               /*
+                * Schedule the completion work if needed.  If it was already
+                * scheduled, record that another wakeup came in.
+                *
+                * Don't remove the request from the waitqueue here, as it might
+                * not actually be complete yet (we won't know until vfs_poll()
+                * is called), and we must not miss any wakeups.
+                */
+               if (req->work_scheduled) {
+                       req->work_need_resched = true;
+               } else {
+                       schedule_work(&req->work);
+                       req->work_scheduled = true;
+               }
         }
         return 1;
  }
@@ -1750,8 +1782,9 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
         req->events = demangle_poll(iocb->aio_buf) | EPOLLERR | EPOLLHUP;
  
         req->head = NULL;
-       req->done = false;
         req->cancelled = false;
+       req->work_scheduled = false;
+       req->work_need_resched = false;
  
         apt.pt._qproc = aio_poll_queue_proc;
         apt.pt._key = req->events;
@@ -1766,17 +1799,27 @@ static int aio_poll(struct aio_kiocb *aiocb, const struct iocb *iocb)
         spin_lock_irq(&ctx->ctx_lock);
         if (likely(req->head)) {
                 spin_lock(&req->head->lock);
-               if (unlikely(list_empty(&req->wait.entry))) {
-                       if (apt.error)
+               if (list_empty(&req->wait.entry) || req->work_scheduled) {
+                       /*
+                        * aio_poll_wake() already either scheduled the async
+                        * completion work, or completed the request inline.
+                        */
+                       if (apt.error) /* unsupported case: multiple queues */
                                 cancel = true;
                         apt.error = 0;
                         mask = 0;
                 }
                 if (mask || apt.error) {
+                       /* Steal to complete synchronously. */
                         list_del_init(&req->wait.entry);
                 } else if (cancel) {
+                       /* Cancel if possible (may be too late though). */
                         WRITE_ONCE(req->cancelled, true);
-               } else if (!req->done) { /* actually waiting for an event */
+               } else if (!list_empty(&req->wait.entry)) {
+                       /*
+                        * Actually waiting for an event, so add the request to
+                        * active_reqs so that it can be cancelled if needed.
+                        */
                         list_add_tail(&aiocb->ki_list, &ctx->active_reqs);
                         aiocb->ki_cancel = aio_poll_cancel;
                 }
author	Eric Biggers <ebiggers@google.com>
	Thu, 9 Dec 2021 01:04:54 +0000 (17:04 -0800)
committer	Eric Biggers <ebiggers@google.com>
	Thu, 9 Dec 2021 18:49:56 +0000 (10:49 -0800)