Merge tag 'io_uring-5.9-2020-10-02' of git://git.kernel.dk/linux-block

author Linus Torvalds <torvalds@linux-foundation.org>

Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)
diff --combined fs/io_uring.c

index eed0d06,b89cc87..aae0ef2
--- 1/fs/io_uring.c
--- 2/fs/io_uring.c
+++ b/fs/io_uring.c
@@@ -2569,7 -2569,7 +2569,7 @@@ static inline void io_rw_done(struct ki
                  * IO with EINTR.
                  */
                 ret = -EINTR;
- -              /* fall through */
+ +              fallthrough;
         default:
                 kiocb->ki_complete(kiocb, ret, 0);
         }
@@@ -3049,6 -3049,7 +3049,7 @@@ static int io_async_buf_func(struct wai
         if (!wake_page_match(wpq, key))
                 return 0;
   
+       req->rw.kiocb.ki_flags &= ~IOCB_WAITQ;
         list_del_init(&wait->entry);
   
         init_task_work(&req->task_work, io_req_task_submit);
@@@ -3106,6 -3107,7 +3107,7 @@@ static bool io_rw_should_retry(struct i
         wait->wait.flags = 0;
         INIT_LIST_HEAD(&wait->wait.entry);
         kiocb->ki_flags |= IOCB_WAITQ;
+       kiocb->ki_flags &= ~IOCB_NOWAIT;
         kiocb->ki_waitq = wait;
   
         io_get_req_task(req);
@@@ -4743,6 -4745,8 +4745,8 @@@ static int io_poll_double_wake(struct w
         if (mask && !(mask & poll->events))
                 return 0;
   
+       list_del_init(&wait->entry);
+ 
         if (poll && poll->head) {
                 bool done;
   
@@@ -8412,11 -8416,19 +8416,19 @@@ static int io_uring_show_cred(int id, v
   
   static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m)
   {
+       bool has_lock;
         int i;
   
-       mutex_lock(&ctx->uring_lock);
+       /*
+        * Avoid ABBA deadlock between the seq lock and the io_uring mutex,
+        * since fdinfo case grabs it in the opposite direction of normal use
+        * cases. If we fail to get the lock, we just don't iterate any
+        * structures that could be going away outside the io_uring mutex.
+        */
+       has_lock = mutex_trylock(&ctx->uring_lock);
+ 
         seq_printf(m, "UserFiles:\t%u\n", ctx->nr_user_files);
-       for (i = 0; i < ctx->nr_user_files; i++) {
+       for (i = 0; has_lock && i < ctx->nr_user_files; i++) {
                 struct fixed_file_table *table;
                 struct file *f;
   
@@@ -8428,13 -8440,13 +8440,13 @@@
                         seq_printf(m, "%5u: <none>\n", i);
         }
         seq_printf(m, "UserBufs:\t%u\n", ctx->nr_user_bufs);
-       for (i = 0; i < ctx->nr_user_bufs; i++) {
+       for (i = 0; has_lock && i < ctx->nr_user_bufs; i++) {
                 struct io_mapped_ubuf *buf = &ctx->user_bufs[i];
   
                 seq_printf(m, "%5u: 0x%llx/%u\n", i, buf->ubuf,
                                                 (unsigned int) buf->len);
         }
-       if (!idr_is_empty(&ctx->personality_idr)) {
+       if (has_lock && !idr_is_empty(&ctx->personality_idr)) {
                 seq_printf(m, "Personalities:\n");
                 idr_for_each(&ctx->personality_idr, io_uring_show_cred, m);
         }
@@@ -8449,7 -8461,8 +8461,8 @@@
                                         req->task->task_works != NULL);
         }
         spin_unlock_irq(&ctx->completion_lock);
-       mutex_unlock(&ctx->uring_lock);
+       if (has_lock)
+               mutex_unlock(&ctx->uring_lock);
   }
   
   static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
diff --combined mm/filemap.c

index 5202e38,ea38347..99c49ee
--- 1/mm/filemap.c
--- 2/mm/filemap.c
+++ b/mm/filemap.c
@@@ -988,43 -988,9 +988,43 @@@ void __init pagecache_init(void
         page_writeback_init();
   }
   
+ +/*
+ + * The page wait code treats the "wait->flags" somewhat unusually, because
+ + * we have multiple different kinds of waits, not just the usual "exclusive"
+ + * one.
+ + *
+ + * We have:
+ + *
+ + *  (a) no special bits set:
+ + *
+ + *    We're just waiting for the bit to be released, and when a waker
+ + *    calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ + *    and remove it from the wait queue.
+ + *
+ + *    Simple and straightforward.
+ + *
+ + *  (b) WQ_FLAG_EXCLUSIVE:
+ + *
+ + *    The waiter is waiting to get the lock, and only one waiter should
+ + *    be woken up to avoid any thundering herd behavior. We'll set the
+ + *    WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ + *
+ + *    This is the traditional exclusive wait.
+ + *
+ + *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ + *
+ + *    The waiter is waiting to get the bit, and additionally wants the
+ + *    lock to be transferred to it for fair lock behavior. If the lock
+ + *    cannot be taken, we stop walking the wait queue without waking
+ + *    the waiter.
+ + *
+ + *    This is the "fair lock handoff" case, and in addition to setting
+ + *    WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ + *    that it now has the lock.
+ + */
   static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
   {
- -      int ret;
+ +      unsigned int flags;
         struct wait_page_key *key = arg;
         struct wait_page_queue *wait_page
                 = container_of(wait, struct wait_page_queue, wait);
@@@ -1033,44 -999,35 +1033,44 @@@
                 return 0;
   
         /*
- -       * If it's an exclusive wait, we get the bit for it, and
- -       * stop walking if we can't.
- -       *
- -       * If it's a non-exclusive wait, then the fact that this
- -       * wake function was called means that the bit already
- -       * was cleared, and we don't care if somebody then
- -       * re-took it.
+ +       * If it's a lock handoff wait, we get the bit for it, and
+ +       * stop walking (and do not wake it up) if we can't.
          */
- -      ret = 0;
- -      if (wait->flags & WQ_FLAG_EXCLUSIVE) {
- -              if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ +      flags = wait->flags;
+ +      if (flags & WQ_FLAG_EXCLUSIVE) {
+ +              if (test_bit(key->bit_nr, &key->page->flags))
                         return -1;
- -              ret = 1;
+ +              if (flags & WQ_FLAG_CUSTOM) {
+ +                      if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ +                              return -1;
+ +                      flags |= WQ_FLAG_DONE;
+ +              }
         }
- -      wait->flags |= WQ_FLAG_WOKEN;
   
+ +      /*
+ +       * We are holding the wait-queue lock, but the waiter that
+ +       * is waiting for this will be checking the flags without
+ +       * any locking.
+ +       *
+ +       * So update the flags atomically, and wake up the waiter
+ +       * afterwards to avoid any races. This store-release pairs
+ +       * with the load-acquire in wait_on_page_bit_common().
+ +       */
+ +      smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
         wake_up_state(wait->private, mode);
   
         /*
          * Ok, we have successfully done what we're waiting for,
          * and we can unconditionally remove the wait entry.
          *
- -       * Note that this has to be the absolute last thing we do,
- -       * since after list_del_init(&wait->entry) the wait entry
+ +       * Note that this pairs with the "finish_wait()" in the
+ +       * waiter, and has to be the absolute last thing we do.
+ +       * After this list_del_init(&wait->entry) the wait entry
          * might be de-allocated and the process might even have
          * exited.
          */
         list_del_init_careful(&wait->entry);
- -      return ret;
+ +      return (flags & WQ_FLAG_EXCLUSIVE) != 0;
   }
   
   static void wake_up_page_bit(struct page *page, int bit_nr)
@@@ -1150,8 -1107,8 +1150,8 @@@ enum behavior 
   };
   
   /*
- - * Attempt to check (or get) the page bit, and mark the
- - * waiter woken if successful.
+ + * Attempt to check (or get) the page bit, and mark us done
+ + * if successful.
    */
   static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
                                         struct wait_queue_entry *wait)
@@@ -1162,17 -1119,13 +1162,17 @@@
         } else if (test_bit(bit_nr, &page->flags))
                 return false;
   
- -      wait->flags |= WQ_FLAG_WOKEN;
+ +      wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
         return true;
   }
   
+ +/* How many times do we accept lock stealing from under a waiter? */
+ +int sysctl_page_lock_unfairness = 5;
+ +
   static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         struct page *page, int bit_nr, int state, enum behavior behavior)
   {
+ +      int unfairness = sysctl_page_lock_unfairness;
         struct wait_page_queue wait_page;
         wait_queue_entry_t *wait = &wait_page.wait;
         bool thrashing = false;
@@@ -1190,18 -1143,11 +1190,18 @@@
         }
   
         init_wait(wait);
- -      wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
         wait->func = wake_page_function;
         wait_page.page = page;
         wait_page.bit_nr = bit_nr;
   
+ +repeat:
+ +      wait->flags = 0;
+ +      if (behavior == EXCLUSIVE) {
+ +              wait->flags = WQ_FLAG_EXCLUSIVE;
+ +              if (--unfairness < 0)
+ +                      wait->flags |= WQ_FLAG_CUSTOM;
+ +      }
+ +
         /*
          * Do one last check whether we can get the
          * page bit synchronously.
@@@ -1224,63 -1170,27 +1224,63 @@@
   
         /*
          * From now on, all the logic will be based on
- -       * the WQ_FLAG_WOKEN flag, and the and the page
- -       * bit testing (and setting) will be - or has
- -       * already been - done by the wake function.
+ +       * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ +       * see whether the page bit testing has already
+ +       * been done by the wake function.
          *
          * We can drop our reference to the page.
          */
         if (behavior == DROP)
                 put_page(page);
   
+ +      /*
+ +       * Note that until the "finish_wait()", or until
+ +       * we see the WQ_FLAG_WOKEN flag, we need to
+ +       * be very careful with the 'wait->flags', because
+ +       * we may race with a waker that sets them.
+ +       */
         for (;;) {
+ +              unsigned int flags;
+ +
                 set_current_state(state);
   
- -              if (signal_pending_state(state, current))
+ +              /* Loop until we've been woken or interrupted */
+ +              flags = smp_load_acquire(&wait->flags);
+ +              if (!(flags & WQ_FLAG_WOKEN)) {
+ +                      if (signal_pending_state(state, current))
+ +                              break;
+ +
+ +                      io_schedule();
+ +                      continue;
+ +              }
+ +
+ +              /* If we were non-exclusive, we're done */
+ +              if (behavior != EXCLUSIVE)
                         break;
   
- -              if (wait->flags & WQ_FLAG_WOKEN)
+ +              /* If the waker got the lock for us, we're done */
+ +              if (flags & WQ_FLAG_DONE)
                         break;
   
- -              io_schedule();
+ +              /*
+ +               * Otherwise, if we're getting the lock, we need to
+ +               * try to get it ourselves.
+ +               *
+ +               * And if that fails, we'll have to retry this all.
+ +               */
+ +              if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ +                      goto repeat;
+ +
+ +              wait->flags |= WQ_FLAG_DONE;
+ +              break;
         }
   
+ +      /*
+ +       * If a signal happened, this 'finish_wait()' may remove the last
+ +       * waiter from the wait-queues, but the PageWaiters bit will remain
+ +       * set. That's ok. The next wakeup will take care of it, and trying
+ +       * to do it here would be difficult and prone to races.
+ +       */
         finish_wait(q, wait);
   
         if (thrashing) {
@@@ -1290,20 -1200,12 +1290,20 @@@
         }
   
         /*
- -       * A signal could leave PageWaiters set. Clearing it here if
- -       * !waitqueue_active would be possible (by open-coding finish_wait),
- -       * but still fail to catch it in the case of wait hash collision. We
- -       * already can fail to clear wait hash collision cases, so don't
- -       * bother with signals either.
+ +       * NOTE! The wait->flags weren't stable until we've done the
+ +       * 'finish_wait()', and we could have exited the loop above due
+ +       * to a signal, and had a wakeup event happen after the signal
+ +       * test but before the 'finish_wait()'.
+ +       *
+ +       * So only after the finish_wait() can we reliably determine
+ +       * if we got woken up or not, so we can now figure out the final
+ +       * return value based on that state without races.
+ +       *
+ +       * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ +       * waiter, but an exclusive one requires WQ_FLAG_DONE.
          */
+ +      if (behavior == EXCLUSIVE)
+ +              return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
   
         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
   }
@@@ -2365,7 -2267,11 +2365,11 @@@ readpage
                 }
   
                 if (!PageUptodate(page)) {
-                       error = lock_page_killable(page);
+                       if (iocb->ki_flags & IOCB_WAITQ)
+                               error = lock_page_async(page, iocb->ki_waitq);
+                       else
+                               error = lock_page_killable(page);
+ 
                         if (unlikely(error))
                                 goto readpage_error;
                         if (!PageUptodate(page)) {
author	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 2 Oct 2020 21:38:10 +0000 (14:38 -0700)
		1	2
fs/io_uring.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/filemap.c	patch \|	diff1 \|	diff2 \|	blob \| history