mm: allow a controlled amount of unfairness in the page lock

author Linus Torvalds <torvalds@linux-foundation.org>

Sun, 13 Sep 2020 21:05:35 +0000 (14:05 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 17 Sep 2020 17:26:41 +0000 (10:26 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Sun, 13 Sep 2020 21:05:35 +0000 (14:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 17 Sep 2020 17:26:41 +0000 (10:26 -0700)
diff --git a/include/linux/mm.h b/include/linux/mm.h

index ca6e6a8..b2f370f 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -41,6 +41,8 @@ struct writeback_control;
  struct bdi_writeback;
  struct pt_regs;
  
+extern int sysctl_page_lock_unfairness;
+
  void init_mm_internals(void);
  
  #ifndef CONFIG_NEED_MULTIPLE_NODES     /* Don't use mapnrs, do it properly */
diff --git a/include/linux/wait.h b/include/linux/wait.h

index 898c890..27fb99c 100644 (file)
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -21,6 +21,7 @@ int default_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, int
  #define WQ_FLAG_WOKEN          0x02
  #define WQ_FLAG_BOOKMARK       0x04
  #define WQ_FLAG_CUSTOM         0x08
+#define WQ_FLAG_DONE           0x10
  
  /*
   * A single wait-queue entry structure:
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index 09e70ee..afad085 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2912,6 +2912,14 @@ static struct ctl_table vm_table[] = {
                 .proc_handler   = percpu_pagelist_fraction_sysctl_handler,
                 .extra1         = SYSCTL_ZERO,
         },
+       {
+               .procname       = "page_lock_unfairness",
+               .data           = &sysctl_page_lock_unfairness,
+               .maxlen         = sizeof(sysctl_page_lock_unfairness),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec_minmax,
+               .extra1         = SYSCTL_ZERO,
+       },
  #ifdef CONFIG_MMU
         {
                 .procname       = "max_map_count",
diff --git a/mm/filemap.c b/mm/filemap.c

index 1aaea26..6aa08e7 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -988,9 +988,43 @@ void __init pagecache_init(void)
         page_writeback_init();
  }
  
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just he usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ *  (a) no special bits set:
+ *
+ *     We're just waiting for the bit to be released, and when a waker
+ *     calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ *     and remove it from the wait queue.
+ *
+ *     Simple and straightforward.
+ *
+ *  (b) WQ_FLAG_EXCLUSIVE:
+ *
+ *     The waiter is waiting to get the lock, and only one waiter should
+ *     be woken up to avoid any thundering herd behavior. We'll set the
+ *     WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ *     This is the traditional exclusive wait.
+ *
+ *  (b) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ *     The waiter is waiting to get the bit, and additionally wants the
+ *     lock to be transferred to it for fair lock behavior. If the lock
+ *     cannot be taken, we stop walking the wait queue without waking
+ *     the waiter.
+ *
+ *     This is the "fair lock handoff" case, and in addition to setting
+ *     WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ *     that it now has the lock.
+ */
  static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
  {
-       int ret;
+       unsigned int flags;
         struct wait_page_key *key = arg;
         struct wait_page_queue *wait_page
                 = container_of(wait, struct wait_page_queue, wait);
@@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
                 return 0;
  
         /*
-        * If it's an exclusive wait, we get the bit for it, and
-        * stop walking if we can't.
-        *
-        * If it's a non-exclusive wait, then the fact that this
-        * wake function was called means that the bit already
-        * was cleared, and we don't care if somebody then
-        * re-took it.
+        * If it's a lock handoff wait, we get the bit for it, and
+        * stop walking (and do not wake it up) if we can't.
          */
-       ret = 0;
-       if (wait->flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_and_set_bit(key->bit_nr, &key->page->flags))
+       flags = wait->flags;
+       if (flags & WQ_FLAG_EXCLUSIVE) {
+               if (test_bit(key->bit_nr, &key->page->flags))
                         return -1;
-               ret = 1;
+               if (flags & WQ_FLAG_CUSTOM) {
+                       if (test_and_set_bit(key->bit_nr, &key->page->flags))
+                               return -1;
+                       flags |= WQ_FLAG_DONE;
+               }
         }
-       wait->flags |= WQ_FLAG_WOKEN;
  
+       /*
+        * We are holding the wait-queue lock, but the waiter that
+        * is waiting for this will be checking the flags without
+        * any locking.
+        *
+        * So update the flags atomically, and wake up the waiter
+        * afterwards to avoid any races. This store-release pairs
+        * with the load-acquire in wait_on_page_bit_common().
+        */
+       smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
         wake_up_state(wait->private, mode);
  
         /*
          * Ok, we have successfully done what we're waiting for,
          * and we can unconditionally remove the wait entry.
          *
-        * Note that this has to be the absolute last thing we do,
-        * since after list_del_init(&wait->entry) the wait entry
+        * Note that this pairs with the "finish_wait()" in the
+        * waiter, and has to be the absolute last thing we do.
+        * After this list_del_init(&wait->entry) the wait entry
          * might be de-allocated and the process might even have
          * exited.
          */
         list_del_init_careful(&wait->entry);
-       return ret;
+       return (flags & WQ_FLAG_EXCLUSIVE) != 0;
  }
  
  static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1107,8 +1150,8 @@ enum behavior {
  };
  
  /*
- * Attempt to check (or get) the page bit, and mark the
- * waiter woken if successful.
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
   */
  static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
                                         struct wait_queue_entry *wait)
@@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
         } else if (test_bit(bit_nr, &page->flags))
                 return false;
  
-       wait->flags |= WQ_FLAG_WOKEN;
+       wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
         return true;
  }
  
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
  static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         struct page *page, int bit_nr, int state, enum behavior behavior)
  {
+       int unfairness = sysctl_page_lock_unfairness;
         struct wait_page_queue wait_page;
         wait_queue_entry_t *wait = &wait_page.wait;
         bool thrashing = false;
@@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         }
  
         init_wait(wait);
-       wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
         wait->func = wake_page_function;
         wait_page.page = page;
         wait_page.bit_nr = bit_nr;
  
+repeat:
+       wait->flags = 0;
+       if (behavior == EXCLUSIVE) {
+               wait->flags = WQ_FLAG_EXCLUSIVE;
+               if (--unfairness < 0)
+                       wait->flags |= WQ_FLAG_CUSTOM;
+       }
+
         /*
          * Do one last check whether we can get the
          * page bit synchronously.
@@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
  
         /*
          * From now on, all the logic will be based on
-        * the WQ_FLAG_WOKEN flag, and the and the page
-        * bit testing (and setting) will be - or has
-        * already been - done by the wake function.
+        * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+        * see whether the page bit testing has already
+        * been done by the wake function.
          *
          * We can drop our reference to the page.
          */
         if (behavior == DROP)
                 put_page(page);
  
+       /*
+        * Note that until the "finish_wait()", or until
+        * we see the WQ_FLAG_WOKEN flag, we need to
+        * be very careful with the 'wait->flags', because
+        * we may race with a waker that sets them.
+        */
         for (;;) {
+               unsigned int flags;
+
                 set_current_state(state);
  
-               if (signal_pending_state(state, current))
+               /* Loop until we've been woken or interrupted */
+               flags = smp_load_acquire(&wait->flags);
+               if (!(flags & WQ_FLAG_WOKEN)) {
+                       if (signal_pending_state(state, current))
+                               break;
+
+                       io_schedule();
+                       continue;
+               }
+
+               /* If we were non-exclusive, we're done */
+               if (behavior != EXCLUSIVE)
                         break;
  
-               if (wait->flags & WQ_FLAG_WOKEN)
+               /* If the waker got the lock for us, we're done */
+               if (flags & WQ_FLAG_DONE)
                         break;
  
-               io_schedule();
+               /*
+                * Otherwise, if we're getting the lock, we need to
+                * try to get it ourselves.
+                *
+                * And if that fails, we'll have to retry this all.
+                */
+               if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+                       goto repeat;
+
+               wait->flags |= WQ_FLAG_DONE;
+               break;
         }
  
+       /*
+        * If a signal happened, this 'finish_wait()' may remove the last
+        * waiter from the wait-queues, but the PageWaiters bit will remain
+        * set. That's ok. The next wakeup will take care of it, and trying
+        * to do it here would be difficult and prone to races.
+        */
         finish_wait(q, wait);
  
         if (thrashing) {
@@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         }
  
         /*
-        * A signal could leave PageWaiters set. Clearing it here if
-        * !waitqueue_active would be possible (by open-coding finish_wait),
-        * but still fail to catch it in the case of wait hash collision. We
-        * already can fail to clear wait hash collision cases, so don't
-        * bother with signals either.
+        * NOTE! The wait->flags weren't stable until we've done the
+        * 'finish_wait()', and we could have exited the loop above due
+        * to a signal, and had a wakeup event happen after the signal
+        * test but before the 'finish_wait()'.
+        *
+        * So only after the finish_wait() can we reliably determine
+        * if we got woken up or not, so we can now figure out the final
+        * return value based on that state without races.
+        *
+        * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+        * waiter, but an exclusive one requires WQ_FLAG_DONE.
          */
+       if (behavior == EXCLUSIVE)
+               return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
  
         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Sun, 13 Sep 2020 21:05:35 +0000 (14:05 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 17 Sep 2020 17:26:41 +0000 (10:26 -0700)
include/linux/mm.h		patch \| blob \| history
include/linux/wait.h		patch \| blob \| history
kernel/sysctl.c		patch \| blob \| history
mm/filemap.c		patch \| blob \| history