mm: fix check_move_unevictable_pages() on THP

[linux-2.6-microblaze.git] / mm / filemap.c
diff --git a/mm/filemap.c b/mm/filemap.c

index 8e75bce..6aa08e7 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -198,7 +198,7 @@ static void unaccount_page_cache_page(struct address_space *mapping,
         if (PageHuge(page))
                 return;
  
-       nr = hpage_nr_pages(page);
+       nr = thp_nr_pages(page);
  
         __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
         if (PageSwapBacked(page)) {
@@ -988,9 +988,43 @@ void __init pagecache_init(void)
         page_writeback_init();
  }
  
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ *  (a) no special bits set:
+ *
+ *     We're just waiting for the bit to be released, and when a waker
+ *     calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ *     and remove it from the wait queue.
+ *
+ *     Simple and straightforward.
+ *
+ *  (b) WQ_FLAG_EXCLUSIVE:
+ *
+ *     The waiter is waiting to get the lock, and only one waiter should
+ *     be woken up to avoid any thundering herd behavior. We'll set the
+ *     WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ *     This is the traditional exclusive wait.
+ *
+ *  (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ *     The waiter is waiting to get the bit, and additionally wants the
+ *     lock to be transferred to it for fair lock behavior. If the lock
+ *     cannot be taken, we stop walking the wait queue without waking
+ *     the waiter.
+ *
+ *     This is the "fair lock handoff" case, and in addition to setting
+ *     WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ *     that it now has the lock.
+ */
  static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
  {
-       int ret;
+       unsigned int flags;
         struct wait_page_key *key = arg;
         struct wait_page_queue *wait_page
                 = container_of(wait, struct wait_page_queue, wait);
@@ -999,35 +1033,44 @@ static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync,
                 return 0;
  
         /*
-        * If it's an exclusive wait, we get the bit for it, and
-        * stop walking if we can't.
-        *
-        * If it's a non-exclusive wait, then the fact that this
-        * wake function was called means that the bit already
-        * was cleared, and we don't care if somebody then
-        * re-took it.
+        * If it's a lock handoff wait, we get the bit for it, and
+        * stop walking (and do not wake it up) if we can't.
          */
-       ret = 0;
-       if (wait->flags & WQ_FLAG_EXCLUSIVE) {
-               if (test_and_set_bit(key->bit_nr, &key->page->flags))
+       flags = wait->flags;
+       if (flags & WQ_FLAG_EXCLUSIVE) {
+               if (test_bit(key->bit_nr, &key->page->flags))
                         return -1;
-               ret = 1;
+               if (flags & WQ_FLAG_CUSTOM) {
+                       if (test_and_set_bit(key->bit_nr, &key->page->flags))
+                               return -1;
+                       flags |= WQ_FLAG_DONE;
+               }
         }
-       wait->flags |= WQ_FLAG_WOKEN;
  
+       /*
+        * We are holding the wait-queue lock, but the waiter that
+        * is waiting for this will be checking the flags without
+        * any locking.
+        *
+        * So update the flags atomically, and wake up the waiter
+        * afterwards to avoid any races. This store-release pairs
+        * with the load-acquire in wait_on_page_bit_common().
+        */
+       smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
         wake_up_state(wait->private, mode);
  
         /*
          * Ok, we have successfully done what we're waiting for,
          * and we can unconditionally remove the wait entry.
          *
-        * Note that this has to be the absolute last thing we do,
-        * since after list_del_init(&wait->entry) the wait entry
+        * Note that this pairs with the "finish_wait()" in the
+        * waiter, and has to be the absolute last thing we do.
+        * After this list_del_init_careful(&wait->entry) the wait entry
          * might be de-allocated and the process might even have
          * exited.
          */
         list_del_init_careful(&wait->entry);
-       return ret;
+       return (flags & WQ_FLAG_EXCLUSIVE) != 0;
  }
  
  static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1107,8 +1150,8 @@ enum behavior {
  };
  
  /*
- * Attempt to check (or get) the page bit, and mark the
- * waiter woken if successful.
+ * Attempt to check (or get) the page bit, and mark us done
+ * if successful.
   */
  static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
                                         struct wait_queue_entry *wait)
@@ -1119,13 +1162,17 @@ static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
         } else if (test_bit(bit_nr, &page->flags))
                 return false;
  
-       wait->flags |= WQ_FLAG_WOKEN;
+       wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
         return true;
  }
  
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
  static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         struct page *page, int bit_nr, int state, enum behavior behavior)
  {
+       int unfairness = sysctl_page_lock_unfairness;
         struct wait_page_queue wait_page;
         wait_queue_entry_t *wait = &wait_page.wait;
         bool thrashing = false;
@@ -1143,11 +1190,18 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         }
  
         init_wait(wait);
-       wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
         wait->func = wake_page_function;
         wait_page.page = page;
         wait_page.bit_nr = bit_nr;
  
+repeat:
+       wait->flags = 0;
+       if (behavior == EXCLUSIVE) {
+               wait->flags = WQ_FLAG_EXCLUSIVE;
+               if (--unfairness < 0)
+                       wait->flags |= WQ_FLAG_CUSTOM;
+       }
+
         /*
          * Do one last check whether we can get the
          * page bit synchronously.
@@ -1170,27 +1224,63 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
  
         /*
          * From now on, all the logic will be based on
-        * the WQ_FLAG_WOKEN flag, and the and the page
-        * bit testing (and setting) will be - or has
-        * already been - done by the wake function.
+        * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flags, to
+        * see whether the page bit testing has already
+        * been done by the wake function.
          *
          * We can drop our reference to the page.
          */
         if (behavior == DROP)
                 put_page(page);
  
+       /*
+        * Note that until the "finish_wait()", or until
+        * we see the WQ_FLAG_WOKEN flag, we need to
+        * be very careful with the 'wait->flags', because
+        * we may race with a waker that sets them.
+        */
         for (;;) {
+               unsigned int flags;
+
                 set_current_state(state);
  
-               if (signal_pending_state(state, current))
+               /* Loop until we've been woken or interrupted */
+               flags = smp_load_acquire(&wait->flags);
+               if (!(flags & WQ_FLAG_WOKEN)) {
+                       if (signal_pending_state(state, current))
+                               break;
+
+                       io_schedule();
+                       continue;
+               }
+
+               /* If we were non-exclusive, we're done */
+               if (behavior != EXCLUSIVE)
                         break;
  
-               if (wait->flags & WQ_FLAG_WOKEN)
+               /* If the waker got the lock for us, we're done */
+               if (flags & WQ_FLAG_DONE)
                         break;
  
-               io_schedule();
+               /*
+                * Otherwise, if we're getting the lock, we need to
+                * try to get it ourselves.
+                *
+                * And if that fails, we'll have to retry this all.
+                */
+               if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+                       goto repeat;
+
+               wait->flags |= WQ_FLAG_DONE;
+               break;
         }
  
+       /*
+        * If a signal happened, this 'finish_wait()' may remove the last
+        * waiter from the wait-queues, but the PageWaiters bit will remain
+        * set. That's ok. The next wakeup will take care of it, and trying
+        * to do it here would be difficult and prone to races.
+        */
         finish_wait(q, wait);
  
         if (thrashing) {
@@ -1200,12 +1290,20 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
         }
  
         /*
-        * A signal could leave PageWaiters set. Clearing it here if
-        * !waitqueue_active would be possible (by open-coding finish_wait),
-        * but still fail to catch it in the case of wait hash collision. We
-        * already can fail to clear wait hash collision cases, so don't
-        * bother with signals either.
+        * NOTE! The wait->flags weren't stable until we've done the
+        * 'finish_wait()', and we could have exited the loop above due
+        * to a signal, and had a wakeup event happen after the signal
+        * test but before the 'finish_wait()'.
+        *
+        * So only after the finish_wait() can we reliably determine
+        * if we got woken up or not, so we can now figure out the final
+        * return value based on that state without races.
+        *
+        * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+        * waiter, but an exclusive one requires WQ_FLAG_DONE.
          */
+       if (behavior == EXCLUSIVE)
+               return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
  
         return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
  }
@@ -2468,6 +2566,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
         struct address_space *mapping = file->f_mapping;
         struct file *fpin = NULL;
         pgoff_t offset = vmf->pgoff;
+       unsigned int mmap_miss;
  
         /* If we don't want any read-ahead, don't bother */
         if (vmf->vma->vm_flags & VM_RAND_READ)
@@ -2483,14 +2582,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
         }
  
         /* Avoid banging the cache line if not needed */
-       if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
-               ra->mmap_miss++;
+       mmap_miss = READ_ONCE(ra->mmap_miss);
+       if (mmap_miss < MMAP_LOTSAMISS * 10)
+               WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
  
         /*
          * Do we miss much more than hit in this file? If so,
          * stop bothering with read-ahead. It will only hurt.
          */
-       if (ra->mmap_miss > MMAP_LOTSAMISS)
+       if (mmap_miss > MMAP_LOTSAMISS)
                 return fpin;
  
         /*
@@ -2516,13 +2616,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
         struct file_ra_state *ra = &file->f_ra;
         struct address_space *mapping = file->f_mapping;
         struct file *fpin = NULL;
+       unsigned int mmap_miss;
         pgoff_t offset = vmf->pgoff;
  
         /* If we don't want any read-ahead, don't bother */
         if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
                 return fpin;
-       if (ra->mmap_miss > 0)
-               ra->mmap_miss--;
+       mmap_miss = READ_ONCE(ra->mmap_miss);
+       if (mmap_miss)
+               WRITE_ONCE(ra->mmap_miss, --mmap_miss);
         if (PageReadahead(page)) {
                 fpin = maybe_unlock_mmap_for_io(vmf, fpin);
                 page_cache_async_readahead(mapping, ra, file,
@@ -2688,6 +2790,7 @@ void filemap_map_pages(struct vm_fault *vmf,
         unsigned long max_idx;
         XA_STATE(xas, &mapping->i_pages, start_pgoff);
         struct page *page;
+       unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
  
         rcu_read_lock();
         xas_for_each(&xas, page, end_pgoff) {
@@ -2724,8 +2827,8 @@ void filemap_map_pages(struct vm_fault *vmf,
                 if (page->index >= max_idx)
                         goto unlock;
  
-               if (file->f_ra.mmap_miss > 0)
-                       file->f_ra.mmap_miss--;
+               if (mmap_miss > 0)
+                       mmap_miss--;
  
                 vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
                 if (vmf->pte)
@@ -2745,6 +2848,7 @@ next:
                         break;
         }
         rcu_read_unlock();
+       WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
  }
  EXPORT_SYMBOL(filemap_map_pages);