mm: munlock: fix potential race with THP page split

author Vlastimil Babka <vbabka@suse.cz>

Thu, 23 Jan 2014 23:52:50 +0000 (15:52 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 24 Jan 2014 00:36:50 +0000 (16:36 -0800)
author Vlastimil Babka <vbabka@suse.cz>
Thu, 23 Jan 2014 23:52:50 +0000 (15:52 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 24 Jan 2014 00:36:50 +0000 (16:36 -0800)
diff --git a/mm/mlock.c b/mm/mlock.c

index 10819ed..b30adbe 100644 (file)
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -90,6 +90,26 @@ void mlock_vma_page(struct page *page)
         }
  }
  
+/*
+ * Isolate a page from LRU with optional get_page() pin.
+ * Assumes lru_lock already held and page already pinned.
+ */
+static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
+{
+       if (PageLRU(page)) {
+               struct lruvec *lruvec;
+
+               lruvec = mem_cgroup_page_lruvec(page, page_zone(page));
+               if (getpage)
+                       get_page(page);
+               ClearPageLRU(page);
+               del_page_from_lru_list(page, lruvec, page_lru(page));
+               return true;
+       }
+
+       return false;
+}
+
  /*
   * Finish munlock after successful page isolation
   *
@@ -126,9 +146,9 @@ static void __munlock_isolated_page(struct page *page)
  static void __munlock_isolation_failed(struct page *page)
  {
         if (PageUnevictable(page))
-               count_vm_event(UNEVICTABLE_PGSTRANDED);
+               __count_vm_event(UNEVICTABLE_PGSTRANDED);
         else
-               count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+               __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
  }
  
  /**
@@ -152,28 +172,34 @@ static void __munlock_isolation_failed(struct page *page)
  unsigned int munlock_vma_page(struct page *page)
  {
         unsigned int nr_pages;
+       struct zone *zone = page_zone(page);
  
         BUG_ON(!PageLocked(page));
  
-       if (TestClearPageMlocked(page)) {
-               nr_pages = hpage_nr_pages(page);
-               mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
-               if (!isolate_lru_page(page))
-                       __munlock_isolated_page(page);
-               else
-                       __munlock_isolation_failed(page);
-       } else {
-               nr_pages = hpage_nr_pages(page);
-       }
-
         /*
-        * Regardless of the original PageMlocked flag, we determine nr_pages
-        * after touching the flag. This leaves a possible race with a THP page
-        * split, such that a whole THP page was munlocked, but nr_pages == 1.
-        * Returning a smaller mask due to that is OK, the worst that can
-        * happen is subsequent useless scanning of the former tail pages.
-        * The NR_MLOCK accounting can however become broken.
+        * Serialize with any parallel __split_huge_page_refcount() which
+        * might otherwise copy PageMlocked to part of the tail pages before
+        * we clear it in the head page. It also stabilizes hpage_nr_pages().
          */
+       spin_lock_irq(&zone->lru_lock);
+
+       nr_pages = hpage_nr_pages(page);
+       if (!TestClearPageMlocked(page))
+               goto unlock_out;
+
+       __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+
+       if (__munlock_isolate_lru_page(page, true)) {
+               spin_unlock_irq(&zone->lru_lock);
+               __munlock_isolated_page(page);
+               goto out;
+       }
+       __munlock_isolation_failed(page);
+
+unlock_out:
+       spin_unlock_irq(&zone->lru_lock);
+
+out:
         return nr_pages - 1;
  }
  
@@ -310,34 +336,24 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
                 struct page *page = pvec->pages[i];
  
                 if (TestClearPageMlocked(page)) {
-                       struct lruvec *lruvec;
-                       int lru;
-
-                       if (PageLRU(page)) {
-                               lruvec = mem_cgroup_page_lruvec(page, zone);
-                               lru = page_lru(page);
-                               /*
-                                * We already have pin from follow_page_mask()
-                                * so we can spare the get_page() here.
-                                */
-                               ClearPageLRU(page);
-                               del_page_from_lru_list(page, lruvec, lru);
-                       } else {
-                               __munlock_isolation_failed(page);
-                               goto skip_munlock;
-                       }
-
-               } else {
-skip_munlock:
                         /*
-                        * We won't be munlocking this page in the next phase
-                        * but we still need to release the follow_page_mask()
-                        * pin. We cannot do it under lru_lock however. If it's
-                        * the last pin, __page_cache_release would deadlock.
+                        * We already have pin from follow_page_mask()
+                        * so we can spare the get_page() here.
                          */
-                       pagevec_add(&pvec_putback, pvec->pages[i]);
-                       pvec->pages[i] = NULL;
+                       if (__munlock_isolate_lru_page(page, false))
+                               continue;
+                       else
+                               __munlock_isolation_failed(page);
                 }
+
+               /*
+                * We won't be munlocking this page in the next phase
+                * but we still need to release the follow_page_mask()
+                * pin. We cannot do it under lru_lock however. If it's
+                * the last pin, __page_cache_release() would deadlock.
+                */
+               pagevec_add(&pvec_putback, pvec->pages[i]);
+               pvec->pages[i] = NULL;
         }
         delta_munlocked = -nr + pagevec_count(&pvec_putback);
         __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
author	Vlastimil Babka <vbabka@suse.cz>
	Thu, 23 Jan 2014 23:52:50 +0000 (15:52 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 24 Jan 2014 00:36:50 +0000 (16:36 -0800)