mm/gup: page->hpage_pinned_refcount: exact pin counts for huge pages

author John Hubbard <jhubbard@nvidia.com>

Thu, 2 Apr 2020 04:05:33 +0000 (21:05 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Thu, 2 Apr 2020 16:35:27 +0000 (09:35 -0700)
author John Hubbard <jhubbard@nvidia.com>
Thu, 2 Apr 2020 04:05:33 +0000 (21:05 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Thu, 2 Apr 2020 16:35:27 +0000 (09:35 -0700)
diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst

index 9829345..7e5dd8b 100644 (file)
--- a/Documentation/core-api/pin_user_pages.rst
+++ b/Documentation/core-api/pin_user_pages.rst
@@ -52,8 +52,22 @@ Which flags are set by each wrapper
  
  For these pin_user_pages*() functions, FOLL_PIN is OR'd in with whatever gup
  flags the caller provides. The caller is required to pass in a non-null struct
-pages* array, and the function then pin pages by incrementing each by a special
-value. For now, that value is +1, just like get_user_pages*().::
+pages* array, and the function then pins pages by incrementing each by a special
+value: GUP_PIN_COUNTING_BIAS.
+
+For huge pages (and in fact, any compound page of more than 2 pages), the
+GUP_PIN_COUNTING_BIAS scheme is not used. Instead, an exact form of pin counting
+is achieved, by using the 3rd struct page in the compound page. A new struct
+page field, hpage_pinned_refcount, has been added in order to support this.
+
+This approach for compound pages avoids the counting upper limit problems that
+are discussed below. Those limitations would have been aggravated severely by
+huge pages, because each tail page adds a refcount to the head page. And in
+fact, testing revealed that, without a separate hpage_pinned_refcount field,
+page overflows were seen in some huge page stress tests.
+
+This also means that huge pages and compound pages (of order > 1) do not suffer
+from the false positives problem that is mentioned below.::
  
   Function
   --------
@@ -99,27 +113,6 @@ pages:
  This also leads to limitations: there are only 31-10==21 bits available for a
  counter that increments 10 bits at a time.
  
-TODO: for 1GB and larger huge pages, this is cutting it close. That's because
-when pin_user_pages() follows such pages, it increments the head page by "1"
-(where "1" used to mean "+1" for get_user_pages(), but now means "+1024" for
-pin_user_pages()) for each tail page. So if you have a 1GB huge page:
-
-* There are 256K (18 bits) worth of 4 KB tail pages.
-* There are 21 bits available to count up via GUP_PIN_COUNTING_BIAS (that is,
-  10 bits at a time)
-* There are 21 - 18 == 3 bits available to count. Except that there aren't,
-  because you need to allow for a few normal get_page() calls on the head page,
-  as well. Fortunately, the approach of using addition, rather than "hard"
-  bitfields, within page->_refcount, allows for sharing these bits gracefully.
-  But we're still looking at about 8 references.
-
-This, however, is a missing feature more than anything else, because it's easily
-solved by addressing an obvious inefficiency in the original get_user_pages()
-approach of retrieving pages: stop treating all the pages as if they were
-PAGE_SIZE. Retrieve huge pages as huge pages. The callers need to be aware of
-this, so some work is required. Once that's in place, this limitation mostly
-disappears from view, because there will be ample refcounting range available.
-
  * Callers must specifically request "dma-pinned tracking of pages". In other
    words, just calling get_user_pages() will not suffice; a new set of functions,
    pin_user_page() and related, must be used.
@@ -228,5 +221,6 @@ References
  * `Some slow progress on get_user_pages() (Apr 2, 2019) <https://lwn.net/Articles/784574/>`_
  * `DMA and get_user_pages() (LPC: Dec 12, 2018) <https://lwn.net/Articles/774411/>`_
  * `The trouble with get_user_pages() (Apr 30, 2018) <https://lwn.net/Articles/753027/>`_
+* `LWN kernel index: get_user_pages() <https://lwn.net/Kernel/Index/#Memory_management-get_user_pages>`_
  
  John Hubbard, October, 2019
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 10be09c..6a426f8 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -770,6 +770,24 @@ static inline unsigned int compound_order(struct page *page)
         return page[1].compound_order;
  }
  
+static inline bool hpage_pincount_available(struct page *page)
+{
+       /*
+        * Can the page->hpage_pinned_refcount field be used? That field is in
+        * the 3rd page of the compound page, so the smallest (2-page) compound
+        * pages cannot support it.
+        */
+       page = compound_head(page);
+       return PageCompound(page) && compound_order(page) > 1;
+}
+
+static inline int compound_pincount(struct page *page)
+{
+       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+       page = compound_head(page);
+       return atomic_read(compound_pincount_ptr(page));
+}
+
  static inline void set_compound_order(struct page *page, unsigned int order)
  {
         page[1].compound_order = order;
@@ -1084,6 +1102,11 @@ void unpin_user_pages(struct page **pages, unsigned long npages);
   * refcounts, and b) all the callers of this routine are expected to be able to
   * deal gracefully with a false positive.
   *
+ * For huge pages, the result will be exactly correct. That's because we have
+ * more tracking data available: the 3rd struct page in the compound page is
+ * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS
+ * scheme).
+ *
   * For more information, please see Documentation/vm/pin_user_pages.rst.
   *
   * @page:      pointer to page to be queried.
@@ -1092,6 +1115,9 @@ void unpin_user_pages(struct page **pages, unsigned long npages);
   */
  static inline bool page_maybe_dma_pinned(struct page *page)
  {
+       if (hpage_pincount_available(page))
+               return compound_pincount(page) > 0;
+
         /*
          * page_ref_count() is signed. If that refcount overflows, then
          * page_ref_count() returns a negative value, and callers will avoid
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h

index c28911c..dd555e6 100644 (file)
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -137,7 +137,7 @@ struct page {
                 };
                 struct {        /* Second tail page of compound page */
                         unsigned long _compound_pad_1;  /* compound_head */
-                       unsigned long _compound_pad_2;
+                       atomic_t hpage_pinned_refcount;
                         /* For both global and memcg */
                         struct list_head deferred_list;
                 };
@@ -226,6 +226,11 @@ static inline atomic_t *compound_mapcount_ptr(struct page *page)
         return &page[1].compound_mapcount;
  }
  
+static inline atomic_t *compound_pincount_ptr(struct page *page)
+{
+       return &page[2].hpage_pinned_refcount;
+}
+
  /*
   * Used for sizing the vmemmap region on some architectures
   */
diff --git a/mm/gup.c b/mm/gup.c

index ee4f14f..6601df4 100644 (file)
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -29,6 +29,22 @@ struct follow_page_context {
         unsigned int page_mask;
  };
  
+static void hpage_pincount_add(struct page *page, int refs)
+{
+       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+       VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+       atomic_add(refs, compound_pincount_ptr(page));
+}
+
+static void hpage_pincount_sub(struct page *page, int refs)
+{
+       VM_BUG_ON_PAGE(!hpage_pincount_available(page), page);
+       VM_BUG_ON_PAGE(page != compound_head(page), page);
+
+       atomic_sub(refs, compound_pincount_ptr(page));
+}
+
  /*
   * Return the compound head page with ref appropriately incremented,
   * or NULL if that failed.
@@ -70,8 +86,25 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page,
         if (flags & FOLL_GET)
                 return try_get_compound_head(page, refs);
         else if (flags & FOLL_PIN) {
-               refs *= GUP_PIN_COUNTING_BIAS;
-               return try_get_compound_head(page, refs);
+               /*
+                * When pinning a compound page of order > 1 (which is what
+                * hpage_pincount_available() checks for), use an exact count to
+                * track it, via hpage_pincount_add/_sub().
+                *
+                * However, be sure to *also* increment the normal page refcount
+                * field at least once, so that the page really is pinned.
+                */
+               if (!hpage_pincount_available(page))
+                       refs *= GUP_PIN_COUNTING_BIAS;
+
+               page = try_get_compound_head(page, refs);
+               if (!page)
+                       return NULL;
+
+               if (hpage_pincount_available(page))
+                       hpage_pincount_add(page, refs);
+
+               return page;
         }
  
         WARN_ON_ONCE(1);
@@ -106,12 +139,25 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
         if (flags & FOLL_GET)
                 return try_get_page(page);
         else if (flags & FOLL_PIN) {
+               int refs = 1;
+
                 page = compound_head(page);
  
                 if (WARN_ON_ONCE(page_ref_count(page) <= 0))
                         return false;
  
-               page_ref_add(page, GUP_PIN_COUNTING_BIAS);
+               if (hpage_pincount_available(page))
+                       hpage_pincount_add(page, 1);
+               else
+                       refs = GUP_PIN_COUNTING_BIAS;
+
+               /*
+                * Similar to try_grab_compound_head(): even if using the
+                * hpage_pincount_add/_sub() routines, be sure to
+                * *also* increment the normal page refcount field at least
+                * once, so that the page really is pinned.
+                */
+               page_ref_add(page, refs);
         }
  
         return true;
@@ -120,12 +166,17 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
  #ifdef CONFIG_DEV_PAGEMAP_OPS
  static bool __unpin_devmap_managed_user_page(struct page *page)
  {
-       int count;
+       int count, refs = 1;
  
         if (!page_is_devmap_managed(page))
                 return false;
  
-       count = page_ref_sub_return(page, GUP_PIN_COUNTING_BIAS);
+       if (hpage_pincount_available(page))
+               hpage_pincount_sub(page, 1);
+       else
+               refs = GUP_PIN_COUNTING_BIAS;
+
+       count = page_ref_sub_return(page, refs);
  
         /*
          * devmap page refcounts are 1-based, rather than 0-based: if
@@ -157,6 +208,8 @@ static bool __unpin_devmap_managed_user_page(struct page *page)
   */
  void unpin_user_page(struct page *page)
  {
+       int refs = 1;
+
         page = compound_head(page);
  
         /*
@@ -168,7 +221,12 @@ void unpin_user_page(struct page *page)
         if (__unpin_devmap_managed_user_page(page))
                 return;
  
-       if (page_ref_sub_and_test(page, GUP_PIN_COUNTING_BIAS))
+       if (hpage_pincount_available(page))
+               hpage_pincount_sub(page, 1);
+       else
+               refs = GUP_PIN_COUNTING_BIAS;
+
+       if (page_ref_sub_and_test(page, refs))
                 __put_page(page);
  }
  EXPORT_SYMBOL(unpin_user_page);
@@ -1955,8 +2013,12 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
  
  static void put_compound_head(struct page *page, int refs, unsigned int flags)
  {
-       if (flags & FOLL_PIN)
-               refs *= GUP_PIN_COUNTING_BIAS;
+       if (flags & FOLL_PIN) {
+               if (hpage_pincount_available(page))
+                       hpage_pincount_sub(page, refs);
+               else
+                       refs *= GUP_PIN_COUNTING_BIAS;
+       }
  
         VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
         /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index ba1de6b..3d31a23 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1009,6 +1009,9 @@ static void destroy_compound_gigantic_page(struct page *page,
         struct page *p = page + 1;
  
         atomic_set(compound_mapcount_ptr(page), 0);
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
+
         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                 clear_compound_head(p);
                 set_page_refcounted(p);
@@ -1287,6 +1290,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                 set_compound_head(p, page);
         }
         atomic_set(compound_mapcount_ptr(page), -1);
+
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
  }
  
  /*
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6e7e9c1..8f3a3bf 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -688,6 +688,8 @@ void prep_compound_page(struct page *page, unsigned int order)
                 set_compound_head(p, page);
         }
         atomic_set(compound_mapcount_ptr(page), -1);
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
  }
  
  #ifdef CONFIG_DEBUG_PAGEALLOC
diff --git a/mm/rmap.c b/mm/rmap.c

index b3e3819..e45b9b9 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1178,6 +1178,9 @@ void page_add_new_anon_rmap(struct page *page,
                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
                 /* increment count (starts at -1) */
                 atomic_set(compound_mapcount_ptr(page), 0);
+               if (hpage_pincount_available(page))
+                       atomic_set(compound_pincount_ptr(page), 0);
+
                 __inc_node_page_state(page, NR_ANON_THPS);
         } else {
                 /* Anon THP always mapped first with PMD */
@@ -1974,6 +1977,9 @@ void hugepage_add_new_anon_rmap(struct page *page,
  {
         BUG_ON(address < vma->vm_start || address >= vma->vm_end);
         atomic_set(compound_mapcount_ptr(page), 0);
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
+
         __page_set_anon_rmap(page, vma, address, 1);
  }
  #endif /* CONFIG_HUGETLB_PAGE */
author	John Hubbard <jhubbard@nvidia.com>
	Thu, 2 Apr 2020 04:05:33 +0000 (21:05 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Thu, 2 Apr 2020 16:35:27 +0000 (09:35 -0700)
Documentation/core-api/pin_user_pages.rst		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
include/linux/mm_types.h		patch \| blob \| history
mm/gup.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history