mm/page_alloc: allow high-order pages to be stored on the per-cpu lists

author Mel Gorman <mgorman@techsingularity.net>

Tue, 29 Jun 2021 02:43:08 +0000 (19:43 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 29 Jun 2021 17:53:55 +0000 (10:53 -0700)
author Mel Gorman <mgorman@techsingularity.net>
Tue, 29 Jun 2021 02:43:08 +0000 (19:43 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 17:53:55 +0000 (10:53 -0700)
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index 578588d..265a32e 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -333,6 +333,24 @@ enum zone_watermarks {
         NR_WMARK
  };
  
+/*
+ * One list per migratetype for each order from 0 to PAGE_ALLOC_COSTLY_ORDER,
+ * plus one more per migratetype for pageblock-order THP pages if configured.
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_PCP_THP 1
+#else
+#define NR_PCP_THP 0
+#endif
+#define NR_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1 + NR_PCP_THP))
+
+/*
+ * Shift to encode migratetype and order in the same integer, with order
+ * in the least significant bits.
+ */
+#define NR_PCP_ORDER_WIDTH 8
+#define NR_PCP_ORDER_MASK ((1<<NR_PCP_ORDER_WIDTH) - 1)
+
  #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
  #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
  #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
@@ -349,7 +367,7 @@ struct per_cpu_pages {
  #endif
  
         /* Lists of pages, one per migrate type stored on the pcp-lists */
-       struct list_head lists[MIGRATE_PCPTYPES];
+       struct list_head lists[NR_PCP_LISTS];
  };
  
  struct per_cpu_zonestat {
diff --git a/mm/internal.h b/mm/internal.h

index 18e5fb4..6ec2cea 100644 (file)
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -203,7 +203,7 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
                                         gfp_t gfp_flags);
  extern int user_min_free_kbytes;
  
-extern void free_unref_page(struct page *page);
+extern void free_unref_page(struct page *page, unsigned int order);
  extern void free_unref_page_list(struct list_head *list);
  
  extern void zone_pcp_update(struct zone *zone, int cpu_online);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 0e441f1..34f097e 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -674,10 +674,53 @@ out:
         add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
  }
  
+static inline unsigned int order_to_pindex(int migratetype, int order)
+{
+       int base = order;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order > PAGE_ALLOC_COSTLY_ORDER) {
+               VM_BUG_ON(order != pageblock_order);
+               base = PAGE_ALLOC_COSTLY_ORDER + 1;
+       }
+#else
+       VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+       return (MIGRATE_PCPTYPES * base) + migratetype;
+}
+
+static inline int pindex_to_order(unsigned int pindex)
+{
+       int order = pindex / MIGRATE_PCPTYPES;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order > PAGE_ALLOC_COSTLY_ORDER) {
+               order = pageblock_order;
+               VM_BUG_ON(order != pageblock_order);
+       }
+#else
+       VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
+#endif
+
+       return order;
+}
+
+static inline bool pcp_allowed_order(unsigned int order)
+{
+       if (order <= PAGE_ALLOC_COSTLY_ORDER)
+               return true;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (order == pageblock_order)
+               return true;
+#endif
+       return false;
+}
+
  static inline void free_the_page(struct page *page, unsigned int order)
  {
-       if (order == 0)         /* Via pcp? */
-               free_unref_page(page);
+       if (pcp_allowed_order(order))           /* Via pcp? */
+               free_unref_page(page, order);
         else
                 __free_pages_ok(page, order, FPI_NONE);
  }
@@ -700,7 +743,7 @@ static inline void free_the_page(struct page *page, unsigned int order)
  void free_compound_page(struct page *page)
  {
         mem_cgroup_uncharge(page);
-       __free_pages_ok(page, compound_order(page), FPI_NONE);
+       free_the_page(page, compound_order(page));
  }
  
  void prep_compound_page(struct page *page, unsigned int order)
@@ -1350,9 +1393,9 @@ static __always_inline bool free_pages_prepare(struct page *page,
   * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
   * moved from pcp lists to free lists.
   */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
  {
-       return free_pages_prepare(page, 0, true, FPI_NONE);
+       return free_pages_prepare(page, order, true, FPI_NONE);
  }
  
  static bool bulkfree_pcp_prepare(struct page *page)
@@ -1369,12 +1412,12 @@ static bool bulkfree_pcp_prepare(struct page *page)
   * debug_pagealloc enabled, they are checked also immediately when being freed
   * to the pcp lists.
   */
-static bool free_pcp_prepare(struct page *page)
+static bool free_pcp_prepare(struct page *page, unsigned int order)
  {
         if (debug_pagealloc_enabled_static())
-               return free_pages_prepare(page, 0, true, FPI_NONE);
+               return free_pages_prepare(page, order, true, FPI_NONE);
         else
-               return free_pages_prepare(page, 0, false, FPI_NONE);
+               return free_pages_prepare(page, order, false, FPI_NONE);
  }
  
  static bool bulkfree_pcp_prepare(struct page *page)
@@ -1406,8 +1449,10 @@ static inline void prefetch_buddy(struct page *page)
  static void free_pcppages_bulk(struct zone *zone, int count,
                                         struct per_cpu_pages *pcp)
  {
-       int migratetype = 0;
+       int pindex = 0;
         int batch_free = 0;
+       int nr_freed = 0;
+       unsigned int order;
         int prefetch_nr = READ_ONCE(pcp->batch);
         bool isolated_pageblocks;
         struct page *page, *tmp;
@@ -1418,7 +1463,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
          * below while (list_empty(list)) loop.
          */
         count = min(pcp->count, count);
-       while (count) {
+       while (count > 0) {
                 struct list_head *list;
  
                 /*
@@ -1430,24 +1475,31 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                  */
                 do {
                         batch_free++;
-                       if (++migratetype == MIGRATE_PCPTYPES)
-                               migratetype = 0;
-                       list = &pcp->lists[migratetype];
+                       if (++pindex == NR_PCP_LISTS)
+                               pindex = 0;
+                       list = &pcp->lists[pindex];
                 } while (list_empty(list));
  
                 /* This is the only non-empty list. Free them all. */
-               if (batch_free == MIGRATE_PCPTYPES)
+               if (batch_free == NR_PCP_LISTS)
                         batch_free = count;
  
+               order = pindex_to_order(pindex);
+               BUILD_BUG_ON(MAX_ORDER >= (1<<NR_PCP_ORDER_WIDTH));
                 do {
                         page = list_last_entry(list, struct page, lru);
                         /* must delete to avoid corrupting pcp list */
                         list_del(&page->lru);
-                       pcp->count--;
+                       nr_freed += 1 << order;
+                       count -= 1 << order;
  
                         if (bulkfree_pcp_prepare(page))
                                 continue;
  
+                       /* Encode order with the migratetype */
+                       page->index <<= NR_PCP_ORDER_WIDTH;
+                       page->index |= order;
+
                         list_add_tail(&page->lru, &head);
  
                         /*
@@ -1463,8 +1515,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                                 prefetch_buddy(page);
                                 prefetch_nr--;
                         }
-               } while (--count && --batch_free && !list_empty(list));
+               } while (count > 0 && --batch_free && !list_empty(list));
         }
+       pcp->count -= nr_freed;
  
         /*
          * local_lock_irq held so equivalent to spin_lock_irqsave for
@@ -1479,14 +1532,19 @@ static void free_pcppages_bulk(struct zone *zone, int count,
          */
         list_for_each_entry_safe(page, tmp, &head, lru) {
                 int mt = get_pcppage_migratetype(page);
+
+               /* mt has been encoded with the order (see above) */
+               order = mt & NR_PCP_ORDER_MASK;
+               mt >>= NR_PCP_ORDER_WIDTH;
+
                 /* MIGRATE_ISOLATE page should not go to pcplists */
                 VM_BUG_ON_PAGE(is_migrate_isolate(mt), page);
                 /* Pageblock could have been isolated meanwhile */
                 if (unlikely(isolated_pageblocks))
                         mt = get_pageblock_migratetype(page);
  
-               __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE);
-               trace_mm_page_pcpu_drain(page, 0, mt);
+               __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE);
+               trace_mm_page_pcpu_drain(page, order, mt);
         }
         spin_unlock(&zone->lock);
  }
@@ -3263,11 +3321,12 @@ void mark_free_pages(struct zone *zone)
  }
  #endif /* CONFIG_PM */
  
-static bool free_unref_page_prepare(struct page *page, unsigned long pfn)
+static bool free_unref_page_prepare(struct page *page, unsigned long pfn,
+                                                       unsigned int order)
  {
         int migratetype;
  
-       if (!free_pcp_prepare(page))
+       if (!free_pcp_prepare(page, order))
                 return false;
  
         migratetype = get_pfnblock_migratetype(page, pfn);
@@ -3317,16 +3376,18 @@ static int nr_pcp_high(struct per_cpu_pages *pcp, struct zone *zone)
  }
  
  static void free_unref_page_commit(struct page *page, unsigned long pfn,
-                                  int migratetype)
+                                  int migratetype, unsigned int order)
  {
         struct zone *zone = page_zone(page);
         struct per_cpu_pages *pcp;
         int high;
+       int pindex;
  
         __count_vm_event(PGFREE);
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
-       list_add(&page->lru, &pcp->lists[migratetype]);
-       pcp->count++;
+       pindex = order_to_pindex(migratetype, order);
+       list_add(&page->lru, &pcp->lists[pindex]);
+       pcp->count += 1 << order;
         high = nr_pcp_high(pcp, zone);
         if (pcp->count >= high) {
                 int batch = READ_ONCE(pcp->batch);
@@ -3336,15 +3397,15 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn,
  }
  
  /*
- * Free a 0-order page
+ * Free a pcp page
   */
-void free_unref_page(struct page *page)
+void free_unref_page(struct page *page, unsigned int order)
  {
         unsigned long flags;
         unsigned long pfn = page_to_pfn(page);
         int migratetype;
  
-       if (!free_unref_page_prepare(page, pfn))
+       if (!free_unref_page_prepare(page, pfn, order))
                 return;
  
         /*
@@ -3357,14 +3418,14 @@ void free_unref_page(struct page *page)
         migratetype = get_pcppage_migratetype(page);
         if (unlikely(migratetype >= MIGRATE_PCPTYPES)) {
                 if (unlikely(is_migrate_isolate(migratetype))) {
-                       free_one_page(page_zone(page), page, pfn, 0, migratetype, FPI_NONE);
+                       free_one_page(page_zone(page), page, pfn, order, migratetype, FPI_NONE);
                         return;
                 }
                 migratetype = MIGRATE_MOVABLE;
         }
  
         local_lock_irqsave(&pagesets.lock, flags);
-       free_unref_page_commit(page, pfn, migratetype);
+       free_unref_page_commit(page, pfn, migratetype, order);
         local_unlock_irqrestore(&pagesets.lock, flags);
  }
  
@@ -3381,7 +3442,7 @@ void free_unref_page_list(struct list_head *list)
         /* Prepare pages for freeing */
         list_for_each_entry_safe(page, next, list, lru) {
                 pfn = page_to_pfn(page);
-               if (!free_unref_page_prepare(page, pfn))
+               if (!free_unref_page_prepare(page, pfn, 0))
                         list_del(&page->lru);
  
                 /*
@@ -3413,7 +3474,7 @@ void free_unref_page_list(struct list_head *list)
                 set_page_private(page, 0);
                 migratetype = get_pcppage_migratetype(page);
                 trace_mm_page_free_batched(page);
-               free_unref_page_commit(page, pfn, migratetype);
+               free_unref_page_commit(page, pfn, migratetype, 0);
  
                 /*
                  * Guard against excessive IRQ disabled times when we get
@@ -3549,7 +3610,8 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z,
  
  /* Remove page from the per-cpu list, caller must protect the list */
  static inline
-struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+struct page *__rmqueue_pcplist(struct zone *zone, unsigned int order,
+                       int migratetype,
                         unsigned int alloc_flags,
                         struct per_cpu_pages *pcp,
                         struct list_head *list)
@@ -3558,16 +3620,30 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
  
         do {
                 if (list_empty(list)) {
-                       pcp->count += rmqueue_bulk(zone, 0,
-                                       READ_ONCE(pcp->batch), list,
+                       int batch = READ_ONCE(pcp->batch);
+                       int alloced;
+
+                       /*
+                        * Scale batch relative to order if batch implies
+                        * free pages can be stored on the PCP. Batch can
+                        * be 1 for small zones or for boot pagesets which
+                        * should never store free pages as the pages may
+                        * belong to arbitrary zones.
+                        */
+                       if (batch > 1)
+                               batch = max(batch >> order, 2);
+                       alloced = rmqueue_bulk(zone, order,
+                                       batch, list,
                                         migratetype, alloc_flags);
+
+                       pcp->count += alloced << order;
                         if (unlikely(list_empty(list)))
                                 return NULL;
                 }
  
                 page = list_first_entry(list, struct page, lru);
                 list_del(&page->lru);
-               pcp->count--;
+               pcp->count -= 1 << order;
         } while (check_new_pcp(page));
  
         return page;
@@ -3575,8 +3651,9 @@ struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
  
  /* Lock and remove page from the per-cpu list */
  static struct page *rmqueue_pcplist(struct zone *preferred_zone,
-                       struct zone *zone, gfp_t gfp_flags,
-                       int migratetype, unsigned int alloc_flags)
+                       struct zone *zone, unsigned int order,
+                       gfp_t gfp_flags, int migratetype,
+                       unsigned int alloc_flags)
  {
         struct per_cpu_pages *pcp;
         struct list_head *list;
@@ -3592,8 +3669,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone,
          */
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
         pcp->free_factor >>= 1;
-       list = &pcp->lists[migratetype];
-       page = __rmqueue_pcplist(zone,  migratetype, alloc_flags, pcp, list);
+       list = &pcp->lists[order_to_pindex(migratetype, order)];
+       page = __rmqueue_pcplist(zone, order, migratetype, alloc_flags, pcp, list);
         local_unlock_irqrestore(&pagesets.lock, flags);
         if (page) {
                 __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
@@ -3614,15 +3691,15 @@ struct page *rmqueue(struct zone *preferred_zone,
         unsigned long flags;
         struct page *page;
  
-       if (likely(order == 0)) {
+       if (likely(pcp_allowed_order(order))) {
                 /*
                  * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
                  * we need to skip it when CMA area isn't allowed.
                  */
                 if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
                                 migratetype != MIGRATE_MOVABLE) {
-                       page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
-                                       migratetype, alloc_flags);
+                       page = rmqueue_pcplist(preferred_zone, zone, order,
+                                       gfp_flags, migratetype, alloc_flags);
                         goto out;
                 }
         }
@@ -5201,7 +5278,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
         /* Attempt the batch allocation */
         local_lock_irqsave(&pagesets.lock, flags);
         pcp = this_cpu_ptr(zone->per_cpu_pageset);
-       pcp_list = &pcp->lists[ac.migratetype];
+       pcp_list = &pcp->lists[order_to_pindex(ac.migratetype, 0)];
  
         while (nr_populated < nr_pages) {
  
@@ -5211,7 +5288,7 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid,
                         continue;
                 }
  
-               page = __rmqueue_pcplist(zone, ac.migratetype, alloc_flags,
+               page = __rmqueue_pcplist(zone, 0, ac.migratetype, alloc_flags,
                                                                 pcp, pcp_list);
                 if (unlikely(!page)) {
                         /* Try and get at least one page */
@@ -6778,13 +6855,13 @@ static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
  
  static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonestat *pzstats)
  {
-       int migratetype;
+       int pindex;
  
         memset(pcp, 0, sizeof(*pcp));
         memset(pzstats, 0, sizeof(*pzstats));
  
-       for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
-               INIT_LIST_HEAD(&pcp->lists[migratetype]);
+       for (pindex = 0; pindex < NR_PCP_LISTS; pindex++)
+               INIT_LIST_HEAD(&pcp->lists[pindex]);
  
         /*
          * Set batch and high values safe for a boot pageset. A true percpu
diff --git a/mm/swap.c b/mm/swap.c

index 18cc9e6..6c11db7 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -95,7 +95,7 @@ static void __put_single_page(struct page *page)
  {
         __page_cache_release(page);
         mem_cgroup_uncharge(page);
-       free_unref_page(page);
+       free_unref_page(page, 0);
  }
  
  static void __put_compound_page(struct page *page)
author	Mel Gorman <mgorman@techsingularity.net>
	Tue, 29 Jun 2021 02:43:08 +0000 (19:43 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 29 Jun 2021 17:53:55 +0000 (10:53 -0700)
include/linux/mmzone.h		patch \| blob \| history
mm/internal.h		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/swap.c		patch \| blob \| history