hugetlb: parallelize 2M hugetlb allocation and initialization
author: Gang Li <gang.li@linux.dev>
Thu, 22 Feb 2024 14:04:20 +0000 (22:04 +0800)
committer: Andrew Morton <akpm@linux-foundation.org>
Wed, 6 Mar 2024 21:04:17 +0000 (13:04 -0800)
By distributing both the allocation and the initialization tasks across
multiple threads, the initialization of 2M hugetlb will be faster, thereby
improving the boot speed.

Here are some test results:
      test case        no patch(ms)   patched(ms)   saved
 ------------------- -------------- ------------- --------
  256c2T(4 node) 2M           3336          1051   68.52%
  128c1T(2 node) 2M           1943           716   63.15%

Link: https://lkml.kernel.org/r/20240222140422.393911-8-gang.li@linux.dev
Signed-off-by: Gang Li <ligang.bdlg@bytedance.com>
Tested-by: David Rientjes <rientjes@google.com>
Reviewed-by: Muchun Song <muchun.song@linux.dev>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Daniel Jordan <daniel.m.jordan@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Jane Chu <jane.chu@oracle.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Randy Dunlap <rdunlap@infradead.org>
Cc: Steffen Klassert <steffen.klassert@secunet.com>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
mm/hugetlb.c

index 1c8274f..9934ed8 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/delayacct.h>
 #include <linux/memory.h>
 #include <linux/mm_inline.h>
+#include <linux/padata.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -3510,6 +3511,30 @@ static void __init hugetlb_hstate_alloc_pages_errcheck(unsigned long allocated,
        }
 }
 
+static void __init hugetlb_pages_alloc_boot_node(unsigned long start, unsigned long end, void *arg)
+{
+       struct hstate *h = (struct hstate *)arg;
+       int i, num = end - start;
+       nodemask_t node_alloc_noretry;
+       LIST_HEAD(folio_list);
+       int next_node = first_online_node;
+
+       /* Bit mask controlling how hard we retry per-node allocations.*/
+       nodes_clear(node_alloc_noretry);
+
+       for (i = 0; i < num; ++i) {
+               struct folio *folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
+                                               &node_alloc_noretry, &next_node);
+               if (!folio)
+                       break;
+
+               list_move(&folio->lru, &folio_list);
+               cond_resched();
+       }
+
+       prep_and_add_allocated_folios(h, &folio_list);
+}
+
 static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
 {
        unsigned long i;
@@ -3525,26 +3550,40 @@ static unsigned long __init hugetlb_gigantic_pages_alloc_boot(struct hstate *h)
 
 static unsigned long __init hugetlb_pages_alloc_boot(struct hstate *h)
 {
-       unsigned long i;
-       struct folio *folio;
-       LIST_HEAD(folio_list);
-       nodemask_t node_alloc_noretry;
-
-       /* Bit mask controlling how hard we retry per-node allocations.*/
-       nodes_clear(node_alloc_noretry);
+       struct padata_mt_job job = {
+               .fn_arg         = h,
+               .align          = 1,
+               .numa_aware     = true
+       };
 
-       for (i = 0; i < h->max_huge_pages; ++i) {
-               folio = alloc_pool_huge_folio(h, &node_states[N_MEMORY],
-                                               &node_alloc_noretry);
-               if (!folio)
-                       break;
-               list_add(&folio->lru, &folio_list);
-               cond_resched();
-       }
+       job.thread_fn   = hugetlb_pages_alloc_boot_node;
+       job.start       = 0;
+       job.size        = h->max_huge_pages;
 
-       prep_and_add_allocated_folios(h, &folio_list);
+       /*
+        * job.max_threads is twice the num_node_state(N_MEMORY),
+        *
+        * Tests below indicate that a multiplier of 2 significantly improves
+        * performance, and although larger values also provide improvements,
+        * the gains are marginal.
+        *
+        * Therefore, choosing 2 as the multiplier strikes a good balance between
+        * enhancing parallel processing capabilities and maintaining efficient
+        * resource management.
+        *
+        * +------------+-------+-------+-------+-------+-------+
+        * | multiplier |   1   |   2   |   3   |   4   |   5   |
+        * +------------+-------+-------+-------+-------+-------+
+        * | 256G 2node | 358ms | 215ms | 157ms | 134ms | 126ms |
+        * | 2T   4node | 979ms | 679ms | 543ms | 489ms | 481ms |
+        * | 50G  2node | 71ms  | 44ms  | 37ms  | 30ms  | 31ms  |
+        * +------------+-------+-------+-------+-------+-------+
+        */
+       job.max_threads = num_node_state(N_MEMORY) * 2;
+       job.min_chunk   = h->max_huge_pages / num_node_state(N_MEMORY) / 2;
+       padata_do_multithreaded(&job);
 
-       return i;
+       return h->nr_huge_pages;
 }
 
 /*