mm/page_alloc: introduce vm.percpu_pagelist_high_fraction
author     Mel Gorman <mgorman@techsingularity.net>
           Tue, 29 Jun 2021 02:42:24 +0000 (19:42 -0700)
committer  Linus Torvalds <torvalds@linux-foundation.org>
           Tue, 29 Jun 2021 17:53:55 +0000 (10:53 -0700)
This introduces a new sysctl, vm.percpu_pagelist_high_fraction.  It is
similar to the old vm.percpu_pagelist_fraction.  The old sysctl increased
both pcp->batch and pcp->high, with the higher pcp->high potentially
reducing zone->lock contention.  However, the higher pcp->batch value also
potentially increased allocation latency while the PCP was refilled.  This
sysctl adjusts only pcp->high, so zone->lock contention is potentially
reduced but allocation latency during a PCP refill remains the same.

  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  649
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=8
  # grep -E "high:|batch" /proc/zoneinfo | tail -2
              high:  35071
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=64
              high:  4383
              batch: 63

  # sysctl vm.percpu_pagelist_high_fraction=0
              high:  649
              batch: 63

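For illustration only (not part of the patch), the arithmetic that
zone_highsize() performs in the mm/page_alloc.c hunk below can be sketched
as a small userspace C program; the zone figures used here are made up:

  /*
   * Editorial sketch, not kernel code: mirrors the pcp->high calculation
   * added to zone_highsize() below, using hypothetical zone figures.
   */
  #include <stdio.h>

  static long pcp_high(long low_wmark, long managed_pages, int fraction,
                       int nr_local_cpus, int batch)
  {
          /* default: cap the lists by the zone's low watermark;
           * with a fraction set: cap them by managed_pages / fraction */
          long total = fraction ? managed_pages / fraction : low_wmark;
          long high = total / nr_local_cpus;

          /* zone_highsize() also keeps high at no less than batch * 4 */
          return high < (long)batch * 4 ? (long)batch * 4 : high;
  }

  int main(void)
  {
          /* hypothetical zone: 1,000,000 managed pages, low watermark of
           * 4,000 pages, 4 local online CPUs, batch of 63 */
          printf("default:      high = %ld\n", pcp_high(4000, 1000000, 0, 4, 63));
          printf("fraction=8:   high = %ld\n", pcp_high(4000, 1000000, 8, 4, 63));
          printf("fraction=100: high = %ld\n", pcp_high(4000, 1000000, 100, 4, 63));
          return 0;
  }
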
[mgorman@techsingularity.net: fix documentation]
Link: https://lkml.kernel.org/r/20210528151010.GQ30378@techsingularity.net
Link: https://lkml.kernel.org/r/20210525080119.5455-7-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Dave Hansen <dave.hansen@linux.intel.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/admin-guide/sysctl/vm.rst
include/linux/mmzone.h
kernel/sysctl.c
mm/page_alloc.c

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 2fcafcc..2da2573 100644
@@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm:
 - overcommit_ratio
 - page-cluster
 - panic_on_oom
+- percpu_pagelist_high_fraction
 - stat_interval
 - stat_refresh
 - numa_stat
@@ -789,6 +790,26 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
 why oom happens. You can get snapshot.
 
 
+percpu_pagelist_high_fraction
+=============================
+
+This is the fraction of pages in each zone that can be stored on
+per-cpu page lists. It is an upper boundary that is divided by the
+number of online CPUs local to the zone. The minimum value is 8, which
+means that no more than 1/8th of the pages in each zone may be stored
+on per-cpu page lists. This entry only changes the value of hot per-cpu
+page lists. A user can specify a number like 100 to allow 1/100th of
+each zone's pages to be stored across its per-cpu lists.
+
+The batch value of each per-cpu page list remains the same regardless of
+the value of the high fraction, so allocation latencies are unaffected.
+
+The initial value is zero. The kernel uses this value to set the pcp->high
+mark based on the low watermark for the zone and the number of local
+online CPUs.  If the user writes '0' to this sysctl, it will revert to
+this default behavior.
+
+
 stat_interval
 =============
 
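As an editorial illustration of the minimum described above (not part of the
patch), the handler added in mm/page_alloc.c below is expected to reject
non-zero values smaller than 8 with EINVAL, while 0 restores the default
watermark-based behavior; a small userspace check might look like:

  /*
   * Editorial sketch, not kernel code: exercises the expected EINVAL
   * behavior of the new sysctl; needs to run as root.
   */
  #include <errno.h>
  #include <fcntl.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>

  int main(void)
  {
          int fd = open("/proc/sys/vm/percpu_pagelist_high_fraction", O_WRONLY);

          if (fd < 0) {
                  perror("open");
                  return 1;
          }

          /* 4 is below the documented minimum of 8; expect the write to fail */
          if (write(fd, "4", 1) < 0)
                  printf("4 rejected: %s\n", strerror(errno));

          /* 0 is accepted and reverts to the watermark-based default */
          if (write(fd, "0", 1) == 1)
                  printf("0 accepted\n");

          close(fd);
          return 0;
  }
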
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index b2f40d6..7d206ca 100644
@@ -1029,12 +1029,15 @@ int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *,
 extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
 int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *,
                size_t *, loff_t *);
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *, int,
+               void *, size_t *, loff_t *);
 int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
 int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
 int numa_zonelist_order_handler(struct ctl_table *, int,
                void *, size_t *, loff_t *);
+extern int percpu_pagelist_high_fraction;
 extern char numa_zonelist_order[];
 #define NUMA_ZONELIST_ORDER_LEN        16
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 51213c3..69d925f 100644
@@ -2908,6 +2908,14 @@ static struct ctl_table vm_table[] = {
                .extra1         = SYSCTL_ONE,
                .extra2         = &one_thousand,
        },
+       {
+               .procname       = "percpu_pagelist_high_fraction",
+               .data           = &percpu_pagelist_high_fraction,
+               .maxlen         = sizeof(percpu_pagelist_high_fraction),
+               .mode           = 0644,
+               .proc_handler   = percpu_pagelist_high_fraction_sysctl_handler,
+               .extra1         = SYSCTL_ZERO,
+       },
        {
                .procname       = "page_lock_unfairness",
                .data           = &sysctl_page_lock_unfairness,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index adf35cc..cfc4071 100644
@@ -120,6 +120,7 @@ typedef int __bitwise fpi_t;
 
 /* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
 static DEFINE_MUTEX(pcp_batch_high_lock);
+#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
 
 struct pagesets {
        local_lock_t lock;
@@ -192,6 +193,7 @@ EXPORT_SYMBOL(_totalram_pages);
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
 
+int percpu_pagelist_high_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
 EXPORT_SYMBOL(init_on_alloc);
@@ -6725,17 +6727,32 @@ static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 #ifdef CONFIG_MMU
        int high;
        int nr_local_cpus;
+       unsigned long total_pages;
+
+       if (!percpu_pagelist_high_fraction) {
+               /*
+                * By default, the high value of the pcp is based on the zone
+                * low watermark so that if they are full then background
+                * reclaim will not be started prematurely.
+                */
+               total_pages = low_wmark_pages(zone);
+       } else {
+               /*
+                * If percpu_pagelist_high_fraction is configured, the high
+                * value is based on a fraction of the managed pages in the
+                * zone.
+                */
+               total_pages = zone_managed_pages(zone) / percpu_pagelist_high_fraction;
+       }
 
        /*
-        * The high value of the pcp is based on the zone low watermark
-        * so that if they are full then background reclaim will not be
-        * started prematurely. The value is split across all online CPUs
-        * local to the zone. Note that early in boot that CPUs may not be
-        * online yet and that during CPU hotplug that the cpumask is not
-        * yet updated when a CPU is being onlined.
+        * Split the high value across all online CPUs local to the zone. Note
+        * that early in boot that CPUs may not be online yet and that during
+        * CPU hotplug that the cpumask is not yet updated when a CPU is being
+        * onlined.
         */
        nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
-       high = low_wmark_pages(zone) / nr_local_cpus;
+       high = total_pages / nr_local_cpus;
 
        /*
         * Ensure high is at least batch*4. The multiple is based on the
@@ -8500,6 +8517,44 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
        return 0;
 }
 
+/*
+ * percpu_pagelist_high_fraction - changes the pcp->high for each zone on each
+ * cpu. It is the fraction of total pages in each zone that a hot per-cpu
+ * pagelist can have before it gets flushed back to the buddy allocator.
+ */
+int percpu_pagelist_high_fraction_sysctl_handler(struct ctl_table *table,
+               int write, void *buffer, size_t *length, loff_t *ppos)
+{
+       struct zone *zone;
+       int old_percpu_pagelist_high_fraction;
+       int ret;
+
+       mutex_lock(&pcp_batch_high_lock);
+       old_percpu_pagelist_high_fraction = percpu_pagelist_high_fraction;
+
+       ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
+       if (!write || ret < 0)
+               goto out;
+
+       /* Sanity checking to avoid pcp imbalance */
+       if (percpu_pagelist_high_fraction &&
+           percpu_pagelist_high_fraction < MIN_PERCPU_PAGELIST_HIGH_FRACTION) {
+               percpu_pagelist_high_fraction = old_percpu_pagelist_high_fraction;
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* No change? */
+       if (percpu_pagelist_high_fraction == old_percpu_pagelist_high_fraction)
+               goto out;
+
+       for_each_populated_zone(zone)
+               zone_set_pageset_high_and_batch(zone, 0);
+out:
+       mutex_unlock(&pcp_batch_high_lock);
+       return ret;
+}
+
 #ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
 /*
  * Returns the number of pages that arch has reserved but