mm/page_alloc: adjust pcp->high after CPU hotplug events
Author:     Mel Gorman <mgorman@techsingularity.net>
AuthorDate: Tue, 29 Jun 2021 02:42:15 +0000 (19:42 -0700)
Commit:     Linus Torvalds <torvalds@linux-foundation.org>
CommitDate: Tue, 29 Jun 2021 17:53:54 +0000 (10:53 -0700)

The PCP high watermark is based on the number of online CPUs local to the
zone, so the watermarks must be adjusted during CPU hotplug.  By the time
the hot-remove (dead) callback runs, the CPU has already been removed from
the online cpumask, but during hot-add the cpumask is not yet updated, so
a delta of +1 must be applied for pcp->high to be calculated correctly.
With this patch applied, the high watermarks are adjusted on both events,
as the example and the sketch below show.

  # grep high: /proc/zoneinfo  | tail -1
              high:  649
  # echo 0 > /sys/devices/system/cpu/cpu4/online
  # grep high: /proc/zoneinfo  | tail -1
              high:  664
  # echo 1 > /sys/devices/system/cpu/cpu4/online
  # grep high: /proc/zoneinfo  | tail -1
              high:  649
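
For reference, here is a minimal userspace sketch of the arithmetic behind
the numbers above, modelled on the zone_highsize() change in the diff
below: the zone's low watermark is split across the CPUs local to the
zone, plus a +1 delta for a CPU that is being onlined but is not yet in
the cpumask.  The function name and the watermark/CPU-count values are
assumptions picked purely for illustration (they are not taken from the
commit), and other adjustments made by the real zone_highsize() are
omitted.

  #include <stdio.h>

  /* Sketch of: high = low_wmark_pages(zone) / nr_local_cpus, where
   * nr_local_cpus = max(1U, cpumask_weight(node cpumask)) + cpu_online. */
  static unsigned long pcp_high_sketch(unsigned long low_wmark_pages,
                                       unsigned int local_cpus_in_mask,
                                       int cpu_online_delta)
  {
          unsigned int nr_local_cpus;

          nr_local_cpus = (local_cpus_in_mask ? local_cpus_in_mask : 1) +
                          cpu_online_delta;
          return low_wmark_pages / nr_local_cpus;
  }

  int main(void)
  {
          unsigned long low_wmark = 28556;  /* assumed zone low watermark */

          printf("44 CPUs in mask:            high = %lu\n",
                 pcp_high_sketch(low_wmark, 44, 0));
          printf("cpu4 offlined (43 in mask): high = %lu\n",
                 pcp_high_sketch(low_wmark, 43, 0));
          printf("cpu4 onlining (+1 delta):   high = %lu\n",
                 pcp_high_sketch(low_wmark, 43, 1));
          return 0;
  }

Offlining a CPU shrinks the divisor, so each remaining CPU's pcp->high
grows; the +1 delta during onlining restores the original split even
though the cpumask has not caught up yet.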

Link: https://lkml.kernel.org/r/20210525080119.5455-4-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hillf Danton <hdanton@sina.com>
Cc: Michal Hocko <mhocko@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/cpuhotplug.h
mm/internal.h
mm/page_alloc.c

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4a62b39..47e1358 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -54,7 +54,7 @@ enum cpuhp_state {
        CPUHP_MM_MEMCQ_DEAD,
        CPUHP_PERCPU_CNT_DEAD,
        CPUHP_RADIX_DEAD,
-       CPUHP_PAGE_ALLOC_DEAD,
+       CPUHP_PAGE_ALLOC,
        CPUHP_NET_DEV_DEAD,
        CPUHP_PCI_XGENE_DEAD,
        CPUHP_IOMMU_IOVA_DEAD,
diff --git a/mm/internal.h b/mm/internal.h
index 2946dfa..18e5fb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -206,7 +206,7 @@ extern int user_min_free_kbytes;
 extern void free_unref_page(struct page *page);
 extern void free_unref_page_list(struct list_head *list);
 
-extern void zone_pcp_update(struct zone *zone);
+extern void zone_pcp_update(struct zone *zone, int cpu_online);
 extern void zone_pcp_reset(struct zone *zone);
 extern void zone_pcp_disable(struct zone *zone);
 extern void zone_pcp_enable(struct zone *zone);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 19ec81d..8d196a8 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6667,7 +6667,7 @@ static int zone_batchsize(struct zone *zone)
 #endif
 }
 
-static int zone_highsize(struct zone *zone, int batch)
+static int zone_highsize(struct zone *zone, int batch, int cpu_online)
 {
 #ifdef CONFIG_MMU
        int high;
@@ -6678,9 +6678,10 @@ static int zone_highsize(struct zone *zone, int batch)
         * so that if they are full then background reclaim will not be
         * started prematurely. The value is split across all online CPUs
         * local to the zone. Note that early in boot that CPUs may not be
-        * online yet.
+        * online yet and that during CPU hotplug that the cpumask is not
+        * yet updated when a CPU is being onlined.
         */
-       nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone))));
+       nr_local_cpus = max(1U, cpumask_weight(cpumask_of_node(zone_to_nid(zone)))) + cpu_online;
        high = low_wmark_pages(zone) / nr_local_cpus;
 
        /*
@@ -6754,12 +6755,12 @@ static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long h
  * Calculate and set new high and batch values for all per-cpu pagesets of a
  * zone based on the zone's size.
  */
-static void zone_set_pageset_high_and_batch(struct zone *zone)
+static void zone_set_pageset_high_and_batch(struct zone *zone, int cpu_online)
 {
        int new_high, new_batch;
 
        new_batch = max(1, zone_batchsize(zone));
-       new_high = zone_highsize(zone, new_batch);
+       new_high = zone_highsize(zone, new_batch, cpu_online);
 
        if (zone->pageset_high == new_high &&
            zone->pageset_batch == new_batch)
@@ -6789,7 +6790,7 @@ void __meminit setup_zone_pageset(struct zone *zone)
                per_cpu_pages_init(pcp, pzstats);
        }
 
-       zone_set_pageset_high_and_batch(zone);
+       zone_set_pageset_high_and_batch(zone, 0);
 }
 
 /*
@@ -8044,6 +8045,7 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
 
 static int page_alloc_cpu_dead(unsigned int cpu)
 {
+       struct zone *zone;
 
        lru_add_drain_cpu(cpu);
        drain_pages(cpu);
@@ -8064,6 +8066,19 @@ static int page_alloc_cpu_dead(unsigned int cpu)
         * race with what we are doing.
         */
        cpu_vm_stats_fold(cpu);
+
+       for_each_populated_zone(zone)
+               zone_pcp_update(zone, 0);
+
+       return 0;
+}
+
+static int page_alloc_cpu_online(unsigned int cpu)
+{
+       struct zone *zone;
+
+       for_each_populated_zone(zone)
+               zone_pcp_update(zone, 1);
        return 0;
 }
 
@@ -8089,8 +8104,9 @@ void __init page_alloc_init(void)
                hashdist = 0;
 #endif
 
-       ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
-                                       "mm/page_alloc:dead", NULL,
+       ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC,
+                                       "mm/page_alloc:pcp",
+                                       page_alloc_cpu_online,
                                        page_alloc_cpu_dead);
        WARN_ON(ret < 0);
 }
@@ -8252,7 +8268,7 @@ void setup_per_zone_wmarks(void)
         * and high limits or the limits may be inappropriate.
         */
        for_each_zone(zone)
-               zone_pcp_update(zone);
+               zone_pcp_update(zone, 0);
 }
 
 /*
@@ -9053,10 +9069,10 @@ EXPORT_SYMBOL(free_contig_range);
  * The zone indicated has a new number of managed_pages; batch sizes and percpu
  * page high values need to be recalculated.
  */
-void __meminit zone_pcp_update(struct zone *zone)
+void zone_pcp_update(struct zone *zone, int cpu_online)
 {
        mutex_lock(&pcp_batch_high_lock);
-       zone_set_pageset_high_and_batch(zone);
+       zone_set_pageset_high_and_batch(zone, cpu_online);
        mutex_unlock(&pcp_batch_high_lock);
 }
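
As a general illustration of the API used above: the page_alloc_init()
hunk converts a dead-only hotplug state into one with both an online and
a dead callback.  Below is a hypothetical out-of-tree sketch of that
registration pattern (all "demo" names are invented; this is not part of
the patch), using the dynamic CPUHP_BP_PREPARE_DYN state rather than a
fixed enum entry.  cpuhp_setup_state_nocalls() only registers the
callbacks; it does not invoke them for CPUs that are already online.

  #include <linux/cpuhotplug.h>
  #include <linux/module.h>
  #include <linux/printk.h>

  static int demo_hp_state;

  /* Called while a CPU is being brought up, before it is in the cpumask. */
  static int demo_cpu_online(unsigned int cpu)
  {
          pr_info("demo: cpu%u coming online\n", cpu);
          return 0;
  }

  /* Called after a CPU has gone down and left the online cpumask. */
  static int demo_cpu_dead(unsigned int cpu)
  {
          pr_info("demo: cpu%u is dead\n", cpu);
          return 0;
  }

  static int __init demo_init(void)
  {
          int ret;

          ret = cpuhp_setup_state_nocalls(CPUHP_BP_PREPARE_DYN, "demo:pcp",
                                          demo_cpu_online, demo_cpu_dead);
          if (ret < 0)
                  return ret;
          demo_hp_state = ret;  /* dynamic states return the allocated slot */
          return 0;
  }

  static void __exit demo_exit(void)
  {
          cpuhp_remove_state_nocalls(demo_hp_state);
  }

  module_init(demo_init);
  module_exit(demo_exit);
  MODULE_LICENSE("GPL");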