sched/topology: Skip updating masks for non-online nodes
authorValentin Schneider <valentin.schneider@arm.com>
Wed, 18 Aug 2021 07:43:33 +0000 (13:13 +0530)
committerPeter Zijlstra <peterz@infradead.org>
Fri, 20 Aug 2021 10:32:57 +0000 (12:32 +0200)
The scheduler currently expects NUMA node distances to be stable from
init onwards, and as a consequence builds the related data structures
once-and-for-all at init (see sched_init_numa()).

Unfortunately, on some architectures node distance is unreliable for
offline nodes and may very well change upon onlining.

Skip over offline nodes during sched_init_numa(). Track nodes that have
been onlined at least once, and trigger a build of a node's NUMA masks
when it is first onlined post-init.

Reported-by: Geetika Moolchandani <Geetika.Moolchandani1@ibm.com>
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Valentin Schneider <valentin.schneider@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20210818074333.48645-1-srikar@linux.vnet.ibm.com
kernel/sched/topology.c

index b77ad49..4e8698e 100644 (file)
@@ -1482,6 +1482,8 @@ int                               sched_max_numa_distance;
 static int                     *sched_domains_numa_distance;
 static struct cpumask          ***sched_domains_numa_masks;
 int __read_mostly              node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
 #endif
 
 /*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
                        sched_domains_numa_masks[i][j] = mask;
 
                        for_each_node(k) {
+                               /*
+                                * Distance information can be unreliable for
+                                * offline nodes, defer building the node
+                                * masks to its bringup.
+                                * This relies on all unique distance values
+                                * still being visible at init time.
+                                */
+                               if (!node_online(j))
+                                       continue;
+
                                if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
                                        sched_numa_warn("Node-distance not symmetric");
 
@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
        sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
 
        init_numa_topology_type();
+
+       sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+       if (!sched_numa_onlined_nodes)
+               return;
+
+       bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+       for_each_online_node(i)
+               bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+       int i, j;
+
+       /*
+        * NUMA masks are not built for offline nodes in sched_init_numa().
+        * Thus, when a CPU of a never-onlined-before node gets plugged in,
+        * adding that new CPU to the right NUMA masks is not sufficient: the
+        * masks of that CPU's node must also be updated.
+        */
+       if (test_bit(node, sched_numa_onlined_nodes))
+               return;
+
+       bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+       for (i = 0; i < sched_domains_numa_levels; i++) {
+               for (j = 0; j < nr_node_ids; j++) {
+                       if (!node_online(j) || node == j)
+                               continue;
+
+                       if (node_distance(j, node) > sched_domains_numa_distance[i])
+                               continue;
+
+                       /* Add remote nodes in our masks */
+                       cpumask_or(sched_domains_numa_masks[i][node],
+                                  sched_domains_numa_masks[i][node],
+                                  sched_domains_numa_masks[0][j]);
+               }
+       }
+
+       /*
+        * A new node has been brought up, potentially changing the topology
+        * classification.
+        *
+        * Note that this is racy vs any use of sched_numa_topology_type :/
+        */
+       init_numa_topology_type();
 }
 
 void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
        int node = cpu_to_node(cpu);
        int i, j;
 
+       __sched_domains_numa_masks_set(node);
+
        for (i = 0; i < sched_domains_numa_levels; i++) {
                for (j = 0; j < nr_node_ids; j++) {
+                       if (!node_online(j))
+                               continue;
+
+                       /* Set ourselves in the remote node's masks */
                        if (node_distance(j, node) <= sched_domains_numa_distance[i])
                                cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
                }