mm/numa: automatically generate node migration order

[linux-2.6-microblaze.git] / mm / migrate.c
diff --git a/mm/migrate.c b/mm/migrate.c

index 7e24043..57aeb9b 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1099,6 +1099,80 @@ out:
         return rc;
  }
  
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  1, // Node 0 migrates to 1
+ *        2, // Node 1 migrates to 2
+ *       -1, // Node 2 does not migrate
+ *        4, // Node 3 migrates to 4
+ *        5, // Node 4 migrates to 5
+ *       -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node from which to look up the next node
+ *
+ * @returns: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       int target;
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target = READ_ONCE(node_demotion[node]);
+       rcu_read_unlock();
+
+       return target;
+}
+
  /*
   * Obtain the lock on page, remove all ptes and migrate the page
   * to the newly allocated page in newpage.
@@ -2982,3 +3056,145 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
  }
  EXPORT_SYMBOL(migrate_vma_finalize);
  #endif /* CONFIG_DEVICE_PRIVATE */
+
+/* Disable reclaim-based migration. */
+static void __disable_all_migrate_targets(void)
+{
+       int node;
+
+       for_each_online_node(node)
+               node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+       __disable_all_migrate_targets();
+
+       /*
+        * Ensure that the "disable" is visible across the system.
+        * Readers will see either a combination of before+disable
+        * state or disable+after.  They will never see before and
+        * after state together.
+        *
+        * The before+after state together might have cycles and
+        * could cause readers to do things like loop until this
+        * function finishes.  This ensures they can only see a
+        * single "bad" read and would, for instance, only loop
+        * once.
+        */
+       synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK.  It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+       int migration_target;
+
+       /*
+        * Can not set a migration target on a
+        * node with it already set.
+        *
+        * No need for READ_ONCE() here since this
+        * is in the write path for node_demotion[].
+        * This should be the only thread writing.
+        */
+       if (node_demotion[node] != NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       migration_target = find_next_best_node(node, used);
+       if (migration_target == NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       node_demotion[node] = migration_target;
+
+       return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided.  If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[].  However, it can not run simultaneously
+ * with itself.  Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+       nodemask_t next_pass    = NODE_MASK_NONE;
+       nodemask_t this_pass    = NODE_MASK_NONE;
+       nodemask_t used_targets = NODE_MASK_NONE;
+       int node;
+
+       /*
+        * Avoid any oddities like cycles that could occur
+        * from changes in the topology.  This will leave
+        * a momentary gap when migration is disabled.
+        */
+       disable_all_migrate_targets();
+
+       /*
+        * Allocations go close to CPUs, first.  Assume that
+        * the migration path starts at the nodes with CPUs.
+        */
+       next_pass = node_states[N_CPU];
+again:
+       this_pass = next_pass;
+       next_pass = NODE_MASK_NONE;
+       /*
+        * To avoid cycles in the migration "graph", ensure
+        * that migration sources are not future targets by
+        * setting them in 'used_targets'.  Do this only
+        * once per pass so that multiple source nodes can
+        * share a target node.
+        *
+        * 'used_targets' will become unavailable in future
+        * passes.  This limits some opportunities for
+        * multiple source nodes to share a destination.
+        */
+       nodes_or(used_targets, used_targets, this_pass);
+       for_each_node_mask(node, this_pass) {
+               int target_node = establish_migrate_target(node, &used_targets);
+
+               if (target_node == NUMA_NO_NODE)
+                       continue;
+
+               /*
+                * Visit targets from this pass in the next pass.
+                * Eventually, every node will have been part of
+                * a pass, and will become set in 'used_targets'.
+                */
+               node_set(target_node, next_pass);
+       }
+       /*
+        * 'next_pass' contains nodes which became migration
+        * targets in this pass.  Make additional passes until
+        * no more migration targets are available.
+        */
+       if (!nodes_empty(next_pass))
+               goto again;
+}
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+__maybe_unused // <- temporary to prevent warnings during bisects
+static void set_migration_target_nodes(void)
+{
+       get_online_mems();
+       __set_migration_target_nodes();
+       put_online_mems();
+}