Merge tag 'thermal-5.17-rc1-2' of git://git.kernel.org/pub/scm/linux/kernel/git/rafae...
diff --git a/mm/migrate.c b/mm/migrate.c
index 7079e6b..18ce840 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -50,6 +50,7 @@
 #include <linux/ptrace.h>
 #include <linux/oom.h>
 #include <linux/memory.h>
+#include <linux/random.h>
 
 #include <asm/tlbflush.h>
 
@@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 
                        pte = pte_mkhuge(pte);
                        pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
-                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                        if (PageAnon(new))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
                        else
                                page_dup_rmap(new, true);
+                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                } else
 #endif
                {
-                       set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
-
                        if (PageAnon(new))
                                page_add_anon_rmap(new, vma, pvmw.address, false);
                        else
                                page_add_file_rmap(new, false);
+                       set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                }
                if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
                        mlock_vma_page(new);
@@ -1084,80 +1084,6 @@ out:
        return rc;
 }
 
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *     Socket A: 0, 1, 2
- *     Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- *     0 -> 1 -> 2 -> stop
- *     3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- *     {  1, // Node 0 migrates to 1
- *        2, // Node 1 migrates to 2
- *       -1, // Node 2 does not migrate
- *        4, // Node 3 migrates to 4
- *        5, // Node 4 migrates to 5
- *       -1} // Node 5 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-static int node_demotion[MAX_NUMNODES] __read_mostly =
-       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
-       int target;
-
-       /*
-        * node_demotion[] is updated without excluding this
-        * function from running.  RCU doesn't provide any
-        * compiler barriers, so the READ_ONCE() is required
-        * to avoid compiler reordering or read merging.
-        *
-        * Make sure to use RCU over entire code blocks if
-        * node_demotion[] reads need to be consistent.
-        */
-       rcu_read_lock();
-       target = READ_ONCE(node_demotion[node]);
-       rcu_read_unlock();
-
-       return target;
-}
-
 /*
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
@@ -1413,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * @mode:              The migration mode that specifies the constraints for
  *                     page migration, if any.
  * @reason:            The reason for page migration.
- * @ret_succeeded:     Set to the number of pages migrated successfully if
+ * @ret_succeeded:     Set to the number of normal pages migrated successfully if
  *                     the caller passes a non-NULL pointer.
  *
  * The function returns after 10 attempts or if no pages are movable any more
@@ -1421,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * It is caller's responsibility to call putback_movable_pages() to return pages
  * to the LRU or free list only if ret != 0.
  *
- * Returns the number of pages that were not migrated, or an error code.
+ * Returns the number of {normal pages, THPs, hugetlb pages} that were not
+ * migrated, or an error code. A split THP is counted as one non-migrated THP,
+ * no matter how many of its subpages are migrated successfully.
  */
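/*
 * Editorial sketch, not part of this patch: one way a caller might consume
 * the return value and @ret_succeeded documented above.  The helper and
 * callback names (example_migrate_list, alloc_target_cb) are hypothetical.
 */
static unsigned int example_migrate_list(struct list_head *pagelist,
					 new_page_t alloc_target_cb)
{
	unsigned int nr_succeeded = 0;
	int ret;

	ret = migrate_pages(pagelist, alloc_target_cb, NULL, 0,
			    MIGRATE_SYNC, MR_MEMORY_HOTPLUG, &nr_succeeded);
	if (ret)	/* > 0: units not migrated; < 0: error code */
		putback_movable_pages(pagelist);

	return nr_succeeded;
}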
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
@@ -1430,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int retry = 1;
        int thp_retry = 1;
        int nr_failed = 0;
+       int nr_failed_pages = 0;
        int nr_succeeded = 0;
        int nr_thp_succeeded = 0;
        int nr_thp_failed = 0;
@@ -1441,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
+       LIST_HEAD(thp_split_pages);
        bool nosplit = (reason == MR_NUMA_MISPLACED);
+       bool no_subpage_counting = false;
 
        trace_mm_migrate_pages_start(mode, reason);
 
        if (!swapwrite)
                current->flags |= PF_SWAPWRITE;
 
+thp_subpage_migration:
        for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
                retry = 0;
                thp_retry = 0;
@@ -1460,7 +1392,7 @@ retry:
                         * during migration.
                         */
                        is_thp = PageTransHuge(page) && !PageHuge(page);
-                       nr_subpages = thp_nr_pages(page);
+                       nr_subpages = compound_nr(page);
                        cond_resched();
 
                        if (PageHuge(page))
@@ -1496,18 +1428,20 @@ retry:
                        case -ENOSYS:
                                /* THP migration is unsupported */
                                if (is_thp) {
-                                       if (!try_split_thp(page, &page2, from)) {
+                                       nr_thp_failed++;
+                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
                                                nr_thp_split++;
                                                goto retry;
                                        }
 
-                                       nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        break;
                                }
 
                                /* Hugetlb migration is unsupported */
-                               nr_failed++;
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                break;
                        case -ENOMEM:
                                /*
@@ -1516,16 +1450,19 @@ retry:
                                 * THP NUMA faulting doesn't split THP to retry.
                                 */
                                if (is_thp && !nosplit) {
-                                       if (!try_split_thp(page, &page2, from)) {
+                                       nr_thp_failed++;
+                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
                                                nr_thp_split++;
                                                goto retry;
                                        }
 
-                                       nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        goto out;
                                }
-                               nr_failed++;
+
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                goto out;
                        case -EAGAIN:
                                if (is_thp) {
@@ -1535,12 +1472,11 @@ retry:
                                retry++;
                                break;
                        case MIGRATEPAGE_SUCCESS:
+                               nr_succeeded += nr_subpages;
                                if (is_thp) {
                                        nr_thp_succeeded++;
-                                       nr_succeeded += nr_subpages;
                                        break;
                                }
-                               nr_succeeded++;
                                break;
                        default:
                                /*
@@ -1551,17 +1487,37 @@ retry:
                                 */
                                if (is_thp) {
                                        nr_thp_failed++;
-                                       nr_failed += nr_subpages;
+                                       nr_failed_pages += nr_subpages;
                                        break;
                                }
-                               nr_failed++;
+
+                               if (!no_subpage_counting)
+                                       nr_failed++;
+                               nr_failed_pages += nr_subpages;
                                break;
                        }
                }
        }
-       nr_failed += retry + thp_retry;
+       nr_failed += retry;
        nr_thp_failed += thp_retry;
-       rc = nr_failed;
+       /*
+        * Try to migrate the subpages of THPs that failed to migrate.
+        * Do not add to nr_failed in this round, since all subpages of
+        * a THP were already counted as one failure in the first round.
+        */
+       if (!list_empty(&thp_split_pages)) {
+               /*
+                * Move non-migrated pages (after 10 retries) to ret_pages
+                * to avoid migrating them again.
+                */
+               list_splice_init(from, &ret_pages);
+               list_splice_init(&thp_split_pages, from);
+               no_subpage_counting = true;
+               retry = 1;
+               goto thp_subpage_migration;
+       }
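/*
 * Editorial worked example, not part of this patch: suppose one 512-subpage
 * THP fails with -ENOSYS and is successfully split.  Round one increments
 * nr_thp_failed once and queues the subpages on thp_split_pages instead of
 * touching nr_failed_pages.  In the retry round no_subpage_counting is true,
 * so each subpage that still fails adds only to nr_failed_pages (by 1, its
 * compound_nr); nr_failed stays untouched, and rc = nr_failed + nr_thp_failed
 * still reports the THP as a single failed unit.
 */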
+
+       rc = nr_failed + nr_thp_failed;
 out:
        /*
         * Put the permanent failure page back to migration list, they
@@ -1570,11 +1526,11 @@ out:
        list_splice(&ret_pages, from);
 
        count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
-       count_vm_events(PGMIGRATE_FAIL, nr_failed);
+       count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
        count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
        count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
        count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
-       trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+       trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
                               nr_thp_failed, nr_thp_split, mode, reason);
 
        if (!swapwrite)
@@ -2516,8 +2472,7 @@ static bool migrate_vma_check_page(struct page *page)
 static void migrate_vma_unmap(struct migrate_vma *migrate)
 {
        const unsigned long npages = migrate->npages;
-       const unsigned long start = migrate->start;
-       unsigned long addr, i, restore = 0;
+       unsigned long i, restore = 0;
        bool allow_drain = true;
 
        lru_add_drain();
@@ -2563,7 +2518,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
                }
        }
 
-       for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
+       for (i = 0; i < npages && restore; i++) {
                struct page *page = migrate_pfn_to_page(migrate->src[i]);
 
                if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
@@ -2961,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
 
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
+ *     {  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
+ *     {  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
+ *     {  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
+ *     {  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
+ *     {  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
+ *
+ * Moreover, some systems may have multiple slow memory nodes.
+ * Suppose a system has one socket with 3 memory nodes: node 0
+ * is fast memory, while nodes 1 and 2 are both slow memory,
+ * each at the same distance from the fast memory node.  The
+ * migration path should then be:
+ *
+ *     0 -> 1/2 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *     { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
+ *     { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
+ *     { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+#define DEFAULT_DEMOTION_TARGET_NODES 15
+
+#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
+#define DEMOTION_TARGET_NODES  (MAX_NUMNODES - 1)
+#else
+#define DEMOTION_TARGET_NODES  DEFAULT_DEMOTION_TARGET_NODES
+#endif
+
+struct demotion_nodes {
+       unsigned short nr;
+       short nodes[DEMOTION_TARGET_NODES];
+};
+
+static struct demotion_nodes *node_demotion __read_mostly;
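/*
 * Editorial sketch, not part of this patch: how the two-socket
 * fast/medium/slow example documented above would be expressed with this
 * structure (on typical configs MAX_NUMNODES exceeds 15, so
 * DEMOTION_TARGET_NODES is 15 and one entry easily holds these targets).
 * The array name and values are illustrative only.
 */
static const struct demotion_nodes example_demotion[6] = {
	{ .nr = 1, .nodes = { 1 } },		/* Node 0 demotes to node 1 */
	{ .nr = 1, .nodes = { 2 } },		/* Node 1 demotes to node 2 */
	{ .nr = 0, .nodes = { NUMA_NO_NODE } },	/* Node 2 is terminal */
	{ .nr = 1, .nodes = { 4 } },		/* Node 3 demotes to node 4 */
	{ .nr = 1, .nodes = { 5 } },		/* Node 4 demotes to node 5 */
	{ .nr = 0, .nodes = { NUMA_NO_NODE } },	/* Node 5 is terminal */
};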
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       struct demotion_nodes *nd;
+       unsigned short target_nr, index;
+       int target;
+
+       if (!node_demotion)
+               return NUMA_NO_NODE;
+
+       nd = &node_demotion[node];
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target_nr = READ_ONCE(nd->nr);
+
+       switch (target_nr) {
+       case 0:
+               target = NUMA_NO_NODE;
+               goto out;
+       case 1:
+               index = 0;
+               break;
+       default:
+               /*
+                * If there are multiple target nodes, just select one
+                * target node randomly.
+                *
+                * Round-robin selection would also work, but it would
+                * need an extra field in node_demotion[] to record the
+                * last selected target node, and updating that field
+                * could cause cache ping-pong.  Per-CPU data would avoid
+                * the caching issue but seems more complicated.  So for
+                * now, selecting a target node randomly is the simpler
+                * choice.
+                */
+               index = get_random_int() % target_nr;
+               break;
+       }
+
+       target = READ_ONCE(nd->nodes[index]);
+
+out:
+       rcu_read_unlock();
+       return target;
+}
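/*
 * Editorial usage sketch, not part of this patch: a reclaim-path caller can
 * ask where a page should be demoted to and fall back to normal reclaim when
 * the node is terminal.  The helper name example_demotion_target_for is
 * hypothetical.
 */
static int example_demotion_target_for(struct page *page)
{
	int target = next_demotion_node(page_to_nid(page));

	if (target == NUMA_NO_NODE)
		return NUMA_NO_NODE;	/* no lower tier; reclaim the page instead */

	return target;			/* allocate the destination page on this node */
}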
+
 #if defined(CONFIG_HOTPLUG_CPU)
 /* Disable reclaim-based migration. */
 static void __disable_all_migrate_targets(void)
 {
-       int node;
+       int node, i;
+
+       if (!node_demotion)
+               return;
 
-       for_each_online_node(node)
-               node_demotion[node] = NUMA_NO_NODE;
+       for_each_online_node(node) {
+               node_demotion[node].nr = 0;
+               for (i = 0; i < DEMOTION_TARGET_NODES; i++)
+                       node_demotion[node].nodes[i] = NUMA_NO_NODE;
+       }
 }
 
 static void disable_all_migrate_targets(void)
@@ -2995,26 +3088,40 @@ static void disable_all_migrate_targets(void)
  * Failing here is OK.  It might just indicate
  * being at the end of a chain.
  */
-static int establish_migrate_target(int node, nodemask_t *used)
+static int establish_migrate_target(int node, nodemask_t *used,
+                                   int best_distance)
 {
-       int migration_target;
+       int migration_target, index, val;
+       struct demotion_nodes *nd;
 
-       /*
-        * Can not set a migration target on a
-        * node with it already set.
-        *
-        * No need for READ_ONCE() here since this
-        * in the write path for node_demotion[].
-        * This should be the only thread writing.
-        */
-       if (node_demotion[node] != NUMA_NO_NODE)
+       if (!node_demotion)
                return NUMA_NO_NODE;
 
+       nd = &node_demotion[node];
+
        migration_target = find_next_best_node(node, used);
        if (migration_target == NUMA_NO_NODE)
                return NUMA_NO_NODE;
 
-       node_demotion[node] = migration_target;
+       /*
+        * If a migration target has already been chosen for this node,
+        * that earlier target is at the best distance.  Still check
+        * whether this node can also be demoted to other target nodes
+        * that share the same best distance.
+        */
+       if (best_distance != -1) {
+               val = node_distance(node, migration_target);
+               if (val > best_distance)
+                       return NUMA_NO_NODE;
+       }
+
+       index = nd->nr;
+       if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
+                     "Exceeds maximum demotion target nodes\n"))
+               return NUMA_NO_NODE;
+
+       nd->nodes[index] = migration_target;
+       nd->nr++;
 
        return migration_target;
 }
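/*
 * Editorial worked example, not part of this patch: take the one-socket
 * fast/slow topology described above node_demotion[] and assume (purely for
 * illustration) node_distance(0, 1) == node_distance(0, 2) == 20.  The first
 * call for node 0 runs with best_distance == -1, so the distance check is
 * skipped; node 1 is recorded and nr becomes 1.  The caller retries with
 * best_distance == 20: node 2 matches and is appended (nr == 2), while any
 * farther candidate would make val > best_distance and return NUMA_NO_NODE,
 * ending the loop in __set_migration_target_nodes().
 */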
@@ -3030,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used)
  *
  * The difference here is that cycles must be avoided.  If
  * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0.
+ * node1 migrates to can migrate to node0. Also, one node can
+ * demote to multiple target nodes if those targets are all at
+ * the same best distance from the source node.
  *
  * This function can run simultaneously with readers of
  * node_demotion[].  However, it can not run simultaneously
@@ -3042,7 +3151,7 @@ static void __set_migration_target_nodes(void)
        nodemask_t next_pass    = NODE_MASK_NONE;
        nodemask_t this_pass    = NODE_MASK_NONE;
        nodemask_t used_targets = NODE_MASK_NONE;
-       int node;
+       int node, best_distance;
 
        /*
         * Avoid any oddities like cycles that could occur
@@ -3071,18 +3180,33 @@ again:
         * multiple source nodes to share a destination.
         */
        nodes_or(used_targets, used_targets, this_pass);
-       for_each_node_mask(node, this_pass) {
-               int target_node = establish_migrate_target(node, &used_targets);
 
-               if (target_node == NUMA_NO_NODE)
-                       continue;
+       for_each_node_mask(node, this_pass) {
+               best_distance = -1;
 
                /*
-                * Visit targets from this pass in the next pass.
-                * Eventually, every node will have been part of
-                * a pass, and will become set in 'used_targets'.
+                * Try to set up the migration path for the node.  There can be
+                * multiple target nodes, so loop to collect all of the targets
+                * that share the best node distance.
                 */
-               node_set(target_node, next_pass);
+               do {
+                       int target_node =
+                               establish_migrate_target(node, &used_targets,
+                                                        best_distance);
+
+                       if (target_node == NUMA_NO_NODE)
+                               break;
+
+                       if (best_distance == -1)
+                               best_distance = node_distance(node, target_node);
+
+                       /*
+                        * Visit targets from this pass in the next pass.
+                        * Eventually, every node will have been part of
+                        * a pass, and will become set in 'used_targets'.
+                        */
+                       node_set(target_node, next_pass);
+               } while (1);
        }
        /*
         * 'next_pass' contains nodes which became migration
@@ -3183,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void)
 {
        int ret;
 
+       node_demotion = kmalloc_array(nr_node_ids,
+                                     sizeof(struct demotion_nodes),
+                                     GFP_KERNEL);
+       WARN_ON(!node_demotion);
+
        ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
                                        NULL, migration_offline_cpu);
        /*