Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block

[linux-2.6-microblaze.git] / mm / migrate.c
diff --git a/mm/migrate.c b/mm/migrate.c

index 7e24043..a6a7743 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -49,6 +49,7 @@
  #include <linux/sched/mm.h>
  #include <linux/ptrace.h>
  #include <linux/oom.h>
+#include <linux/memory.h>
  
  #include <asm/tlbflush.h>
  
@@ -959,7 +960,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                                 int force, enum migrate_mode mode)
  {
         int rc = -EAGAIN;
-       int page_was_mapped = 0;
+       bool page_was_mapped = false;
         struct anon_vma *anon_vma = NULL;
         bool is_lru = !__PageMovable(page);
  
@@ -1007,7 +1008,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
         }
  
         /*
-        * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+        * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
          * we cannot notice that anon_vma is freed while we migrates a page.
          * This get_anon_vma() delays freeing anon_vma pointer until the end
          * of migration. File cache pages are no problem because of page_lock()
@@ -1062,7 +1063,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                 VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                                 page);
                 try_to_migrate(page, 0);
-               page_was_mapped = 1;
+               page_was_mapped = true;
         }
  
         if (!page_mapped(page))
@@ -1099,6 +1100,80 @@ out:
         return rc;
  }
  
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path start on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progress through medium and end with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  1, // Node 0 migrates to 1
+ *        2, // Node 1 migrates to 2
+ *       -1, // Node 2 does not migrate
+ *        4, // Node 3 migrates to 4
+ *        5, // Node 4 migrates to 5
+ *       -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       int target;
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target = READ_ONCE(node_demotion[node]);
+       rcu_read_unlock();
+
+       return target;
+}
+
  /*
   * Obtain the lock on page, remove all ptes and migrate the page
   * to the newly allocated page in newpage.
@@ -1354,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2,
   * @mode:              The migration mode that specifies the constraints for
   *                     page migration, if any.
   * @reason:            The reason for page migration.
+ * @ret_succeeded:     Set to the number of pages migrated successfully if
+ *                     the caller passes a non-NULL pointer.
   *
   * The function returns after 10 attempts or if no pages are movable any more
   * because the list has become empty or no retryable pages exist any more.
@@ -1364,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
   */
  int migrate_pages(struct list_head *from, new_page_t get_new_page,
                 free_page_t put_new_page, unsigned long private,
-               enum migrate_mode mode, int reason)
+               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
  {
         int retry = 1;
         int thp_retry = 1;
@@ -1519,6 +1596,9 @@ out:
         if (!swapwrite)
                 current->flags &= ~PF_SWAPWRITE;
  
+       if (ret_succeeded)
+               *ret_succeeded = nr_succeeded;
+
         return rc;
  }
  
@@ -1588,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
         };
  
         err = migrate_pages(pagelist, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+               (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
         if (err)
                 putback_movable_pages(pagelist);
         return err;
@@ -1820,6 +1900,23 @@ set_status:
         mmap_read_unlock(mm);
  }
  
+static int get_compat_pages_array(const void __user *chunk_pages[],
+                                 const void __user * __user *pages,
+                                 unsigned long chunk_nr)
+{
+       compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
+       compat_uptr_t p;
+       int i;
+
+       for (i = 0; i < chunk_nr; i++) {
+               if (get_user(p, pages32 + i))
+                       return -EFAULT;
+               chunk_pages[i] = compat_ptr(p);
+       }
+
+       return 0;
+}
+
  /*
   * Determine the nodes of a user array of pages and store it in
   * a user array of status.
@@ -1839,8 +1936,15 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
                 if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
                         chunk_nr = DO_PAGES_STAT_CHUNK_NR;
  
-               if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
-                       break;
+               if (in_compat_syscall()) {
+                       if (get_compat_pages_array(chunk_pages, pages,
+                                                  chunk_nr))
+                               break;
+               } else {
+                       if (copy_from_user(chunk_pages, pages,
+                                     chunk_nr * sizeof(*chunk_pages)))
+                               break;
+               }
  
                 do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
  
@@ -1943,28 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
         return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
  }
  
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
-                      compat_uptr_t __user *, pages32,
-                      const int __user *, nodes,
-                      int __user *, status,
-                      int, flags)
-{
-       const void __user * __user *pages;
-       int i;
-
-       pages = compat_alloc_user_space(nr_pages * sizeof(void *));
-       for (i = 0; i < nr_pages; i++) {
-               compat_uptr_t p;
-
-               if (get_user(p, pages32 + i) ||
-                       put_user(compat_ptr(p), pages + i))
-                       return -EFAULT;
-       }
-       return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-#endif /* CONFIG_COMPAT */
-
  #ifdef CONFIG_NUMA_BALANCING
  /*
   * Returns true if this is a safe migration target node for misplaced NUMA
@@ -2027,6 +2109,7 @@ out:
  static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  {
         int page_lru;
+       int nr_pages = thp_nr_pages(page);
  
         VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
  
@@ -2035,7 +2118,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
                 return 0;
  
         /* Avoid migrating to a node that is nearly full */
-       if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
+       if (!migrate_balanced_pgdat(pgdat, nr_pages))
                 return 0;
  
         if (isolate_lru_page(page))
@@ -2043,7 +2126,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  
         page_lru = page_is_file_lru(page);
         mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
-                               thp_nr_pages(page));
+                           nr_pages);
  
         /*
          * Isolating the page has taken another reference, so the
@@ -2103,7 +2186,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
  
         list_add(&page->lru, &migratepages);
         nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
-                                    MIGRATE_ASYNC, MR_NUMA_MISPLACED);
+                                    MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
         if (nr_remaining) {
                 if (!list_empty(&migratepages)) {
                         list_del(&page->lru);
@@ -2982,3 +3065,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
  }
  EXPORT_SYMBOL(migrate_vma_finalize);
  #endif /* CONFIG_DEVICE_PRIVATE */
+
+#if defined(CONFIG_MEMORY_HOTPLUG)
+/* Disable reclaim-based migration. */
+static void __disable_all_migrate_targets(void)
+{
+       int node;
+
+       for_each_online_node(node)
+               node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+       __disable_all_migrate_targets();
+
+       /*
+        * Ensure that the "disable" is visible across the system.
+        * Readers will see either a combination of before+disable
+        * state or disable+after.  They will never see before and
+        * after state together.
+        *
+        * The before+after state together might have cycles and
+        * could cause readers to do things like loop until this
+        * function finishes.  This ensures they can only see a
+        * single "bad" read and would, for instance, only loop
+        * once.
+        */
+       synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK.  It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+       int migration_target;
+
+       /*
+        * Can not set a migration target on a
+        * node with it already set.
+        *
+        * No need for READ_ONCE() here since this
+        * in the write path for node_demotion[].
+        * This should be the only thread writing.
+        */
+       if (node_demotion[node] != NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       migration_target = find_next_best_node(node, used);
+       if (migration_target == NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       node_demotion[node] = migration_target;
+
+       return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided.  If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[].  However, it can not run simultaneously
+ * with itself.  Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+       nodemask_t next_pass    = NODE_MASK_NONE;
+       nodemask_t this_pass    = NODE_MASK_NONE;
+       nodemask_t used_targets = NODE_MASK_NONE;
+       int node;
+
+       /*
+        * Avoid any oddities like cycles that could occur
+        * from changes in the topology.  This will leave
+        * a momentary gap when migration is disabled.
+        */
+       disable_all_migrate_targets();
+
+       /*
+        * Allocations go close to CPUs, first.  Assume that
+        * the migration path starts at the nodes with CPUs.
+        */
+       next_pass = node_states[N_CPU];
+again:
+       this_pass = next_pass;
+       next_pass = NODE_MASK_NONE;
+       /*
+        * To avoid cycles in the migration "graph", ensure
+        * that migration sources are not future targets by
+        * setting them in 'used_targets'.  Do this only
+        * once per pass so that multiple source nodes can
+        * share a target node.
+        *
+        * 'used_targets' will become unavailable in future
+        * passes.  This limits some opportunities for
+        * multiple source nodes to share a destination.
+        */
+       nodes_or(used_targets, used_targets, this_pass);
+       for_each_node_mask(node, this_pass) {
+               int target_node = establish_migrate_target(node, &used_targets);
+
+               if (target_node == NUMA_NO_NODE)
+                       continue;
+
+               /*
+                * Visit targets from this pass in the next pass.
+                * Eventually, every node will have been part of
+                * a pass, and will become set in 'used_targets'.
+                */
+               node_set(target_node, next_pass);
+       }
+       /*
+        * 'next_pass' contains nodes which became migration
+        * targets in this pass.  Make additional passes until
+        * no more migrations targets are available.
+        */
+       if (!nodes_empty(next_pass))
+               goto again;
+}
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
+{
+       get_online_mems();
+       __set_migration_target_nodes();
+       put_online_mems();
+}
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs.  That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn reclaim-based migration at
+ * any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems().  That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+                                                unsigned long action, void *arg)
+{
+       switch (action) {
+       case MEM_GOING_OFFLINE:
+               /*
+                * Make sure there are not transient states where
+                * an offline node is a migration target.  This
+                * will leave migration disabled until the offline
+                * completes and the MEM_OFFLINE case below runs.
+                */
+               disable_all_migrate_targets();
+               break;
+       case MEM_OFFLINE:
+       case MEM_ONLINE:
+               /*
+                * Recalculate the target nodes once the node
+                * reaches its final state (online or offline).
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_CANCEL_OFFLINE:
+               /*
+                * MEM_GOING_OFFLINE disabled all the migration
+                * targets.  Reenable them.
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_GOING_ONLINE:
+       case MEM_CANCEL_ONLINE:
+               break;
+       }
+
+       return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+       int ret;
+
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+                               migration_online_cpu,
+                               migration_offline_cpu);
+       /*
+        * In the unlikely case that this fails, the automatic
+        * migration targets may become suboptimal for nodes
+        * where N_CPU changes.  With such a small impact in a
+        * rare case, do not bother trying to do anything special.
+        */
+       WARN_ON(ret < 0);
+
+       hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+       return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */