Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block
diff --git a/mm/migrate.c b/mm/migrate.c
index 41ff2c9..a6a7743 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -49,6 +49,7 @@
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
 #include <linux/oom.h>
+#include <linux/memory.h>
 
 #include <asm/tlbflush.h>
 
@@ -210,13 +211,18 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
                 * Recheck VMA as permissions can change since migration started
                 */
                entry = pte_to_swp_entry(*pvmw.pte);
-               if (is_write_migration_entry(entry))
+               if (is_writable_migration_entry(entry))
                        pte = maybe_mkwrite(pte, vma);
                else if (pte_swp_uffd_wp(*pvmw.pte))
                        pte = pte_mkuffd_wp(pte);
 
                if (unlikely(is_device_private_page(new))) {
-                       entry = make_device_private_entry(new, pte_write(pte));
+                       if (pte_write(pte))
+                               entry = make_writable_device_private_entry(
+                                                       page_to_pfn(new));
+                       else
+                               entry = make_readable_device_private_entry(
+                                                       page_to_pfn(new));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(*pvmw.pte))
                                pte = pte_swp_mksoft_dirty(pte);
@@ -226,8 +232,10 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 
 #ifdef CONFIG_HUGETLB_PAGE
                if (PageHuge(new)) {
+                       unsigned int shift = huge_page_shift(hstate_vma(vma));
+
                        pte = pte_mkhuge(pte);
-                       pte = arch_make_huge_pte(pte, vma, new, 0);
+                       pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                        if (PageAnon(new))
                                hugepage_add_anon_rmap(new, vma, pvmw.address);
@@ -294,7 +302,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
        if (!is_migration_entry(entry))
                goto out;
 
-       page = migration_entry_to_page(entry);
+       page = pfn_swap_entry_to_page(entry);
        page = compound_head(page);
 
        /*
@@ -335,7 +343,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
        ptl = pmd_lock(mm, pmd);
        if (!is_pmd_migration_entry(*pmd))
                goto unlock;
-       page = migration_entry_to_page(pmd_to_swp_entry(*pmd));
+       page = pfn_swap_entry_to_page(pmd_to_swp_entry(*pmd));
        if (!get_page_unless_zero(page))
                goto unlock;
        spin_unlock(ptl);
@@ -529,54 +537,6 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
        return MIGRATEPAGE_SUCCESS;
 }
 
-/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page.  We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
-                               int nr_pages)
-{
-       int i;
-       struct page *dst_base = dst;
-       struct page *src_base = src;
-
-       for (i = 0; i < nr_pages; ) {
-               cond_resched();
-               copy_highpage(dst, src);
-
-               i++;
-               dst = mem_map_next(dst, dst_base, i);
-               src = mem_map_next(src, src_base, i);
-       }
-}
-
-static void copy_huge_page(struct page *dst, struct page *src)
-{
-       int i;
-       int nr_pages;
-
-       if (PageHuge(src)) {
-               /* hugetlbfs page */
-               struct hstate *h = page_hstate(src);
-               nr_pages = pages_per_huge_page(h);
-
-               if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
-                       __copy_gigantic_page(dst, src, nr_pages);
-                       return;
-               }
-       } else {
-               /* thp page */
-               BUG_ON(!PageTransHuge(src));
-               nr_pages = thp_nr_pages(src);
-       }
-
-       for (i = 0; i < nr_pages; i++) {
-               cond_resched();
-               copy_highpage(dst + i, src + i);
-       }
-}
-
 /*
  * Copy the page to its new location
  */
@@ -626,7 +586,10 @@ void migrate_page_states(struct page *newpage, struct page *page)
        if (PageSwapCache(page))
                ClearPageSwapCache(page);
        ClearPagePrivate(page);
-       set_page_private(page, 0);
+
+       /* page->private contains hugetlb specific flags */
+       if (!PageHuge(page))
+               set_page_private(page, 0);
 
        /*
         * If any waiters have accumulated on the new page then
@@ -997,7 +960,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                                int force, enum migrate_mode mode)
 {
        int rc = -EAGAIN;
-       int page_was_mapped = 0;
+       bool page_was_mapped = false;
        struct anon_vma *anon_vma = NULL;
        bool is_lru = !__PageMovable(page);
 
@@ -1045,7 +1008,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
        }
 
        /*
-        * By try_to_unmap(), page->mapcount goes down to 0 here. In this case,
+        * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
         * we cannot notice that anon_vma is freed while we migrate a page.
         * This get_anon_vma() delays freeing anon_vma pointer until the end
         * of migration. File cache pages are no problem because of page_lock()
@@ -1099,8 +1062,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                /* Establish migration ptes */
                VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
                                page);
-               try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
-               page_was_mapped = 1;
+               try_to_migrate(page, 0);
+               page_was_mapped = true;
        }
 
        if (!page_mapped(page))
@@ -1137,6 +1100,80 @@ out:
        return rc;
 }
 
+
+/*
+ * node_demotion[] example:
+ *
+ * Consider a system with two sockets.  Each socket has
+ * three classes of memory attached: fast, medium and slow.
+ * Each memory class is placed in its own NUMA node.  The
+ * CPUs are placed in the node with the "fast" memory.  The
+ * 6 NUMA nodes (0-5) might be split among the sockets like
+ * this:
+ *
+ *     Socket A: 0, 1, 2
+ *     Socket B: 3, 4, 5
+ *
+ * When Node 0 fills up, its memory should be migrated to
+ * Node 1.  When Node 1 fills up, it should be migrated to
+ * Node 2.  The migration path starts on the nodes with the
+ * processors (since allocations default to this node) and
+ * fast memory, progresses through medium and ends with the
+ * slow memory:
+ *
+ *     0 -> 1 -> 2 -> stop
+ *     3 -> 4 -> 5 -> stop
+ *
+ * This is represented in the node_demotion[] like this:
+ *
+ *     {  1, // Node 0 migrates to 1
+ *        2, // Node 1 migrates to 2
+ *       -1, // Node 2 does not migrate
+ *        4, // Node 3 migrates to 4
+ *        5, // Node 4 migrates to 5
+ *       -1} // Node 5 does not migrate
+ */
+
+/*
+ * Writes to this array occur without locking.  Cycles are
+ * not allowed: Node X demotes to Y which demotes to X...
+ *
+ * If multiple reads are performed, a single rcu_read_lock()
+ * must be held over all reads to ensure that no cycles are
+ * observed.
+ */
+static int node_demotion[MAX_NUMNODES] __read_mostly =
+       {[0 ...  MAX_NUMNODES - 1] = NUMA_NO_NODE};
+
+/**
+ * next_demotion_node() - Get the next node in the demotion path
+ * @node: The starting node to lookup the next node
+ *
+ * Return: node id for next memory node in the demotion path hierarchy
+ * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
+ * @node online or guarantee that it *continues* to be the next demotion
+ * target.
+ */
+int next_demotion_node(int node)
+{
+       int target;
+
+       /*
+        * node_demotion[] is updated without excluding this
+        * function from running.  RCU doesn't provide any
+        * compiler barriers, so the READ_ONCE() is required
+        * to avoid compiler reordering or read merging.
+        *
+        * Make sure to use RCU over entire code blocks if
+        * node_demotion[] reads need to be consistent.
+        */
+       rcu_read_lock();
+       target = READ_ONCE(node_demotion[node]);
+       rcu_read_unlock();
+
+       return target;
+}
+
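A minimal sketch (not part of this patch; the helper name is hypothetical) of walking the chain that next_demotion_node() publishes.  For the example topology documented above it would report 0 -> 1 -> 2 and then stop.  Each call takes its own rcu_read_lock(), so a chain observed this way is not guaranteed to be a consistent snapshot (see the locking comment above node_demotion[]).

	#include <linux/migrate.h>
	#include <linux/printk.h>

	/*
	 * Hypothetical debugging helper; assumes next_demotion_node() is
	 * visible via <linux/migrate.h>, as it is to the reclaim code.
	 */
	static void print_demotion_path(int node)
	{
		int target;

		while ((target = next_demotion_node(node)) != NUMA_NO_NODE) {
			pr_info("node %d demotes to node %d\n", node, target);
			node = target;
		}
		pr_info("node %d does not demote\n", node);
	}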
 /*
  * Obtain the lock on page, remove all ptes and migrate the page
  * to the newly allocated page in newpage.
@@ -1288,7 +1325,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
         * page_mapping() set, hugetlbfs specific move page routine will not
         * be called and we could leak usage counts for subpools.
         */
-       if (page_private(hpage) && !page_mapping(hpage)) {
+       if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
                rc = -EBUSY;
                goto out_unlock;
        }
@@ -1301,7 +1338,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
 
        if (page_mapped(hpage)) {
                bool mapping_locked = false;
-               enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+               enum ttu_flags ttu = 0;
 
                if (!PageAnon(hpage)) {
                        /*
@@ -1318,7 +1355,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                        ttu |= TTU_RMAP_LOCKED;
                }
 
-               try_to_unmap(hpage, ttu);
+               try_to_migrate(hpage, ttu);
                page_was_mapped = 1;
 
                if (mapping_locked)
@@ -1392,6 +1429,8 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  * @mode:              The migration mode that specifies the constraints for
  *                     page migration, if any.
  * @reason:            The reason for page migration.
+ * @ret_succeeded:     Set to the number of pages migrated successfully if
+ *                     the caller passes a non-NULL pointer.
  *
  * The function returns after 10 attempts or if no pages are movable any more
  * because the list has become empty or no retryable pages exist any more.
@@ -1402,7 +1441,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
  */
 int migrate_pages(struct list_head *from, new_page_t get_new_page,
                free_page_t put_new_page, unsigned long private,
-               enum migrate_mode mode, int reason)
+               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
 {
        int retry = 1;
        int thp_retry = 1;
@@ -1418,6 +1457,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
        int swapwrite = current->flags & PF_SWAPWRITE;
        int rc, nr_subpages;
        LIST_HEAD(ret_pages);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
 
        trace_mm_migrate_pages_start(mode, reason);
 
@@ -1489,8 +1529,9 @@ retry:
                                /*
                                 * When memory is low, don't bother to try to migrate
                                 * other pages, just exit.
+                                * THP NUMA faulting doesn't split THP to retry.
                                 */
-                               if (is_thp) {
+                               if (is_thp && !nosplit) {
                                        if (!try_split_thp(page, &page2, from)) {
                                                nr_thp_split++;
                                                goto retry;
@@ -1555,6 +1596,9 @@ out:
        if (!swapwrite)
                current->flags &= ~PF_SWAPWRITE;
 
+       if (ret_succeeded)
+               *ret_succeeded = nr_succeeded;
+
        return rc;
 }
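A hedged sketch of how a caller might consume the new @ret_succeeded out-parameter.  The function and its use of MR_MEMORY_HOTPLUG are illustrative, not taken from this patch; it mirrors the existing pattern of putting back whatever could not be migrated:

	static unsigned long demo_migrate_list(struct list_head *pages,
					       new_page_t get_new_page,
					       unsigned long private)
	{
		unsigned int nr_succeeded = 0;

		/*
		 * migrate_pages() returns the number of pages it could not
		 * migrate (or -errno); pages left on the list are put back.
		 */
		if (migrate_pages(pages, get_new_page, NULL, private,
				  MIGRATE_SYNC, MR_MEMORY_HOTPLUG,
				  &nr_succeeded))
			putback_movable_pages(pages);

		return nr_succeeded;
	}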
 
@@ -1624,7 +1668,7 @@ static int do_move_pages_to_node(struct mm_struct *mm,
        };
 
        err = migrate_pages(pagelist, alloc_migration_target, NULL,
-                       (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
+               (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL, NULL);
        if (err)
                putback_movable_pages(pagelist);
        return err;
@@ -1834,8 +1878,8 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                struct page *page;
                int err = -EFAULT;
 
-               vma = find_vma(mm, addr);
-               if (!vma || addr < vma->vm_start)
+               vma = vma_lookup(mm, addr);
+               if (!vma)
                        goto set_status;
 
                /* FOLL_DUMP to ignore special (like zero) pages */
@@ -1856,6 +1900,23 @@ set_status:
        mmap_read_unlock(mm);
 }
 
+static int get_compat_pages_array(const void __user *chunk_pages[],
+                                 const void __user * __user *pages,
+                                 unsigned long chunk_nr)
+{
+       compat_uptr_t __user *pages32 = (compat_uptr_t __user *)pages;
+       compat_uptr_t p;
+       int i;
+
+       for (i = 0; i < chunk_nr; i++) {
+               if (get_user(p, pages32 + i))
+                       return -EFAULT;
+               chunk_pages[i] = compat_ptr(p);
+       }
+
+       return 0;
+}
+
 /*
  * Determine the nodes of a user array of pages and store it in
  * a user array of status.
@@ -1875,8 +1936,15 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
                if (chunk_nr > DO_PAGES_STAT_CHUNK_NR)
                        chunk_nr = DO_PAGES_STAT_CHUNK_NR;
 
-               if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages)))
-                       break;
+               if (in_compat_syscall()) {
+                       if (get_compat_pages_array(chunk_pages, pages,
+                                                  chunk_nr))
+                               break;
+               } else {
+                       if (copy_from_user(chunk_pages, pages,
+                                     chunk_nr * sizeof(*chunk_pages)))
+                               break;
+               }
 
                do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status);
 
@@ -1979,28 +2047,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
        return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
 }
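The removed COMPAT_SYSCALL_DEFINE6() below is what the in_compat_syscall()/get_compat_pages_array() handling above replaces: a 32-bit caller's pages[] holds 32-bit pointers, which are now decoded directly instead of being copied into a compat_alloc_user_space() buffer.  A hedged userspace sketch of the query path that ends up in do_pages_stat() (illustrative only; move_pages() is libnuma's wrapper from <numaif.h>, link with -lnuma):

	#include <numaif.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(void)
	{
		long psize = sysconf(_SC_PAGESIZE);
		void *pages[1];
		int status[1];

		pages[0] = aligned_alloc(psize, psize);
		if (!pages[0])
			return 1;
		((char *)pages[0])[0] = 1;	/* fault the page in */

		/* nodes == NULL: do not migrate, just report placement */
		if (move_pages(0 /* self */, 1, pages, NULL, status, 0))
			perror("move_pages");
		else
			printf("page %p is on node %d\n", pages[0], status[0]);
		return 0;
	}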
 
-#ifdef CONFIG_COMPAT
-COMPAT_SYSCALL_DEFINE6(move_pages, pid_t, pid, compat_ulong_t, nr_pages,
-                      compat_uptr_t __user *, pages32,
-                      const int __user *, nodes,
-                      int __user *, status,
-                      int, flags)
-{
-       const void __user * __user *pages;
-       int i;
-
-       pages = compat_alloc_user_space(nr_pages * sizeof(void *));
-       for (i = 0; i < nr_pages; i++) {
-               compat_uptr_t p;
-
-               if (get_user(p, pages32 + i) ||
-                       put_user(compat_ptr(p), pages + i))
-                       return -EFAULT;
-       }
-       return kernel_move_pages(pid, nr_pages, pages, nodes, status, flags);
-}
-#endif /* CONFIG_COMPAT */
-
 #ifdef CONFIG_NUMA_BALANCING
 /*
  * Returns true if this is a safe migration target node for misplaced NUMA
@@ -2043,34 +2089,44 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
        return newpage;
 }
 
+static struct page *alloc_misplaced_dst_page_thp(struct page *page,
+                                                unsigned long data)
+{
+       int nid = (int) data;
+       struct page *newpage;
+
+       newpage = alloc_pages_node(nid, (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
+                                  HPAGE_PMD_ORDER);
+       if (!newpage)
+               goto out;
+
+       prep_transhuge_page(newpage);
+
+out:
+       return newpage;
+}
+
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
        int page_lru;
+       int nr_pages = thp_nr_pages(page);
 
        VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
 
-       /* Avoid migrating to a node that is nearly full */
-       if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
+       /* Do not migrate THP mapped by multiple processes */
+       if (PageTransHuge(page) && total_mapcount(page) > 1)
                return 0;
 
-       if (isolate_lru_page(page))
+       /* Avoid migrating to a node that is nearly full */
+       if (!migrate_balanced_pgdat(pgdat, nr_pages))
                return 0;
 
-       /*
-        * migrate_misplaced_transhuge_page() skips page migration's usual
-        * check on page_count(), so we must do it here, now that the page
-        * has been isolated: a GUP pin, or any other pin, prevents migration.
-        * The expected page count is 3: 1 for page's mapcount and 1 for the
-        * caller's pin and 1 for the reference taken by isolate_lru_page().
-        */
-       if (PageTransHuge(page) && page_count(page) != 3) {
-               putback_lru_page(page);
+       if (isolate_lru_page(page))
                return 0;
-       }
 
        page_lru = page_is_file_lru(page);
        mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
-                               thp_nr_pages(page));
+                           nr_pages);
 
        /*
         * Isolating the page has taken another reference, so the
@@ -2081,12 +2137,6 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
        return 1;
 }
 
-bool pmd_trans_migrating(pmd_t pmd)
-{
-       struct page *page = pmd_page(pmd);
-       return PageLocked(page);
-}
-
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on
@@ -2099,6 +2149,21 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
        int isolated;
        int nr_remaining;
        LIST_HEAD(migratepages);
+       new_page_t *new;
+       bool compound;
+       int nr_pages = thp_nr_pages(page);
+
+       /*
+        * A PTE-mapped THP or HugeTLB page can't reach here, so the page
+        * is either a base page or a THP, and it must be the head page
+        * if it is a THP.
+        */
+       compound = PageTransHuge(page);
+
+       if (compound)
+               new = alloc_misplaced_dst_page_thp;
+       else
+               new = alloc_misplaced_dst_page;
 
        /*
         * Don't migrate file pages that are mapped in multiple processes
@@ -2120,19 +2185,18 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
                goto out;
 
        list_add(&page->lru, &migratepages);
-       nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
-                                    NULL, node, MIGRATE_ASYNC,
-                                    MR_NUMA_MISPLACED);
+       nr_remaining = migrate_pages(&migratepages, *new, NULL, node,
+                                    MIGRATE_ASYNC, MR_NUMA_MISPLACED, NULL);
        if (nr_remaining) {
                if (!list_empty(&migratepages)) {
                        list_del(&page->lru);
-                       dec_node_page_state(page, NR_ISOLATED_ANON +
-                                       page_is_file_lru(page));
+                       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
+                                       page_is_file_lru(page), -nr_pages);
                        putback_lru_page(page);
                }
                isolated = 0;
        } else
-               count_vm_numa_event(NUMA_PAGE_MIGRATE);
+               count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_pages);
        BUG_ON(!list_empty(&migratepages));
        return isolated;
 
@@ -2141,141 +2205,6 @@ out:
        return 0;
 }
 #endif /* CONFIG_NUMA_BALANCING */
-
-#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-/*
- * Migrates a THP to a given target node. page must be locked and is unlocked
- * before returning.
- */
-int migrate_misplaced_transhuge_page(struct mm_struct *mm,
-                               struct vm_area_struct *vma,
-                               pmd_t *pmd, pmd_t entry,
-                               unsigned long address,
-                               struct page *page, int node)
-{
-       spinlock_t *ptl;
-       pg_data_t *pgdat = NODE_DATA(node);
-       int isolated = 0;
-       struct page *new_page = NULL;
-       int page_lru = page_is_file_lru(page);
-       unsigned long start = address & HPAGE_PMD_MASK;
-
-       new_page = alloc_pages_node(node,
-               (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
-               HPAGE_PMD_ORDER);
-       if (!new_page)
-               goto out_fail;
-       prep_transhuge_page(new_page);
-
-       isolated = numamigrate_isolate_page(pgdat, page);
-       if (!isolated) {
-               put_page(new_page);
-               goto out_fail;
-       }
-
-       /* Prepare a page as a migration target */
-       __SetPageLocked(new_page);
-       if (PageSwapBacked(page))
-               __SetPageSwapBacked(new_page);
-
-       /* anon mapping, we can simply copy page->mapping to the new page: */
-       new_page->mapping = page->mapping;
-       new_page->index = page->index;
-       /* flush the cache before copying using the kernel virtual address */
-       flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
-       migrate_page_copy(new_page, page);
-       WARN_ON(PageLRU(new_page));
-
-       /* Recheck the target PMD */
-       ptl = pmd_lock(mm, pmd);
-       if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
-               spin_unlock(ptl);
-
-               /* Reverse changes made by migrate_page_copy() */
-               if (TestClearPageActive(new_page))
-                       SetPageActive(page);
-               if (TestClearPageUnevictable(new_page))
-                       SetPageUnevictable(page);
-
-               unlock_page(new_page);
-               put_page(new_page);             /* Free it */
-
-               /* Retake the callers reference and putback on LRU */
-               get_page(page);
-               putback_lru_page(page);
-               mod_node_page_state(page_pgdat(page),
-                        NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
-
-               goto out_unlock;
-       }
-
-       entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-       entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-
-       /*
-        * Overwrite the old entry under pagetable lock and establish
-        * the new PTE. Any parallel GUP will either observe the old
-        * page blocking on the page lock, block on the page table
-        * lock or observe the new page. The SetPageUptodate on the
-        * new page and page_add_new_anon_rmap guarantee the copy is
-        * visible before the pagetable update.
-        */
-       page_add_anon_rmap(new_page, vma, start, true);
-       /*
-        * At this point the pmd is numa/protnone (i.e. non present) and the TLB
-        * has already been flushed globally.  So no TLB can be currently
-        * caching this non present pmd mapping.  There's no need to clear the
-        * pmd before doing set_pmd_at(), nor to flush the TLB after
-        * set_pmd_at().  Clearing the pmd here would introduce a race
-        * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
-        * mmap_lock for reading.  If the pmd is set to NULL at any given time,
-        * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
-        * pmd.
-        */
-       set_pmd_at(mm, start, pmd, entry);
-       update_mmu_cache_pmd(vma, address, &entry);
-
-       page_ref_unfreeze(page, 2);
-       mlock_migrate_page(new_page, page);
-       page_remove_rmap(page, true);
-       set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
-
-       spin_unlock(ptl);
-
-       /* Take an "isolate" reference and put new page on the LRU. */
-       get_page(new_page);
-       putback_lru_page(new_page);
-
-       unlock_page(new_page);
-       unlock_page(page);
-       put_page(page);                 /* Drop the rmap reference */
-       put_page(page);                 /* Drop the LRU isolation reference */
-
-       count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
-       count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);
-
-       mod_node_page_state(page_pgdat(page),
-                       NR_ISOLATED_ANON + page_lru,
-                       -HPAGE_PMD_NR);
-       return isolated;
-
-out_fail:
-       count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-       ptl = pmd_lock(mm, pmd);
-       if (pmd_same(*pmd, entry)) {
-               entry = pmd_modify(entry, vma->vm_page_prot);
-               set_pmd_at(mm, start, pmd, entry);
-               update_mmu_cache_pmd(vma, address, &entry);
-       }
-       spin_unlock(ptl);
-
-out_unlock:
-       unlock_page(page);
-       put_page(page);
-       return 0;
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_DEVICE_PRIVATE
@@ -2400,7 +2329,7 @@ again:
                        if (!is_device_private_entry(entry))
                                goto next;
 
-                       page = device_private_entry_to_page(entry);
+                       page = pfn_swap_entry_to_page(entry);
                        if (!(migrate->flags &
                                MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
                            page->pgmap->owner != migrate->pgmap_owner)
@@ -2408,7 +2337,7 @@ again:
 
                        mpfn = migrate_pfn(page_to_pfn(page)) |
                                        MIGRATE_PFN_MIGRATE;
-                       if (is_write_device_private_entry(entry))
+                       if (is_writable_device_private_entry(entry))
                                mpfn |= MIGRATE_PFN_WRITE;
                } else {
                        if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
@@ -2454,8 +2383,12 @@ again:
                        ptep_get_and_clear(mm, addr, ptep);
 
                        /* Setup special migration page table entry */
-                       entry = make_migration_entry(page, mpfn &
-                                                    MIGRATE_PFN_WRITE);
+                       if (mpfn & MIGRATE_PFN_WRITE)
+                               entry = make_writable_migration_entry(
+                                                       page_to_pfn(page));
+                       else
+                               entry = make_readable_migration_entry(
+                                                       page_to_pfn(page));
                        swp_pte = swp_entry_to_pte(entry);
                        if (pte_present(pte)) {
                                if (pte_soft_dirty(pte))
@@ -2518,8 +2451,8 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
         * that the registered device driver can skip invalidating device
         * private page mappings that won't be migrated.
         */
-       mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
-               migrate->vma->vm_mm, migrate->start, migrate->end,
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_MIGRATE, 0,
+               migrate->vma, migrate->vma->vm_mm, migrate->start, migrate->end,
                migrate->pgmap_owner);
        mmu_notifier_invalidate_range_start(&range);
 
@@ -2704,7 +2637,6 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
  */
 static void migrate_vma_unmap(struct migrate_vma *migrate)
 {
-       int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
        const unsigned long npages = migrate->npages;
        const unsigned long start = migrate->start;
        unsigned long addr, i, restore = 0;
@@ -2716,7 +2648,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
                        continue;
 
                if (page_mapped(page)) {
-                       try_to_unmap(page, flags);
+                       try_to_migrate(page, 0);
                        if (page_mapped(page))
                                goto restore;
                }
@@ -2928,7 +2860,12 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
                if (is_device_private_page(page)) {
                        swp_entry_t swp_entry;
 
-                       swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
+                       if (vma->vm_flags & VM_WRITE)
+                               swp_entry = make_writable_device_private_entry(
+                                                       page_to_pfn(page));
+                       else
+                               swp_entry = make_readable_device_private_entry(
+                                                       page_to_pfn(page));
                        entry = swp_entry_to_pte(swp_entry);
                } else {
                        /*
@@ -3025,9 +2962,9 @@ void migrate_vma_pages(struct migrate_vma *migrate)
                        if (!notified) {
                                notified = true;
 
-                               mmu_notifier_range_init_migrate(&range, 0,
-                                       migrate->vma, migrate->vma->vm_mm,
-                                       addr, migrate->end,
+                               mmu_notifier_range_init_owner(&range,
+                                       MMU_NOTIFY_MIGRATE, 0, migrate->vma,
+                                       migrate->vma->vm_mm, addr, migrate->end,
                                        migrate->pgmap_owner);
                                mmu_notifier_invalidate_range_start(&range);
                        }
@@ -3128,3 +3065,232 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
 }
 EXPORT_SYMBOL(migrate_vma_finalize);
 #endif /* CONFIG_DEVICE_PRIVATE */
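For context, a hedged driver-side sketch of the migrate_vma_setup()/migrate_vma_pages()/migrate_vma_finalize() sequence that the CONFIG_DEVICE_PRIVATE code above serves.  The function is illustrative only; device-page allocation and copying are elided, and real users such as lib/test_hmm.c handle many more cases:

	static int demo_migrate_range(struct vm_area_struct *vma,
				      unsigned long start, unsigned long end,
				      void *pgmap_owner)
	{
		unsigned long npages = (end - start) >> PAGE_SHIFT;
		struct migrate_vma args = {
			.vma		= vma,
			.start		= start,
			.end		= end,
			.pgmap_owner	= pgmap_owner,
			.flags		= MIGRATE_VMA_SELECT_SYSTEM,
		};
		int ret = -ENOMEM;

		args.src = kcalloc(npages, sizeof(*args.src), GFP_KERNEL);
		args.dst = kcalloc(npages, sizeof(*args.dst), GFP_KERNEL);
		if (!args.src || !args.dst)
			goto out;

		ret = migrate_vma_setup(&args);	/* collect and unmap sources */
		if (ret)
			goto out;

		/*
		 * For each args.src[i] with MIGRATE_PFN_MIGRATE set, a real
		 * driver would allocate a destination page, copy the data and
		 * store migrate_pfn() of the new page in args.dst[i].
		 */

		migrate_vma_pages(&args);	/* install destination pages */
		migrate_vma_finalize(&args);	/* restore CPU mappings, clean up */
		ret = 0;
	out:
		kfree(args.src);
		kfree(args.dst);
		return ret;
	}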
+
+#if defined(CONFIG_MEMORY_HOTPLUG)
+/* Disable reclaim-based migration. */
+static void __disable_all_migrate_targets(void)
+{
+       int node;
+
+       for_each_online_node(node)
+               node_demotion[node] = NUMA_NO_NODE;
+}
+
+static void disable_all_migrate_targets(void)
+{
+       __disable_all_migrate_targets();
+
+       /*
+        * Ensure that the "disable" is visible across the system.
+        * Readers will see either a combination of before+disable
+        * state or disable+after.  They will never see before and
+        * after state together.
+        *
+        * The before+after state together might have cycles and
+        * could cause readers to do things like loop until this
+        * function finishes.  This ensures they can only see a
+        * single "bad" read and would, for instance, only loop
+        * once.
+        */
+       synchronize_rcu();
+}
+
+/*
+ * Find an automatic demotion target for 'node'.
+ * Failing here is OK.  It might just indicate
+ * being at the end of a chain.
+ */
+static int establish_migrate_target(int node, nodemask_t *used)
+{
+       int migration_target;
+
+       /*
+        * Can not set a migration target on a
+        * node with it already set.
+        *
+        * No need for READ_ONCE() here since this is
+        * in the write path for node_demotion[].
+        * This should be the only thread writing.
+        */
+       if (node_demotion[node] != NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       migration_target = find_next_best_node(node, used);
+       if (migration_target == NUMA_NO_NODE)
+               return NUMA_NO_NODE;
+
+       node_demotion[node] = migration_target;
+
+       return migration_target;
+}
+
+/*
+ * When memory fills up on a node, memory contents can be
+ * automatically migrated to another node instead of being
+ * discarded at reclaim.
+ *
+ * Establish a "migration path" which will start at nodes
+ * with CPUs and will follow the priorities used to build the
+ * page allocator zonelists.
+ *
+ * The difference here is that cycles must be avoided.  If
+ * node0 migrates to node1, then neither node1, nor anything
+ * node1 migrates to can migrate to node0.
+ *
+ * This function can run simultaneously with readers of
+ * node_demotion[].  However, it can not run simultaneously
+ * with itself.  Exclusion is provided by memory hotplug events
+ * being single-threaded.
+ */
+static void __set_migration_target_nodes(void)
+{
+       nodemask_t next_pass    = NODE_MASK_NONE;
+       nodemask_t this_pass    = NODE_MASK_NONE;
+       nodemask_t used_targets = NODE_MASK_NONE;
+       int node;
+
+       /*
+        * Avoid any oddities like cycles that could occur
+        * from changes in the topology.  This will leave
+        * a momentary gap when migration is disabled.
+        */
+       disable_all_migrate_targets();
+
+       /*
+        * Allocations go close to CPUs, first.  Assume that
+        * the migration path starts at the nodes with CPUs.
+        */
+       next_pass = node_states[N_CPU];
+again:
+       this_pass = next_pass;
+       next_pass = NODE_MASK_NONE;
+       /*
+        * To avoid cycles in the migration "graph", ensure
+        * that migration sources are not future targets by
+        * setting them in 'used_targets'.  Do this only
+        * once per pass so that multiple source nodes can
+        * share a target node.
+        *
+        * 'used_targets' will become unavailable in future
+        * passes.  This limits some opportunities for
+        * multiple source nodes to share a destination.
+        */
+       nodes_or(used_targets, used_targets, this_pass);
+       for_each_node_mask(node, this_pass) {
+               int target_node = establish_migrate_target(node, &used_targets);
+
+               if (target_node == NUMA_NO_NODE)
+                       continue;
+
+               /*
+                * Visit targets from this pass in the next pass.
+                * Eventually, every node will have been part of
+                * a pass, and will become set in 'used_targets'.
+                */
+               node_set(target_node, next_pass);
+       }
+       /*
+        * 'next_pass' contains nodes which became migration
+        * targets in this pass.  Make additional passes until
+        * no more migration targets are available.
+        */
+       if (!nodes_empty(next_pass))
+               goto again;
+}
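For the two-socket example documented earlier (assuming CPUs sit on nodes 0 and 3, and that find_next_best_node() prefers the next-faster node as that example implies), the passes above would proceed roughly as:

	pass 1: this_pass = {0, 3} -> node_demotion[0] = 1, node_demotion[3] = 4
	pass 2: this_pass = {1, 4} -> node_demotion[1] = 2, node_demotion[4] = 5
	pass 3: this_pass = {2, 5} -> no unused targets remain, the loop ends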
+
+/*
+ * For callers that do not hold get_online_mems() already.
+ */
+static void set_migration_target_nodes(void)
+{
+       get_online_mems();
+       __set_migration_target_nodes();
+       put_online_mems();
+}
+
+/*
+ * React to hotplug events that might affect the migration targets
+ * like events that online or offline NUMA nodes.
+ *
+ * The ordering is also currently dependent on which nodes have
+ * CPUs.  That means we need CPU on/offline notification too.
+ */
+static int migration_online_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+static int migration_offline_cpu(unsigned int cpu)
+{
+       set_migration_target_nodes();
+       return 0;
+}
+
+/*
+ * This leaves migrate-on-reclaim transiently disabled between
+ * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
+ * whether reclaim-based migration is enabled or not, which
+ * ensures that the user can turn reclaim-based migration on or
+ * off at any time without needing to recalculate migration targets.
+ *
+ * These callbacks already hold get_online_mems().  That is why
+ * __set_migration_target_nodes() can be used as opposed to
+ * set_migration_target_nodes().
+ */
+static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
+                                                unsigned long action, void *arg)
+{
+       switch (action) {
+       case MEM_GOING_OFFLINE:
+               /*
+                * Make sure there are no transient states where
+                * an offline node is a migration target.  This
+                * will leave migration disabled until the offline
+                * completes and the MEM_OFFLINE case below runs.
+                */
+               disable_all_migrate_targets();
+               break;
+       case MEM_OFFLINE:
+       case MEM_ONLINE:
+               /*
+                * Recalculate the target nodes once the node
+                * reaches its final state (online or offline).
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_CANCEL_OFFLINE:
+               /*
+                * MEM_GOING_OFFLINE disabled all the migration
+                * targets.  Reenable them.
+                */
+               __set_migration_target_nodes();
+               break;
+       case MEM_GOING_ONLINE:
+       case MEM_CANCEL_ONLINE:
+               break;
+       }
+
+       return notifier_from_errno(0);
+}
+
+static int __init migrate_on_reclaim_init(void)
+{
+       int ret;
+
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "migrate on reclaim",
+                               migration_online_cpu,
+                               migration_offline_cpu);
+       /*
+        * In the unlikely case that this fails, the automatic
+        * migration targets may become suboptimal for nodes
+        * where N_CPU changes.  With such a small impact in a
+        * rare case, do not bother trying to do anything special.
+        */
+       WARN_ON(ret < 0);
+
+       hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
+       return 0;
+}
+late_initcall(migrate_on_reclaim_init);
+#endif /* CONFIG_MEMORY_HOTPLUG */