Linux 6.9-rc1

[linux-2.6-microblaze.git] / mm / migrate.c
diff --git a/mm/migrate.c b/mm/migrate.c

index 6a1597c..73a052a 100644 (file)
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -21,7 +21,6 @@
  #include <linux/buffer_head.h>
  #include <linux/mm_inline.h>
  #include <linux/nsproxy.h>
-#include <linux/pagevec.h>
  #include <linux/ksm.h>
  #include <linux/rmap.h>
  #include <linux/topology.h>
@@ -50,6 +49,7 @@
  #include <linux/memory.h>
  #include <linux/random.h>
  #include <linux/sched/sysctl.h>
+#include <linux/memory-tiers.h>
  
  #include <asm/tlbflush.h>
  
@@ -57,8 +57,9 @@
  
  #include "internal.h"
  
-int isolate_movable_page(struct page *page, isolate_mode_t mode)
+bool isolate_movable_page(struct page *page, isolate_mode_t mode)
  {
+       struct folio *folio = folio_get_nontail_page(page);
         const struct movable_operations *mops;
  
         /*
@@ -70,16 +71,25 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
          * the put_page() at the end of this block will take care of
          * release this page, thus avoiding a nasty leakage.
          */
-       if (unlikely(!get_page_unless_zero(page)))
+       if (!folio)
                 goto out;
  
+       if (unlikely(folio_test_slab(folio)))
+               goto out_putfolio;
+       /* Pairs with smp_wmb() in slab freeing, e.g. SLUB's __free_slab() */
+       smp_rmb();
         /*
-        * Check PageMovable before holding a PG_lock because page's owner
-        * assumes anybody doesn't touch PG_lock of newly allocated page
-        * so unconditionally grabbing the lock ruins page's owner side.
+        * Check movable flag before taking the page lock because
+        * we use non-atomic bitops on newly allocated page flags so
+        * unconditionally grabbing the lock ruins page's owner side.
          */
-       if (unlikely(!__PageMovable(page)))
-               goto out_putpage;
+       if (unlikely(!__folio_test_movable(folio)))
+               goto out_putfolio;
+       /* Pairs with smp_wmb() in slab allocation, e.g. SLUB's alloc_slab_page() */
+       smp_rmb();
+       if (unlikely(folio_test_slab(folio)))
+               goto out_putfolio;
+
         /*
          * As movable pages are not isolated from LRU lists, concurrent
          * compaction threads can race against page migration functions
@@ -91,39 +101,39 @@ int isolate_movable_page(struct page *page, isolate_mode_t mode)
          * lets be sure we have the page lock
          * before proceeding with the movable page isolation steps.
          */
-       if (unlikely(!trylock_page(page)))
-               goto out_putpage;
+       if (unlikely(!folio_trylock(folio)))
+               goto out_putfolio;
  
-       if (!PageMovable(page) || PageIsolated(page))
+       if (!folio_test_movable(folio) || folio_test_isolated(folio))
                 goto out_no_isolated;
  
-       mops = page_movable_ops(page);
-       VM_BUG_ON_PAGE(!mops, page);
+       mops = folio_movable_ops(folio);
+       VM_BUG_ON_FOLIO(!mops, folio);
  
-       if (!mops->isolate_page(page, mode))
+       if (!mops->isolate_page(&folio->page, mode))
                 goto out_no_isolated;
  
         /* Driver shouldn't use PG_isolated bit of page->flags */
-       WARN_ON_ONCE(PageIsolated(page));
-       SetPageIsolated(page);
-       unlock_page(page);
+       WARN_ON_ONCE(folio_test_isolated(folio));
+       folio_set_isolated(folio);
+       folio_unlock(folio);
  
-       return 0;
+       return true;
  
  out_no_isolated:
-       unlock_page(page);
-out_putpage:
-       put_page(page);
+       folio_unlock(folio);
+out_putfolio:
+       folio_put(folio);
  out:
-       return -EBUSY;
+       return false;
  }
  
-static void putback_movable_page(struct page *page)
+static void putback_movable_folio(struct folio *folio)
  {
-       const struct movable_operations *mops = page_movable_ops(page);
+       const struct movable_operations *mops = folio_movable_ops(folio);
  
-       mops->putback_page(page);
-       ClearPageIsolated(page);
+       mops->putback_page(&folio->page);
+       folio_clear_isolated(folio);
  }
  
  /*
@@ -136,33 +146,33 @@ static void putback_movable_page(struct page *page)
   */
  void putback_movable_pages(struct list_head *l)
  {
-       struct page *page;
-       struct page *page2;
+       struct folio *folio;
+       struct folio *folio2;
  
-       list_for_each_entry_safe(page, page2, l, lru) {
-               if (unlikely(PageHuge(page))) {
-                       putback_active_hugepage(page);
+       list_for_each_entry_safe(folio, folio2, l, lru) {
+               if (unlikely(folio_test_hugetlb(folio))) {
+                       folio_putback_active_hugetlb(folio);
                         continue;
                 }
-               list_del(&page->lru);
+               list_del(&folio->lru);
                 /*
-                * We isolated non-lru movable page so here we can use
-                * __PageMovable because LRU page's mapping cannot have
-                * PAGE_MAPPING_MOVABLE.
+                * We isolated non-lru movable folio so here we can use
+                * __folio_test_movable because LRU folio's mapping cannot
+                * have PAGE_MAPPING_MOVABLE.
                  */
-               if (unlikely(__PageMovable(page))) {
-                       VM_BUG_ON_PAGE(!PageIsolated(page), page);
-                       lock_page(page);
-                       if (PageMovable(page))
-                               putback_movable_page(page);
+               if (unlikely(__folio_test_movable(folio))) {
+                       VM_BUG_ON_FOLIO(!folio_test_isolated(folio), folio);
+                       folio_lock(folio);
+                       if (folio_test_movable(folio))
+                               putback_movable_folio(folio);
                         else
-                               ClearPageIsolated(page);
-                       unlock_page(page);
-                       put_page(page);
+                               folio_clear_isolated(folio);
+                       folio_unlock(folio);
+                       folio_put(folio);
                 } else {
-                       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-                                       page_is_file_lru(page), -thp_nr_pages(page));
-                       putback_lru_page(page);
+                       node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+                                       folio_is_file_lru(folio), -folio_nr_pages(folio));
+                       folio_putback_lru(folio);
                 }
         }
  }
@@ -177,6 +187,7 @@ static bool remove_migration_pte(struct folio *folio,
  
         while (page_vma_mapped_walk(&pvmw)) {
                 rmap_t rmap_flags = RMAP_NONE;
+               pte_t old_pte;
                 pte_t pte;
                 swp_entry_t entry;
                 struct page *new;
@@ -198,17 +209,22 @@ static bool remove_migration_pte(struct folio *folio,
  #endif
  
                 folio_get(folio);
-               pte = pte_mkold(mk_pte(new, READ_ONCE(vma->vm_page_prot)));
-               if (pte_swp_soft_dirty(*pvmw.pte))
+               pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
+               old_pte = ptep_get(pvmw.pte);
+
+               entry = pte_to_swp_entry(old_pte);
+               if (!is_migration_entry_young(entry))
+                       pte = pte_mkold(pte);
+               if (folio_test_dirty(folio) && is_migration_entry_dirty(entry))
+                       pte = pte_mkdirty(pte);
+               if (pte_swp_soft_dirty(old_pte))
                         pte = pte_mksoft_dirty(pte);
+               else
+                       pte = pte_clear_soft_dirty(pte);
  
-               /*
-                * Recheck VMA as permissions can change since migration started
-                */
-               entry = pte_to_swp_entry(*pvmw.pte);
                 if (is_writable_migration_entry(entry))
-                       pte = maybe_mkwrite(pte, vma);
-               else if (pte_swp_uffd_wp(*pvmw.pte))
+                       pte = pte_mkwrite(pte, vma);
+               else if (pte_swp_uffd_wp(old_pte))
                         pte = pte_mkuffd_wp(pte);
  
                 if (folio_test_anon(folio) && !is_readable_migration_entry(entry))
@@ -222,36 +238,38 @@ static bool remove_migration_pte(struct folio *folio,
                                 entry = make_readable_device_private_entry(
                                                         page_to_pfn(new));
                         pte = swp_entry_to_pte(entry);
-                       if (pte_swp_soft_dirty(*pvmw.pte))
+                       if (pte_swp_soft_dirty(old_pte))
                                 pte = pte_swp_mksoft_dirty(pte);
-                       if (pte_swp_uffd_wp(*pvmw.pte))
+                       if (pte_swp_uffd_wp(old_pte))
                                 pte = pte_swp_mkuffd_wp(pte);
                 }
  
  #ifdef CONFIG_HUGETLB_PAGE
                 if (folio_test_hugetlb(folio)) {
-                       unsigned int shift = huge_page_shift(hstate_vma(vma));
+                       struct hstate *h = hstate_vma(vma);
+                       unsigned int shift = huge_page_shift(h);
+                       unsigned long psize = huge_page_size(h);
  
-                       pte = pte_mkhuge(pte);
                         pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                         if (folio_test_anon(folio))
-                               hugepage_add_anon_rmap(new, vma, pvmw.address,
-                                                      rmap_flags);
+                               hugetlb_add_anon_rmap(folio, vma, pvmw.address,
+                                                     rmap_flags);
                         else
-                               page_dup_file_rmap(new, true);
-                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
+                               hugetlb_add_file_rmap(folio);
+                       set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte,
+                                       psize);
                 } else
  #endif
                 {
                         if (folio_test_anon(folio))
-                               page_add_anon_rmap(new, vma, pvmw.address,
-                                                  rmap_flags);
+                               folio_add_anon_rmap_pte(folio, new, vma,
+                                                       pvmw.address, rmap_flags);
                         else
-                               page_add_file_rmap(new, vma, false);
+                               folio_add_file_rmap_pte(folio, new, vma);
                         set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
                 }
                 if (vma->vm_flags & VM_LOCKED)
-                       mlock_page_drain_local();
+                       mlock_drain_local();
  
                 trace_remove_migration_pte(pvmw.address, pte_val(pte),
                                            compound_order(new));
@@ -285,14 +303,21 @@ void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
   * get to the page and wait until migration is finished.
   * When we return from this function the fault will be retried.
   */
-void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
-                               spinlock_t *ptl)
+void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
+                         unsigned long address)
  {
+       spinlock_t *ptl;
+       pte_t *ptep;
         pte_t pte;
         swp_entry_t entry;
  
-       spin_lock(ptl);
-       pte = *ptep;
+       ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
+       if (!ptep)
+               return;
+
+       pte = ptep_get(ptep);
+       pte_unmap(ptep);
+
         if (!is_swap_pte(pte))
                 goto out;
  
@@ -300,39 +325,41 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
         if (!is_migration_entry(entry))
                 goto out;
  
-       migration_entry_wait_on_locked(entry, ptep, ptl);
+       migration_entry_wait_on_locked(entry, ptl);
         return;
  out:
-       pte_unmap_unlock(ptep, ptl);
-}
-
-void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
-                               unsigned long address)
-{
-       spinlock_t *ptl = pte_lockptr(mm, pmd);
-       pte_t *ptep = pte_offset_map(pmd, address);
-       __migration_entry_wait(mm, ptep, ptl);
+       spin_unlock(ptl);
  }
  
  #ifdef CONFIG_HUGETLB_PAGE
-void __migration_entry_wait_huge(pte_t *ptep, spinlock_t *ptl)
+/*
+ * The vma read lock must be held upon entry. Holding that lock prevents either
+ * the pte or the ptl from being freed.
+ *
+ * This function will release the vma lock before returning.
+ */
+void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *ptep)
  {
+       spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, ptep);
         pte_t pte;
  
+       hugetlb_vma_assert_locked(vma);
         spin_lock(ptl);
         pte = huge_ptep_get(ptep);
  
-       if (unlikely(!is_hugetlb_entry_migration(pte)))
+       if (unlikely(!is_hugetlb_entry_migration(pte))) {
                 spin_unlock(ptl);
-       else
-               migration_entry_wait_on_locked(pte_to_swp_entry(pte), NULL, ptl);
-}
-
-void migration_entry_wait_huge(struct vm_area_struct *vma, pte_t *pte)
-{
-       spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), vma->vm_mm, pte);
-
-       __migration_entry_wait_huge(pte, ptl);
+               hugetlb_vma_unlock_read(vma);
+       } else {
+               /*
+                * If a migration entry exists, it is safe to release the vma
+                * lock here because the pgtable page won't be freed until the
+                * pgtable lock is released.  See the comment right above the
+                * pgtable lock release in migration_entry_wait_on_locked().
+                */
+               hugetlb_vma_unlock_read(vma);
+               migration_entry_wait_on_locked(pte_to_swp_entry(pte), ptl);
+       }
  }
  #endif
  
@@ -344,7 +371,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
         ptl = pmd_lock(mm, pmd);
         if (!is_pmd_migration_entry(*pmd))
                 goto unlock;
-       migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), NULL, ptl);
+       migration_entry_wait_on_locked(pmd_to_swp_entry(*pmd), ptl);
         return;
  unlock:
         spin_unlock(ptl);
@@ -381,6 +408,7 @@ int folio_migrate_mapping(struct address_space *mapping,
         int dirty;
         int expected_count = folio_expected_refs(mapping, folio) + extra_count;
         long nr = folio_nr_pages(folio);
+       long entries, i;
  
         if (!mapping) {
                 /* Anonymous page without mapping */
@@ -418,8 +446,10 @@ int folio_migrate_mapping(struct address_space *mapping,
                         folio_set_swapcache(newfolio);
                         newfolio->private = folio_get_private(folio);
                 }
+               entries = nr;
         } else {
                 VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio);
+               entries = 1;
         }
  
         /* Move dirty while page refs frozen and newpage not yet exposed */
@@ -429,7 +459,11 @@ int folio_migrate_mapping(struct address_space *mapping,
                 folio_set_dirty(newfolio);
         }
  
-       xas_store(&xas, newfolio);
+       /* Swap cache still stores N entries instead of a high-order entry */
+       for (i = 0; i < entries; i++) {
+               xas_store(&xas, newfolio);
+               xas_next(&xas);
+       }
  
         /*
          * Drop cache reference from old page by unfreezing
@@ -464,6 +498,11 @@ int folio_migrate_mapping(struct address_space *mapping,
                 if (folio_test_swapbacked(folio) && !folio_test_swapcache(folio)) {
                         __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
                         __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
+
+                       if (folio_test_pmd_mappable(folio)) {
+                               __mod_lruvec_state(old_lruvec, NR_SHMEM_THPS, -nr);
+                               __mod_lruvec_state(new_lruvec, NR_SHMEM_THPS, nr);
+                       }
                 }
  #ifdef CONFIG_SWAP
                 if (folio_test_swapcache(folio)) {
@@ -495,7 +534,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
         int expected_count;
  
         xas_lock_irq(&xas);
-       expected_count = 2 + folio_has_private(src);
+       expected_count = folio_expected_refs(mapping, src);
         if (!folio_ref_freeze(src, expected_count)) {
                 xas_unlock_irq(&xas);
                 return -EAGAIN;
@@ -504,11 +543,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
         dst->index = src->index;
         dst->mapping = src->mapping;
  
-       folio_get(dst);
+       folio_ref_add(dst, folio_nr_pages(dst));
  
         xas_store(&xas, dst);
  
-       folio_ref_unfreeze(src, expected_count - 1);
+       folio_ref_unfreeze(src, expected_count - folio_nr_pages(src));
  
         xas_unlock_irq(&xas);
  
@@ -559,8 +598,20 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
          * Copy NUMA information to the new page, to prevent over-eager
          * future migrations of this same page.
          */
-       cpupid = page_cpupid_xchg_last(&folio->page, -1);
-       page_cpupid_xchg_last(&newfolio->page, cpupid);
+       cpupid = folio_xchg_last_cpupid(folio, -1);
+       /*
+        * In memory tiering mode, when migrating between slow and fast
+        * memory nodes, reset cpupid, because it is used to record the
+        * page access time on slow memory nodes.
+        */
+       if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) {
+               bool f_toptier = node_is_toptier(folio_nid(folio));
+               bool t_toptier = node_is_toptier(folio_nid(newfolio));
+
+               if (f_toptier != t_toptier)
+                       cpupid = -1;
+       }
+       folio_xchg_last_cpupid(newfolio, cpupid);
  
         folio_migrate_ksm(newfolio, folio);
         /*
@@ -592,8 +643,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
  
         folio_copy_owner(newfolio, folio);
  
-       if (!folio_test_hugetlb(folio))
-               mem_cgroup_migrate(folio, newfolio);
+       mem_cgroup_migrate(folio, newfolio);
  }
  EXPORT_SYMBOL(folio_migrate_flags);
  
@@ -608,6 +658,25 @@ EXPORT_SYMBOL(folio_migrate_copy);
   *                    Migration functions
   ***********************************************************/
  
+int migrate_folio_extra(struct address_space *mapping, struct folio *dst,
+               struct folio *src, enum migrate_mode mode, int extra_count)
+{
+       int rc;
+
+       BUG_ON(folio_test_writeback(src));      /* Writeback must be complete */
+
+       rc = folio_migrate_mapping(mapping, dst, src, extra_count);
+
+       if (rc != MIGRATEPAGE_SUCCESS)
+               return rc;
+
+       if (mode != MIGRATE_SYNC_NO_COPY)
+               folio_migrate_copy(dst, src);
+       else
+               folio_migrate_flags(dst, src);
+       return MIGRATEPAGE_SUCCESS;
+}
+
  /**
   * migrate_folio() - Simple folio migration.
   * @mapping: The address_space containing the folio.
@@ -623,60 +692,42 @@ EXPORT_SYMBOL(folio_migrate_copy);
  int migrate_folio(struct address_space *mapping, struct folio *dst,
                 struct folio *src, enum migrate_mode mode)
  {
-       int rc;
-
-       BUG_ON(folio_test_writeback(src));      /* Writeback must be complete */
-
-       rc = folio_migrate_mapping(mapping, dst, src, 0);
-
-       if (rc != MIGRATEPAGE_SUCCESS)
-               return rc;
-
-       if (mode != MIGRATE_SYNC_NO_COPY)
-               folio_migrate_copy(dst, src);
-       else
-               folio_migrate_flags(dst, src);
-       return MIGRATEPAGE_SUCCESS;
+       return migrate_folio_extra(mapping, dst, src, mode, 0);
  }
  EXPORT_SYMBOL(migrate_folio);
  
-#ifdef CONFIG_BLOCK
+#ifdef CONFIG_BUFFER_HEAD
  /* Returns true if all buffers are successfully locked */
  static bool buffer_migrate_lock_buffers(struct buffer_head *head,
                                                         enum migrate_mode mode)
  {
         struct buffer_head *bh = head;
+       struct buffer_head *failed_bh;
  
-       /* Simple case, sync compaction */
-       if (mode != MIGRATE_ASYNC) {
-               do {
-                       lock_buffer(bh);
-                       bh = bh->b_this_page;
-
-               } while (bh != head);
-
-               return true;
-       }
-
-       /* async case, we cannot block on lock_buffer so use trylock_buffer */
         do {
                 if (!trylock_buffer(bh)) {
-                       /*
-                        * We failed to lock the buffer and cannot stall in
-                        * async migration. Release the taken locks
-                        */
-                       struct buffer_head *failed_bh = bh;
-                       bh = head;
-                       while (bh != failed_bh) {
-                               unlock_buffer(bh);
-                               bh = bh->b_this_page;
-                       }
-                       return false;
+                       if (mode == MIGRATE_ASYNC)
+                               goto unlock;
+                       if (mode == MIGRATE_SYNC_LIGHT && !buffer_uptodate(bh))
+                               goto unlock;
+                       lock_buffer(bh);
                 }
  
                 bh = bh->b_this_page;
         } while (bh != head);
+
         return true;
+
+unlock:
+       /* We failed to lock the buffer and cannot stall. */
+       failed_bh = bh;
+       bh = head;
+       while (bh != failed_bh) {
+               unlock_buffer(bh);
+               bh = bh->b_this_page;
+       }
+
+       return false;
  }
  
  static int __buffer_migrate_folio(struct address_space *mapping,
@@ -705,7 +756,7 @@ static int __buffer_migrate_folio(struct address_space *mapping,
  
  recheck_buffers:
                 busy = false;
-               spin_lock(&mapping->private_lock);
+               spin_lock(&mapping->i_private_lock);
                 bh = head;
                 do {
                         if (atomic_read(&bh->b_count)) {
@@ -719,7 +770,7 @@ recheck_buffers:
                                 rc = -EAGAIN;
                                 goto unlock_buffers;
                         }
-                       spin_unlock(&mapping->private_lock);
+                       spin_unlock(&mapping->i_private_lock);
                         invalidate_bh_lrus();
                         invalidated = true;
                         goto recheck_buffers;
@@ -734,7 +785,7 @@ recheck_buffers:
  
         bh = head;
         do {
-               set_bh_page(bh, &dst->page, bh_offset(bh));
+               folio_set_bh(bh, dst, bh_offset(bh));
                 bh = bh->b_this_page;
         } while (bh != head);
  
@@ -746,7 +797,7 @@ recheck_buffers:
         rc = MIGRATEPAGE_SUCCESS;
  unlock_buffers:
         if (check_refs)
-               spin_unlock(&mapping->private_lock);
+               spin_unlock(&mapping->i_private_lock);
         bh = head;
         do {
                 unlock_buffer(bh);
@@ -797,7 +848,8 @@ int buffer_migrate_folio_norefs(struct address_space *mapping,
  {
         return __buffer_migrate_folio(mapping, dst, src, mode, true);
  }
-#endif
+EXPORT_SYMBOL_GPL(buffer_migrate_folio_norefs);
+#endif /* CONFIG_BUFFER_HEAD */
  
  int filemap_migrate_folio(struct address_space *mapping,
                 struct folio *dst, struct folio *src, enum migrate_mode mode)
@@ -882,8 +934,7 @@ static int fallback_migrate_folio(struct address_space *mapping,
          * Buffers may be managed in a filesystem specific way.
          * We must have no buffers or drop them.
          */
-       if (folio_test_private(src) &&
-           !filemap_release_folio(src, GFP_KERNEL))
+       if (!filemap_release_folio(src, GFP_KERNEL))
                 return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
  
         return migrate_folio(mapping, dst, src, mode);
@@ -904,7 +955,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
                                 enum migrate_mode mode)
  {
         int rc = -EAGAIN;
-       bool is_lru = !__PageMovable(&src->page);
+       bool is_lru = !__folio_test_movable(src);
  
         VM_BUG_ON_FOLIO(!folio_test_locked(src), src);
         VM_BUG_ON_FOLIO(!folio_test_locked(dst), dst);
@@ -914,6 +965,8 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
  
                 if (!mapping)
                         rc = migrate_folio(mapping, dst, src, mode);
+               else if (mapping_unmovable(mapping))
+                       rc = -EOPNOTSUPP;
                 else if (mapping->a_ops->migrate_folio)
                         /*
                          * Most folios have a mapping and most filesystems
@@ -940,7 +993,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
                         goto out;
                 }
  
-               mops = page_movable_ops(&src->page);
+               mops = folio_movable_ops(src);
                 rc = mops->migrate_page(&dst->page, &src->page, mode);
                 WARN_ON_ONCE(rc == MIGRATEPAGE_SUCCESS &&
                                 !folio_test_isolated(src));
@@ -951,7 +1004,7 @@ static int move_to_new_folio(struct folio *dst, struct folio *src,
          * src is freed; but stats require that PageAnon be left as PageAnon.
          */
         if (rc == MIGRATEPAGE_SUCCESS) {
-               if (__PageMovable(&src->page)) {
+               if (__folio_test_movable(src)) {
                         VM_BUG_ON_FOLIO(!folio_test_isolated(src), src);
  
                         /*
@@ -976,18 +1029,116 @@ out:
         return rc;
  }
  
-static int __unmap_and_move(struct page *page, struct page *newpage,
-                               int force, enum migrate_mode mode)
+/*
+ * To record some information during migration (the anon_vma pointer and
+ * the PAGE_WAS_* state bits), we use the otherwise unused private field of
+ * the newly allocated destination folio; nobody is using it except us.
+ */
+enum {
+       PAGE_WAS_MAPPED = BIT(0),
+       PAGE_WAS_MLOCKED = BIT(1),
+       PAGE_OLD_STATES = PAGE_WAS_MAPPED | PAGE_WAS_MLOCKED,
+};
+
+static void __migrate_folio_record(struct folio *dst,
+                                  int old_page_state,
+                                  struct anon_vma *anon_vma)
+{
+       dst->private = (void *)anon_vma + old_page_state;
+}
+
+static void __migrate_folio_extract(struct folio *dst,
+                                  int *old_page_state,
+                                  struct anon_vma **anon_vmap)
+{
+       unsigned long private = (unsigned long)dst->private;
+
+       *anon_vmap = (struct anon_vma *)(private & ~PAGE_OLD_STATES);
+       *old_page_state = private & PAGE_OLD_STATES;
+       dst->private = NULL;
+}
+
+/* Restore the source folio to the original state upon failure */
+static void migrate_folio_undo_src(struct folio *src,
+                                  int page_was_mapped,
+                                  struct anon_vma *anon_vma,
+                                  bool locked,
+                                  struct list_head *ret)
+{
+       if (page_was_mapped)
+               remove_migration_ptes(src, src, false);
+       /* Drop an anon_vma reference if we took one */
+       if (anon_vma)
+               put_anon_vma(anon_vma);
+       if (locked)
+               folio_unlock(src);
+       if (ret)
+               list_move_tail(&src->lru, ret);
+}
+
+/* Restore the destination folio to the original state upon failure */
+static void migrate_folio_undo_dst(struct folio *dst, bool locked,
+               free_folio_t put_new_folio, unsigned long private)
+{
+       if (locked)
+               folio_unlock(dst);
+       if (put_new_folio)
+               put_new_folio(dst, private);
+       else
+               folio_put(dst);
+}
+
+/* Cleanup src folio upon migration success */
+static void migrate_folio_done(struct folio *src,
+                              enum migrate_reason reason)
+{
+       /*
+        * Compaction can also migrate non-LRU pages, which are
+        * not accounted to NR_ISOLATED_*. They can be recognized
+        * by __folio_test_movable().
+        */
+       if (likely(!__folio_test_movable(src)))
+               mod_node_page_state(folio_pgdat(src), NR_ISOLATED_ANON +
+                                   folio_is_file_lru(src), -folio_nr_pages(src));
+
+       if (reason != MR_MEMORY_FAILURE)
+               /* We release the page in page_handle_poison. */
+               folio_put(src);
+}
+
+/* Obtain the lock on page, remove all ptes. */
+static int migrate_folio_unmap(new_folio_t get_new_folio,
+               free_folio_t put_new_folio, unsigned long private,
+               struct folio *src, struct folio **dstp, enum migrate_mode mode,
+               enum migrate_reason reason, struct list_head *ret)
  {
-       struct folio *folio = page_folio(page);
-       struct folio *dst = page_folio(newpage);
+       struct folio *dst;
         int rc = -EAGAIN;
-       bool page_was_mapped = false;
+       int old_page_state = 0;
         struct anon_vma *anon_vma = NULL;
-       bool is_lru = !__PageMovable(page);
+       bool is_lru = !__folio_test_movable(src);
+       bool locked = false;
+       bool dst_locked = false;
+
+       if (folio_ref_count(src) == 1) {
+               /* Folio was freed from under us. So we are done. */
+               folio_clear_active(src);
+               folio_clear_unevictable(src);
+               /* free_pages_prepare() will clear PG_isolated. */
+               list_del(&src->lru);
+               migrate_folio_done(src, reason);
+               return MIGRATEPAGE_SUCCESS;
+       }
  
-       if (!trylock_page(page)) {
-               if (!force || mode == MIGRATE_ASYNC)
+       dst = get_new_folio(src, private);
+       if (!dst)
+               return -ENOMEM;
+       *dstp = dst;
+
+       dst->private = NULL;
+
+       if (!folio_trylock(src)) {
+               if (mode == MIGRATE_ASYNC)
                         goto out;
  
                 /*
@@ -1006,10 +1157,21 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                 if (current->flags & PF_MEMALLOC)
                         goto out;
  
-               lock_page(page);
+               /*
+                * In "light" mode, we can wait for transient locks (e.g.
+                * inserting a page into the page table), but it's not
+                * worth waiting for I/O.
+                */
+               if (mode == MIGRATE_SYNC_LIGHT && !folio_test_uptodate(src))
+                       goto out;
+
+               folio_lock(src);
         }
+       locked = true;
+       if (folio_test_mlocked(src))
+               old_page_state |= PAGE_WAS_MLOCKED;
  
-       if (PageWriteback(page)) {
+       if (folio_test_writeback(src)) {
                 /*
                  * Only in the case of a full synchronous migration is it
                  * necessary to wait for PageWriteback. In the async case,
@@ -1022,51 +1184,50 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
                         break;
                 default:
                         rc = -EBUSY;
-                       goto out_unlock;
+                       goto out;
                 }
-               if (!force)
-                       goto out_unlock;
-               wait_on_page_writeback(page);
+               folio_wait_writeback(src);
         }
  
         /*
-        * By try_to_migrate(), page->mapcount goes down to 0 here. In this case,
-        * we cannot notice that anon_vma is freed while we migrates a page.
+        * By try_to_migrate(), src->mapcount goes down to 0 here. In this case,
+        * we cannot notice that anon_vma is freed while we migrate a page.
          * This get_anon_vma() delays freeing anon_vma pointer until the end
          * of migration. File cache pages are no problem because of page_lock()
          * File Caches may use write_page() or lock_page() in migration, then,
          * just care Anon page here.
          *
-        * Only page_get_anon_vma() understands the subtleties of
+        * Only folio_get_anon_vma() understands the subtleties of
          * getting a hold on an anon_vma from outside one of its mms.
          * But if we cannot get anon_vma, then we won't need it anyway,
          * because that implies that the anon page is no longer mapped
          * (and cannot be remapped so long as we hold the page lock).
          */
-       if (PageAnon(page) && !PageKsm(page))
-               anon_vma = page_get_anon_vma(page);
+       if (folio_test_anon(src) && !folio_test_ksm(src))
+               anon_vma = folio_get_anon_vma(src);
  
         /*
          * Block others from accessing the new page when we get around to
          * establishing additional references. We are usually the only one
-        * holding a reference to newpage at this point. We used to have a BUG
-        * here if trylock_page(newpage) fails, but would like to allow for
-        * cases where there might be a race with the previous use of newpage.
+        * holding a reference to dst at this point. We used to have a BUG
+        * here if folio_trylock(dst) fails, but would like to allow for
+        * cases where there might be a race with the previous use of dst.
          * This is much like races on refcount of oldpage: just don't BUG().
          */
-       if (unlikely(!trylock_page(newpage)))
-               goto out_unlock;
+       if (unlikely(!folio_trylock(dst)))
+               goto out;
+       dst_locked = true;
  
         if (unlikely(!is_lru)) {
-               rc = move_to_new_folio(dst, folio, mode);
-               goto out_unlock_both;
+               __migrate_folio_record(dst, old_page_state, anon_vma);
+               return MIGRATEPAGE_UNMAP;
         }
  
         /*
          * Corner case handling:
          * 1. When a new swap-cache page is read into, it is added to the LRU
          * and treated as swapcache but it has no rmap yet.
-        * Calling try_to_unmap() against a page->mapping==NULL page will
+        * Calling try_to_unmap() against a src->mapping==NULL page will
          * trigger a BUG.  So handle it here.
          * 2. An orphaned page (see truncate_cleanup_page) might have
          * fs-private metadata. The page can be picked up due to memory
@@ -1074,135 +1235,115 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
          * invisible to the vm, so the page can not be migrated.  So try to
          * free the metadata, so the page can be freed.
          */
-       if (!page->mapping) {
-               VM_BUG_ON_PAGE(PageAnon(page), page);
-               if (page_has_private(page)) {
-                       try_to_free_buffers(folio);
-                       goto out_unlock_both;
+       if (!src->mapping) {
+               if (folio_test_private(src)) {
+                       try_to_free_buffers(src);
+                       goto out;
                 }
-       } else if (page_mapped(page)) {
+       } else if (folio_mapped(src)) {
                 /* Establish migration ptes */
-               VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
-                               page);
-               try_to_migrate(folio, 0);
-               page_was_mapped = true;
+               VM_BUG_ON_FOLIO(folio_test_anon(src) &&
+                              !folio_test_ksm(src) && !anon_vma, src);
+               try_to_migrate(src, mode == MIGRATE_ASYNC ? TTU_BATCH_FLUSH : 0);
+               old_page_state |= PAGE_WAS_MAPPED;
         }
  
-       if (!page_mapped(page))
-               rc = move_to_new_folio(dst, folio, mode);
-
-       /*
-        * When successful, push newpage to LRU immediately: so that if it
-        * turns out to be an mlocked page, remove_migration_ptes() will
-        * automatically build up the correct newpage->mlock_count for it.
-        *
-        * We would like to do something similar for the old page, when
-        * unsuccessful, and other cases when a page has been temporarily
-        * isolated from the unevictable LRU: but this case is the easiest.
-        */
-       if (rc == MIGRATEPAGE_SUCCESS) {
-               lru_cache_add(newpage);
-               if (page_was_mapped)
-                       lru_add_drain();
+       if (!folio_mapped(src)) {
+               __migrate_folio_record(dst, old_page_state, anon_vma);
+               return MIGRATEPAGE_UNMAP;
         }
  
-       if (page_was_mapped)
-               remove_migration_ptes(folio,
-                       rc == MIGRATEPAGE_SUCCESS ? dst : folio, false);
-
-out_unlock_both:
-       unlock_page(newpage);
-out_unlock:
-       /* Drop an anon_vma reference if we took one */
-       if (anon_vma)
-               put_anon_vma(anon_vma);
-       unlock_page(page);
  out:
         /*
-        * If migration is successful, decrease refcount of the newpage,
-        * which will not free the page because new page owner increased
-        * refcounter.
+        * A folio that has not been unmapped will be restored to the
+        * right list unless we want to retry.
          */
-       if (rc == MIGRATEPAGE_SUCCESS)
-               put_page(newpage);
+       if (rc == -EAGAIN)
+               ret = NULL;
+
+       migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+                              anon_vma, locked, ret);
+       migrate_folio_undo_dst(dst, dst_locked, put_new_folio, private);
  
         return rc;
  }
  
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page,
-                                  free_page_t put_new_page,
-                                  unsigned long private, struct page *page,
-                                  int force, enum migrate_mode mode,
-                                  enum migrate_reason reason,
-                                  struct list_head *ret)
+/* Migrate the folio to the newly allocated folio in dst. */
+static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
+                             struct folio *src, struct folio *dst,
+                             enum migrate_mode mode, enum migrate_reason reason,
+                             struct list_head *ret)
  {
-       int rc = MIGRATEPAGE_SUCCESS;
-       struct page *newpage = NULL;
+       int rc;
+       int old_page_state = 0;
+       struct anon_vma *anon_vma = NULL;
+       bool is_lru = !__folio_test_movable(src);
+       struct list_head *prev;
  
-       if (!thp_migration_supported() && PageTransHuge(page))
-               return -ENOSYS;
+       __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+       prev = dst->lru.prev;
+       list_del(&dst->lru);
  
-       if (page_count(page) == 1) {
-               /* Page was freed from under us. So we are done. */
-               ClearPageActive(page);
-               ClearPageUnevictable(page);
-               /* free_pages_prepare() will clear PG_isolated. */
+       rc = move_to_new_folio(dst, src, mode);
+       if (rc)
                 goto out;
-       }
  
-       newpage = get_new_page(page, private);
-       if (!newpage)
-               return -ENOMEM;
+       if (unlikely(!is_lru))
+               goto out_unlock_both;
  
-       newpage->private = 0;
-       rc = __unmap_and_move(page, newpage, force, mode);
-       if (rc == MIGRATEPAGE_SUCCESS)
-               set_page_owner_migrate_reason(newpage, reason);
+       /*
+        * When successful, push dst to LRU immediately: so that if it
+        * turns out to be an mlocked page, remove_migration_ptes() will
+        * automatically build up the correct dst->mlock_count for it.
+        *
+        * We would like to do something similar for the old page, when
+        * unsuccessful, and other cases when a page has been temporarily
+        * isolated from the unevictable LRU: but this case is the easiest.
+        */
+       folio_add_lru(dst);
+       if (old_page_state & PAGE_WAS_MLOCKED)
+               lru_add_drain();
  
-out:
-       if (rc != -EAGAIN) {
-               /*
-                * A page that has been migrated has all references
-                * removed and will be freed. A page that has not been
-                * migrated will have kept its references and be restored.
-                */
-               list_del(&page->lru);
-       }
+       if (old_page_state & PAGE_WAS_MAPPED)
+               remove_migration_ptes(src, dst, false);
  
+out_unlock_both:
+       folio_unlock(dst);
+       set_page_owner_migrate_reason(&dst->page, reason);
         /*
-        * If migration is successful, releases reference grabbed during
-        * isolation. Otherwise, restore the page to right list unless
-        * we want to retry.
+        * If migration is successful, decrease refcount of dst,
+        * which will not free the page because new page owner increased
+        * refcounter.
          */
-       if (rc == MIGRATEPAGE_SUCCESS) {
-               /*
-                * Compaction can migrate also non-LRU pages which are
-                * not accounted to NR_ISOLATED_*. They can be recognized
-                * as __PageMovable
-                */
-               if (likely(!__PageMovable(page)))
-                       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-                                       page_is_file_lru(page), -thp_nr_pages(page));
+       folio_put(dst);
  
-               if (reason != MR_MEMORY_FAILURE)
-                       /*
-                        * We release the page in page_handle_poison.
-                        */
-                       put_page(page);
-       } else {
-               if (rc != -EAGAIN)
-                       list_add_tail(&page->lru, ret);
+       /*
+        * A folio that has been migrated has all references removed
+        * and will be freed.
+        */
+       list_del(&src->lru);
+       /* Drop an anon_vma reference if we took one */
+       if (anon_vma)
+               put_anon_vma(anon_vma);
+       folio_unlock(src);
+       migrate_folio_done(src, reason);
  
-               if (put_new_page)
-                       put_new_page(newpage, private);
-               else
-                       put_page(newpage);
+       return rc;
+out:
+       /*
+        * A folio that has not been migrated will be restored to the
+        * right list unless we want to retry.
+        */
+       if (rc == -EAGAIN) {
+               list_add(&dst->lru, prev);
+               __migrate_folio_record(dst, old_page_state, anon_vma);
+               return rc;
         }
  
+       migrate_folio_undo_src(src, old_page_state & PAGE_WAS_MAPPED,
+                              anon_vma, true, ret);
+       migrate_folio_undo_dst(dst, true, put_new_folio, private);
+
         return rc;
  }
  
@@ -1224,43 +1365,28 @@ out:
   * because then pte is replaced with migration swap entry and direct I/O code
   * will wait in the page fault for migration to complete.
   */
-static int unmap_and_move_huge_page(new_page_t get_new_page,
-                               free_page_t put_new_page, unsigned long private,
-                               struct page *hpage, int force,
-                               enum migrate_mode mode, int reason,
-                               struct list_head *ret)
+static int unmap_and_move_huge_page(new_folio_t get_new_folio,
+               free_folio_t put_new_folio, unsigned long private,
+               struct folio *src, int force, enum migrate_mode mode,
+               int reason, struct list_head *ret)
  {
-       struct folio *dst, *src = page_folio(hpage);
+       struct folio *dst;
         int rc = -EAGAIN;
         int page_was_mapped = 0;
-       struct page *new_hpage;
         struct anon_vma *anon_vma = NULL;
         struct address_space *mapping = NULL;
  
-       /*
-        * Migratability of hugepages depends on architectures and their size.
-        * This check is necessary because some callers of hugepage migration
-        * like soft offline and memory hotremove don't walk through page
-        * tables or check whether the hugepage is pmd-based or not before
-        * kicking migration.
-        */
-       if (!hugepage_migration_supported(page_hstate(hpage))) {
-               list_move_tail(&hpage->lru, ret);
-               return -ENOSYS;
-       }
-
-       if (page_count(hpage) == 1) {
+       if (folio_ref_count(src) == 1) {
                 /* page was freed from under us. So we are done. */
-               putback_active_hugepage(hpage);
+               folio_putback_active_hugetlb(src);
                 return MIGRATEPAGE_SUCCESS;
         }
  
-       new_hpage = get_new_page(hpage, private);
-       if (!new_hpage)
+       dst = get_new_folio(src, private);
+       if (!dst)
                 return -ENOMEM;
-       dst = page_folio(new_hpage);
  
-       if (!trylock_page(hpage)) {
+       if (!folio_trylock(src)) {
                 if (!force)
                         goto out;
                 switch (mode) {
@@ -1270,36 +1396,36 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                 default:
                         goto out;
                 }
-               lock_page(hpage);
+               folio_lock(src);
         }
  
         /*
          * Check for pages which are in the process of being freed.  Without
-        * page_mapping() set, hugetlbfs specific move page routine will not
+        * folio_mapping() set, hugetlbfs specific move page routine will not
          * be called and we could leak usage counts for subpools.
          */
-       if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) {
+       if (hugetlb_folio_subpool(src) && !folio_mapping(src)) {
                 rc = -EBUSY;
                 goto out_unlock;
         }
  
-       if (PageAnon(hpage))
-               anon_vma = page_get_anon_vma(hpage);
+       if (folio_test_anon(src))
+               anon_vma = folio_get_anon_vma(src);
  
-       if (unlikely(!trylock_page(new_hpage)))
+       if (unlikely(!folio_trylock(dst)))
                 goto put_anon;
  
-       if (page_mapped(hpage)) {
+       if (folio_mapped(src)) {
                 enum ttu_flags ttu = 0;
  
-               if (!PageAnon(hpage)) {
+               if (!folio_test_anon(src)) {
                         /*
                          * In shared mappings, try_to_unmap could potentially
                          * call huge_pmd_unshare.  Because of this, take
                          * semaphore in write mode here and set TTU_RMAP_LOCKED
                          * to let lower levels know we have taken the lock.
                          */
-                       mapping = hugetlb_page_mapping_lock_write(hpage);
+                       mapping = hugetlb_page_mapping_lock_write(&src->page);
                         if (unlikely(!mapping))
                                 goto unlock_put_anon;
  
@@ -1313,7 +1439,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                         i_mmap_unlock_write(mapping);
         }
  
-       if (!page_mapped(hpage))
+       if (!folio_mapped(src))
                 rc = move_to_new_folio(dst, src, mode);
  
         if (page_was_mapped)
@@ -1321,264 +1447,567 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
                         rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
  
  unlock_put_anon:
-       unlock_page(new_hpage);
+       folio_unlock(dst);
  
  put_anon:
         if (anon_vma)
                 put_anon_vma(anon_vma);
  
         if (rc == MIGRATEPAGE_SUCCESS) {
-               move_hugetlb_state(hpage, new_hpage, reason);
-               put_new_page = NULL;
+               move_hugetlb_state(src, dst, reason);
+               put_new_folio = NULL;
         }
  
  out_unlock:
-       unlock_page(hpage);
+       folio_unlock(src);
  out:
         if (rc == MIGRATEPAGE_SUCCESS)
-               putback_active_hugepage(hpage);
+               folio_putback_active_hugetlb(src);
         else if (rc != -EAGAIN)
-               list_move_tail(&hpage->lru, ret);
+               list_move_tail(&src->lru, ret);
  
         /*
          * If migration was not successful and there's a freeing callback, use
          * it.  Otherwise, put_page() will drop the reference grabbed during
          * isolation.
          */
-       if (put_new_page)
-               put_new_page(new_hpage, private);
+       if (put_new_folio)
+               put_new_folio(dst, private);
         else
-               putback_active_hugepage(new_hpage);
+               folio_putback_active_hugetlb(dst);
  
         return rc;
  }
  
-static inline int try_split_thp(struct page *page, struct page **page2,
-                               struct list_head *from)
+static inline int try_split_folio(struct folio *folio, struct list_head *split_folios)
  {
-       int rc = 0;
+       int rc;
  
-       lock_page(page);
-       rc = split_huge_page_to_list(page, from);
-       unlock_page(page);
+       folio_lock(folio);
+       rc = split_folio_to_list(folio, split_folios);
+       folio_unlock(folio);
         if (!rc)
-               list_safe_reset_next(page, *page2, lru);
+               list_move_tail(&folio->lru, split_folios);
  
         return rc;
  }
  
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define NR_MAX_BATCHED_MIGRATION       HPAGE_PMD_NR
+#else
+#define NR_MAX_BATCHED_MIGRATION       512
+#endif
+#define NR_MAX_MIGRATE_PAGES_RETRY     10
+#define NR_MAX_MIGRATE_ASYNC_RETRY     3
+#define NR_MAX_MIGRATE_SYNC_RETRY                                      \
+       (NR_MAX_MIGRATE_PAGES_RETRY - NR_MAX_MIGRATE_ASYNC_RETRY)
+
+struct migrate_pages_stats {
+       int nr_succeeded;       /* Normal and large folios migrated successfully, in
+                                  units of base pages */
+       int nr_failed_pages;    /* Normal and large folios failed to be migrated, in
+                                  units of base pages.  Untried folios aren't counted */
+       int nr_thp_succeeded;   /* THP migrated successfully */
+       int nr_thp_failed;      /* THP failed to be migrated */
+       int nr_thp_split;       /* THP split before migrating */
+       int nr_split;   /* Large folios (including THP) split before migrating */
+};
+
  /*
- * migrate_pages - migrate the pages specified in a list, to the free pages
- *                supplied as the target for the page migration
- *
- * @from:              The list of pages to be migrated.
- * @get_new_page:      The function used to allocate free pages to be used
- *                     as the target of the page migration.
- * @put_new_page:      The function used to free target pages if migration
- *                     fails, or NULL if no special handling is necessary.
- * @private:           Private data to be passed on to get_new_page()
- * @mode:              The migration mode that specifies the constraints for
- *                     page migration, if any.
- * @reason:            The reason for page migration.
- * @ret_succeeded:     Set to the number of normal pages migrated successfully if
- *                     the caller passes a non-NULL pointer.
- *
- * The function returns after 10 attempts or if no pages are movable any more
- * because the list has become empty or no retryable pages exist any more.
- * It is caller's responsibility to call putback_movable_pages() to return pages
- * to the LRU or free list only if ret != 0.
- *
- * Returns the number of {normal page, THP, hugetlb} that were not migrated, or
- * an error code. The number of THP splits will be considered as the number of
- * non-migrated THP, no matter how many subpages of the THP are migrated successfully.
+ * Returns the number of hugetlb folios that were not migrated, or an error code
+ * after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no hugetlb folios are movable
+ * any more because the list has become empty or no retryable hugetlb folios
+ * exist any more. It is caller's responsibility to call putback_movable_pages()
+ * only if ret != 0.
   */
-int migrate_pages(struct list_head *from, new_page_t get_new_page,
-               free_page_t put_new_page, unsigned long private,
-               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+static int migrate_hugetlbs(struct list_head *from, new_folio_t get_new_folio,
+                           free_folio_t put_new_folio, unsigned long private,
+                           enum migrate_mode mode, int reason,
+                           struct migrate_pages_stats *stats,
+                           struct list_head *ret_folios)
  {
         int retry = 1;
-       int thp_retry = 1;
         int nr_failed = 0;
-       int nr_failed_pages = 0;
-       int nr_succeeded = 0;
-       int nr_thp_succeeded = 0;
-       int nr_thp_failed = 0;
-       int nr_thp_split = 0;
+       int nr_retry_pages = 0;
         int pass = 0;
-       bool is_thp = false;
-       struct page *page;
-       struct page *page2;
-       int rc, nr_subpages;
-       LIST_HEAD(ret_pages);
-       LIST_HEAD(thp_split_pages);
-       bool nosplit = (reason == MR_NUMA_MISPLACED);
-       bool no_subpage_counting = false;
+       struct folio *folio, *folio2;
+       int rc, nr_pages;
  
-       trace_mm_migrate_pages_start(mode, reason);
-
-thp_subpage_migration:
-       for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
+       for (pass = 0; pass < NR_MAX_MIGRATE_PAGES_RETRY && retry; pass++) {
                 retry = 0;
-               thp_retry = 0;
+               nr_retry_pages = 0;
+
+               list_for_each_entry_safe(folio, folio2, from, lru) {
+                       if (!folio_test_hugetlb(folio))
+                               continue;
+
+                       nr_pages = folio_nr_pages(folio);
+
+                       cond_resched();
  
-               list_for_each_entry_safe(page, page2, from, lru) {
-retry:
                         /*
-                        * THP statistics is based on the source huge page.
-                        * Capture required information that might get lost
-                        * during migration.
+                        * Migratability of hugepages depends on architectures and
+                        * their size.  This check is necessary because some callers
+                        * of hugepage migration like soft offline and memory
+                        * hotremove don't walk through page tables or check whether
+                        * the hugepage is pmd-based or not before kicking migration.
                          */
-                       is_thp = PageTransHuge(page) && !PageHuge(page);
-                       nr_subpages = compound_nr(page);
-                       cond_resched();
+                       if (!hugepage_migration_supported(folio_hstate(folio))) {
+                               nr_failed++;
+                               stats->nr_failed_pages += nr_pages;
+                               list_move_tail(&folio->lru, ret_folios);
+                               continue;
+                       }
  
-                       if (PageHuge(page))
-                               rc = unmap_and_move_huge_page(get_new_page,
-                                               put_new_page, private, page,
-                                               pass > 2, mode, reason,
-                                               &ret_pages);
-                       else
-                               rc = unmap_and_move(get_new_page, put_new_page,
-                                               private, page, pass > 2, mode,
-                                               reason, &ret_pages);
+                       rc = unmap_and_move_huge_page(get_new_folio,
+                                                     put_new_folio, private,
+                                                     folio, pass > 2, mode,
+                                                     reason, ret_folios);
                         /*
                          * The rules are:
-                        *      Success: non hugetlb page will be freed, hugetlb
-                        *               page will be put back
+                        *      Success: hugetlb folio will be put back
                          *      -EAGAIN: stay on the from list
                          *      -ENOMEM: stay on the from list
-                        *      Other errno: put on ret_pages list then splice to
-                        *                   from list
+                        *      Other errno: put on ret_folios list
                          */
                         switch(rc) {
-                       /*
-                        * THP migration might be unsupported or the
-                        * allocation could've failed so we should
-                        * retry on the same page with the THP split
-                        * to base pages.
-                        *
-                        * Head page is retried immediately and tail
-                        * pages are added to the tail of the list so
-                        * we encounter them after the rest of the list
-                        * is processed.
-                        */
-                       case -ENOSYS:
-                               /* THP migration is unsupported */
-                               if (is_thp) {
-                                       nr_thp_failed++;
-                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
-                                               nr_thp_split++;
-                                               goto retry;
-                                       }
-                               /* Hugetlb migration is unsupported */
-                               } else if (!no_subpage_counting) {
-                                       nr_failed++;
-                               }
-
-                               nr_failed_pages += nr_subpages;
-                               break;
                         case -ENOMEM:
                                 /*
                                  * When memory is low, don't bother to try to migrate
-                                * other pages, just exit.
-                                * THP NUMA faulting doesn't split THP to retry.
-                                */
-                               if (is_thp && !nosplit) {
-                                       nr_thp_failed++;
-                                       if (!try_split_thp(page, &page2, &thp_split_pages)) {
-                                               nr_thp_split++;
-                                               goto retry;
-                                       }
-                               } else if (!no_subpage_counting) {
-                                       nr_failed++;
-                               }
-
-                               nr_failed_pages += nr_subpages;
-                               /*
-                                * There might be some subpages of fail-to-migrate THPs
-                                * left in thp_split_pages list. Move them back to migration
-                                * list so that they could be put back to the right list by
-                                * the caller otherwise the page refcnt will be leaked.
+                                * other folios, just exit.
                                  */
-                               list_splice_init(&thp_split_pages, from);
-                               nr_thp_failed += thp_retry;
-                               goto out;
+                               stats->nr_failed_pages += nr_pages + nr_retry_pages;
+                               return -ENOMEM;
                         case -EAGAIN:
-                               if (is_thp)
-                                       thp_retry++;
-                               else
-                                       retry++;
+                               retry++;
+                               nr_retry_pages += nr_pages;
                                 break;
                         case MIGRATEPAGE_SUCCESS:
-                               nr_succeeded += nr_subpages;
-                               if (is_thp)
-                                       nr_thp_succeeded++;
+                               stats->nr_succeeded += nr_pages;
                                 break;
                         default:
                                 /*
                                  * Permanent failure (-EBUSY, etc.):
-                                * unlike -EAGAIN case, the failed page is
-                                * removed from migration page list and not
+                                * unlike -EAGAIN case, the failed folio is
+                                * removed from migration folio list and not
                                  * retried in the next outer loop.
                                  */
-                               if (is_thp)
-                                       nr_thp_failed++;
-                               else if (!no_subpage_counting)
-                                       nr_failed++;
-
-                               nr_failed_pages += nr_subpages;
+                               nr_failed++;
+                               stats->nr_failed_pages += nr_pages;
                                 break;
                         }
                 }
         }
-       nr_failed += retry;
-       nr_thp_failed += thp_retry;
         /*
-        * Try to migrate subpages of fail-to-migrate THPs, no nr_failed
-        * counting in this round, since all subpages of a THP is counted
-        * as 1 failure in the first round.
+        * nr_failed is number of hugetlb folios failed to be migrated.  After
+        * NR_MAX_MIGRATE_PAGES_RETRY attempts, give up and count retried hugetlb
+        * folios as failed.
          */
-       if (!list_empty(&thp_split_pages)) {
-               /*
-                * Move non-migrated pages (after 10 retries) to ret_pages
-                * to avoid migrating them again.
-                */
-               list_splice_init(from, &ret_pages);
-               list_splice_init(&thp_split_pages, from);
-               no_subpage_counting = true;
-               retry = 1;
-               goto thp_subpage_migration;
-       }
+       nr_failed += retry;
+       stats->nr_failed_pages += nr_retry_pages;
  
-       rc = nr_failed + nr_thp_failed;
-out:
-       /*
-        * Put the permanent failure page back to migration list, they
+       return nr_failed;
+}
+
+/*
+ * migrate_pages_batch() first unmaps as many folios in the from list as
+ * possible, then moves the unmapped folios.
+ *
+ * We only batch migration if mode == MIGRATE_ASYNC, to avoid waiting on a
+ * lock or bit while more than one folio is locked, which may cause a
+ * deadlock (e.g., for loop device).  So, if mode != MIGRATE_ASYNC, the
+ * length of the from list must be <= 1.
+ */
+static int migrate_pages_batch(struct list_head *from,
+               new_folio_t get_new_folio, free_folio_t put_new_folio,
+               unsigned long private, enum migrate_mode mode, int reason,
+               struct list_head *ret_folios, struct list_head *split_folios,
+               struct migrate_pages_stats *stats, int nr_pass)
+{
+       int retry = 1;
+       int thp_retry = 1;
+       int nr_failed = 0;
+       int nr_retry_pages = 0;
+       int pass = 0;
+       bool is_thp = false;
+       bool is_large = false;
+       struct folio *folio, *folio2, *dst = NULL, *dst2;
+       int rc, rc_saved = 0, nr_pages;
+       LIST_HEAD(unmap_folios);
+       LIST_HEAD(dst_folios);
+       bool nosplit = (reason == MR_NUMA_MISPLACED);
+
+       VM_WARN_ON_ONCE(mode != MIGRATE_ASYNC &&
+                       !list_empty(from) && !list_is_singular(from));
+
+       for (pass = 0; pass < nr_pass && retry; pass++) {
+               retry = 0;
+               thp_retry = 0;
+               nr_retry_pages = 0;
+
+               list_for_each_entry_safe(folio, folio2, from, lru) {
+                       is_large = folio_test_large(folio);
+                       is_thp = is_large && folio_test_pmd_mappable(folio);
+                       nr_pages = folio_nr_pages(folio);
+
+                       cond_resched();
+
+                       /*
+                        * Large folio migration might be unsupported or
+                        * the allocation might be failed so we should retry
+                        * on the same folio with the large folio split
+                        * to normal folios.
+                        *
+                        * Split folios are put in split_folios, and
+                        * we will migrate them after the rest of the
+                        * list is processed.
+                        */
+                       if (!thp_migration_supported() && is_thp) {
+                               nr_failed++;
+                               stats->nr_thp_failed++;
+                               if (!try_split_folio(folio, split_folios)) {
+                                       stats->nr_thp_split++;
+                                       stats->nr_split++;
+                                       continue;
+                               }
+                               stats->nr_failed_pages += nr_pages;
+                               list_move_tail(&folio->lru, ret_folios);
+                               continue;
+                       }
+
+                       rc = migrate_folio_unmap(get_new_folio, put_new_folio,
+                                       private, folio, &dst, mode, reason,
+                                       ret_folios);
+                       /*
+                        * The rules are:
+                        *      Success: folio will be freed
+                        *      Unmap: folio will be put on unmap_folios list,
+                        *             dst folio put on dst_folios list
+                        *      -EAGAIN: stay on the from list
+                        *      -ENOMEM: stay on the from list
+                        *      Other errno: put on ret_folios list
+                        */
+                       switch(rc) {
+                       case -ENOMEM:
+                               /*
+                                * When memory is low, don't bother to try to migrate
+                                * other folios, move unmapped folios, then exit.
+                                */
+                               nr_failed++;
+                               stats->nr_thp_failed += is_thp;
+                               /* Large folio NUMA faulting doesn't split to retry. */
+                               if (is_large && !nosplit) {
+                                       int ret = try_split_folio(folio, split_folios);
+
+                                       if (!ret) {
+                                               stats->nr_thp_split += is_thp;
+                                               stats->nr_split++;
+                                               break;
+                                       } else if (reason == MR_LONGTERM_PIN &&
+                                                  ret == -EAGAIN) {
+                                               /*
+                                                * Try again to split large folio to
+                                                * mitigate the failure of longterm pinning.
+                                                */
+                                               retry++;
+                                               thp_retry += is_thp;
+                                               nr_retry_pages += nr_pages;
+                                               /* Undo duplicated failure counting. */
+                                               nr_failed--;
+                                               stats->nr_thp_failed -= is_thp;
+                                               break;
+                                       }
+                               }
+
+                               stats->nr_failed_pages += nr_pages + nr_retry_pages;
+                               /* nr_failed isn't updated for not used */
+                               stats->nr_thp_failed += thp_retry;
+                               rc_saved = rc;
+                               if (list_empty(&unmap_folios))
+                                       goto out;
+                               else
+                                       goto move;
+                       case -EAGAIN:
+                               retry++;
+                               thp_retry += is_thp;
+                               nr_retry_pages += nr_pages;
+                               break;
+                       case MIGRATEPAGE_SUCCESS:
+                               stats->nr_succeeded += nr_pages;
+                               stats->nr_thp_succeeded += is_thp;
+                               break;
+                       case MIGRATEPAGE_UNMAP:
+                               list_move_tail(&folio->lru, &unmap_folios);
+                               list_add_tail(&dst->lru, &dst_folios);
+                               break;
+                       default:
+                               /*
+                                * Permanent failure (-EBUSY, etc.):
+                                * unlike -EAGAIN case, the failed folio is
+                                * removed from migration folio list and not
+                                * retried in the next outer loop.
+                                */
+                               nr_failed++;
+                               stats->nr_thp_failed += is_thp;
+                               stats->nr_failed_pages += nr_pages;
+                               break;
+                       }
+               }
+       }
+       nr_failed += retry;
+       stats->nr_thp_failed += thp_retry;
+       stats->nr_failed_pages += nr_retry_pages;
+move:
+       /* Flush TLBs for all unmapped folios */
+       try_to_unmap_flush();
+
+       retry = 1;
+       for (pass = 0; pass < nr_pass && retry; pass++) {
+               retry = 0;
+               thp_retry = 0;
+               nr_retry_pages = 0;
+
+               dst = list_first_entry(&dst_folios, struct folio, lru);
+               dst2 = list_next_entry(dst, lru);
+               list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+                       is_thp = folio_test_large(folio) && folio_test_pmd_mappable(folio);
+                       nr_pages = folio_nr_pages(folio);
+
+                       cond_resched();
+
+                       rc = migrate_folio_move(put_new_folio, private,
+                                               folio, dst, mode,
+                                               reason, ret_folios);
+                       /*
+                        * The rules are:
+                        *      Success: folio will be freed
+                        *      -EAGAIN: stay on the unmap_folios list
+                        *      Other errno: put on ret_folios list
+                        */
+                       switch(rc) {
+                       case -EAGAIN:
+                               retry++;
+                               thp_retry += is_thp;
+                               nr_retry_pages += nr_pages;
+                               break;
+                       case MIGRATEPAGE_SUCCESS:
+                               stats->nr_succeeded += nr_pages;
+                               stats->nr_thp_succeeded += is_thp;
+                               break;
+                       default:
+                               nr_failed++;
+                               stats->nr_thp_failed += is_thp;
+                               stats->nr_failed_pages += nr_pages;
+                               break;
+                       }
+                       dst = dst2;
+                       dst2 = list_next_entry(dst, lru);
+               }
+       }
+       nr_failed += retry;
+       stats->nr_thp_failed += thp_retry;
+       stats->nr_failed_pages += nr_retry_pages;
+
+       rc = rc_saved ? : nr_failed;
+out:
+       /* Cleanup remaining folios */
+       dst = list_first_entry(&dst_folios, struct folio, lru);
+       dst2 = list_next_entry(dst, lru);
+       list_for_each_entry_safe(folio, folio2, &unmap_folios, lru) {
+               int old_page_state = 0;
+               struct anon_vma *anon_vma = NULL;
+
+               __migrate_folio_extract(dst, &old_page_state, &anon_vma);
+               migrate_folio_undo_src(folio, old_page_state & PAGE_WAS_MAPPED,
+                                      anon_vma, true, ret_folios);
+               list_del(&dst->lru);
+               migrate_folio_undo_dst(dst, true, put_new_folio, private);
+               dst = dst2;
+               dst2 = list_next_entry(dst, lru);
+       }
+
+       return rc;
+}
+
+static int migrate_pages_sync(struct list_head *from, new_folio_t get_new_folio,
+               free_folio_t put_new_folio, unsigned long private,
+               enum migrate_mode mode, int reason,
+               struct list_head *ret_folios, struct list_head *split_folios,
+               struct migrate_pages_stats *stats)
+{
+       int rc, nr_failed = 0;
+       LIST_HEAD(folios);
+       struct migrate_pages_stats astats;
+
+       memset(&astats, 0, sizeof(astats));
+       /*
+        * Synchronous migration is done in two steps.  First try to migrate
+        * the whole list in one batch in MIGRATE_ASYNC mode.  This attempt
+        * uses the local astats and the local folios list, so that only its
+        * successes and splits are folded into the caller's stats unless it
+        * fails with an error code.
+        */
+       rc = migrate_pages_batch(from, get_new_folio, put_new_folio, private, MIGRATE_ASYNC,
+                                reason, &folios, split_folios, &astats,
+                                NR_MAX_MIGRATE_ASYNC_RETRY);
+       stats->nr_succeeded += astats.nr_succeeded;
+       stats->nr_thp_succeeded += astats.nr_thp_succeeded;
+       stats->nr_thp_split += astats.nr_thp_split;
+       stats->nr_split += astats.nr_split;
+       if (rc < 0) {
+               stats->nr_failed_pages += astats.nr_failed_pages;
+               stats->nr_thp_failed += astats.nr_thp_failed;
+               list_splice_tail(&folios, ret_folios);
+               return rc;
+       }
+       stats->nr_thp_failed += astats.nr_thp_split;    /* a split THP counts as a failed THP */
+       /*
+        * Do not count rc, as pages will be retried below.
+        * Count nr_split only, since it includes nr_thp_split.
+        */
+       nr_failed += astats.nr_split;
+       /*
+        * Fall back to migrate all failed folios one by one synchronously. All
+        * failed folios except split THPs will be retried, so their failure
+        * isn't counted.
+        */
+       list_splice_tail_init(&folios, from);
+       while (!list_empty(from)) {
+               list_move(from->next, &folios); /* one folio per non-async batch call */
+               rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+                                        private, mode, reason, ret_folios,
+                                        split_folios, stats, NR_MAX_MIGRATE_SYNC_RETRY);
+               list_splice_tail_init(&folios, ret_folios);
+               if (rc < 0)
+                       return rc;
+               nr_failed += rc;
+       }
+
+       return nr_failed;
+}
+
+/*
+ * migrate_pages - migrate the folios specified in a list, to the free folios
+ *                supplied as the target for the page migration
+ *
+ * @from:              The list of folios to be migrated.
+ * @get_new_folio:     The function used to allocate free folios to be used
+ *                     as the target of the folio migration.
+ * @put_new_folio:     The function used to free target folios if migration
+ *                     fails, or NULL if no special handling is necessary.
+ * @private:           Private data to be passed on to get_new_folio()
+ * @mode:              The migration mode that specifies the constraints for
+ *                     folio migration, if any.
+ * @reason:            The reason for folio migration.
+ * @ret_succeeded:     Set to stats.nr_succeeded if the caller passes a
+ *                     non-NULL pointer.  migrate_pages_batch() accumulates
+ *                     that counter in pages (folio_nr_pages() of each
+ *                     migrated folio), not in folios.
+ *
+ * The function returns after NR_MAX_MIGRATE_PAGES_RETRY attempts or if no folios
+ * are movable any more because the list has become empty or no retryable folios
+ * exist any more. It is caller's responsibility to call putback_movable_pages()
+ * only if ret != 0.
+ *
+ * Returns the number of {normal folio, large folio, hugetlb} that were not
+ * migrated, or an error code. The number of large folio splits will be
+ * considered as the number of non-migrated large folio, no matter how many
+ * split folios of the large folio are migrated successfully.
+ */
+int migrate_pages(struct list_head *from, new_folio_t get_new_folio,
+               free_folio_t put_new_folio, unsigned long private,
+               enum migrate_mode mode, int reason, unsigned int *ret_succeeded)
+{
+       int rc, rc_gather;
+       int nr_pages;
+       struct folio *folio, *folio2;
+       LIST_HEAD(folios);
+       LIST_HEAD(ret_folios);
+       LIST_HEAD(split_folios);
+       struct migrate_pages_stats stats;
+
+       trace_mm_migrate_pages_start(mode, reason);
+
+       memset(&stats, 0, sizeof(stats));
+
+       rc_gather = migrate_hugetlbs(from, get_new_folio, put_new_folio, private,
+                                    mode, reason, &stats, &ret_folios);
+       if (rc_gather < 0)
+               goto out;
+
+again:
+       nr_pages = 0;
+       list_for_each_entry_safe(folio, folio2, from, lru) {
+               /* Retried hugetlb folios will be kept in list */
+               if (folio_test_hugetlb(folio)) {
+                       list_move_tail(&folio->lru, &ret_folios);
+                       continue;
+               }
+
+               nr_pages += folio_nr_pages(folio);
+               if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+                       break;
+       }
+       if (nr_pages >= NR_MAX_BATCHED_MIGRATION)
+               list_cut_before(&folios, from, &folio2->lru);   /* batch = entries before folio2 */
+       else
+               list_splice_init(from, &folios);
+       if (mode == MIGRATE_ASYNC)
+               rc = migrate_pages_batch(&folios, get_new_folio, put_new_folio,
+                               private, mode, reason, &ret_folios,
+                               &split_folios, &stats,
+                               NR_MAX_MIGRATE_PAGES_RETRY);
+       else
+               rc = migrate_pages_sync(&folios, get_new_folio, put_new_folio,
+                               private, mode, reason, &ret_folios,
+                               &split_folios, &stats);
+       list_splice_tail_init(&folios, &ret_folios);
+       if (rc < 0) {
+               rc_gather = rc;
+               list_splice_tail(&split_folios, &ret_folios);
+               goto out;
+       }
+       if (!list_empty(&split_folios)) {
+               /*
+                * Failure isn't counted since all split folios of a large folio
+                * are counted as 1 failure already.  And, we only try to migrate
+                * with minimal effort, force MIGRATE_ASYNC mode and retry once.
+                */
+               migrate_pages_batch(&split_folios, get_new_folio,
+                               put_new_folio, private, MIGRATE_ASYNC, reason,
+                               &ret_folios, NULL, &stats, 1);
+               list_splice_tail_init(&split_folios, &ret_folios);
+       }
+       rc_gather += rc;
+       if (!list_empty(from))
+               goto again;
+out:
+       /*
+        * Put the permanently failed folios back on the migration list; they
          * will be put back to the right list by the caller.
          */
-       list_splice(&ret_pages, from);
+       list_splice(&ret_folios, from);
  
-       count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
-       count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
-       count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
-       count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
-       count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
-       trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
-                              nr_thp_failed, nr_thp_split, mode, reason);
+       /*
+        * Return 0 in case all split folios of fail-to-migrate large folios
+        * are migrated successfully.
+        */
+       if (list_empty(from))
+               rc_gather = 0;
+
+       count_vm_events(PGMIGRATE_SUCCESS, stats.nr_succeeded);
+       count_vm_events(PGMIGRATE_FAIL, stats.nr_failed_pages);
+       count_vm_events(THP_MIGRATION_SUCCESS, stats.nr_thp_succeeded);
+       count_vm_events(THP_MIGRATION_FAIL, stats.nr_thp_failed);
+       count_vm_events(THP_MIGRATION_SPLIT, stats.nr_thp_split);
+       trace_mm_migrate_pages(stats.nr_succeeded, stats.nr_failed_pages,
+                              stats.nr_thp_succeeded, stats.nr_thp_failed,
+                              stats.nr_thp_split, stats.nr_split, mode,
+                              reason);
  
         if (ret_succeeded)
-               *ret_succeeded = nr_succeeded;
+               *ret_succeeded = stats.nr_succeeded;
  
-       return rc;
+       return rc_gather;
  }
  
-struct page *alloc_migration_target(struct page *page, unsigned long private)
+struct folio *alloc_migration_target(struct folio *src, unsigned long private)
  {
-       struct folio *folio = page_folio(page);
         struct migration_target_control *mtc;
         gfp_t gfp_mask;
         unsigned int order = 0;
-       struct folio *new_folio = NULL;
         int nid;
         int zidx;
  
@@ -1586,31 +2015,30 @@ struct page *alloc_migration_target(struct page *page, unsigned long private)
         gfp_mask = mtc->gfp_mask;
         nid = mtc->nid;
         if (nid == NUMA_NO_NODE)
-               nid = folio_nid(folio);
+               nid = folio_nid(src);
  
-       if (folio_test_hugetlb(folio)) {
-               struct hstate *h = page_hstate(&folio->page);
+       if (folio_test_hugetlb(src)) {
+               struct hstate *h = folio_hstate(src);
  
                 gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
-               return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+               return alloc_hugetlb_folio_nodemask(h, nid,
+                                               mtc->nmask, gfp_mask);
         }
  
-       if (folio_test_large(folio)) {
+       if (folio_test_large(src)) {
                 /*
                  * clear __GFP_RECLAIM to make the migration callback
                  * consistent with regular THP allocations.
                  */
                 gfp_mask &= ~__GFP_RECLAIM;
                 gfp_mask |= GFP_TRANSHUGE;
-               order = folio_order(folio);
+               order = folio_order(src);
         }
-       zidx = zone_idx(folio_zone(folio));
+       zidx = zone_idx(folio_zone(src));
         if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
                 gfp_mask |= __GFP_HIGHMEM;
  
-       new_folio = __folio_alloc(gfp_mask, order, nid, mtc->nmask);
-
-       return &new_folio->page;
+       return __folio_alloc(gfp_mask, order, nid, mtc->nmask);
  }
  
  #ifdef CONFIG_NUMA
@@ -1626,8 +2054,7 @@ static int store_status(int __user *status, int start, int value, int nr)
         return 0;
  }
  
-static int do_move_pages_to_node(struct mm_struct *mm,
-               struct list_head *pagelist, int node)
+static int do_move_pages_to_node(struct list_head *pagelist, int node)
  {
         int err;
         struct migration_target_control mtc = {
@@ -1651,14 +2078,18 @@ static int do_move_pages_to_node(struct mm_struct *mm,
   *         target node
   *     1 - when it has been queued
   */
-static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
+static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
                 int node, struct list_head *pagelist, bool migrate_all)
  {
         struct vm_area_struct *vma;
+       unsigned long addr;
         struct page *page;
+       struct folio *folio;
         int err;
  
         mmap_read_lock(mm);
+       addr = (unsigned long)untagged_addr_remote(mm, p);
+
         err = -EFAULT;
         vma = vma_lookup(mm, addr);
         if (!vma || !vma_migratable(vma))
@@ -1672,50 +2103,47 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
                 goto out;
  
         err = -ENOENT;
-       if (!page || is_zone_device_page(page))
+       if (!page)
                 goto out;
  
+       folio = page_folio(page);
+       if (folio_is_zone_device(folio))
+               goto out_putfolio;
+
         err = 0;
-       if (page_to_nid(page) == node)
-               goto out_putpage;
+       if (folio_nid(folio) == node)
+               goto out_putfolio;
  
         err = -EACCES;
         if (page_mapcount(page) > 1 && !migrate_all)
-               goto out_putpage;
+               goto out_putfolio;
  
-       if (PageHuge(page)) {
-               if (PageHead(page)) {
-                       err = isolate_hugetlb(page, pagelist);
-                       if (!err)
-                               err = 1;
-               }
+       err = -EBUSY;
+       if (folio_test_hugetlb(folio)) {
+               if (isolate_hugetlb(folio, pagelist))
+                       err = 1;
         } else {
-               struct page *head;
-
-               head = compound_head(page);
-               err = isolate_lru_page(head);
-               if (err)
-                       goto out_putpage;
+               if (!folio_isolate_lru(folio))
+                       goto out_putfolio;
  
                 err = 1;
-               list_add_tail(&head->lru, pagelist);
-               mod_node_page_state(page_pgdat(head),
-                       NR_ISOLATED_ANON + page_is_file_lru(head),
-                       thp_nr_pages(head));
+               list_add_tail(&folio->lru, pagelist);
+               node_stat_mod_folio(folio,
+                       NR_ISOLATED_ANON + folio_is_file_lru(folio),
+                       folio_nr_pages(folio));
         }
-out_putpage:
+out_putfolio:
         /*
-        * Either remove the duplicate refcount from
-        * isolate_lru_page() or drop the page ref if it was
-        * not isolated.
+        * Either remove the duplicate refcount from folio_isolate_lru()
+        * or drop the folio ref if it was not isolated.
          */
-       put_page(page);
+       folio_put(folio);
  out:
         mmap_read_unlock(mm);
         return err;
  }
  
-static int move_pages_and_store_status(struct mm_struct *mm, int node,
+static int move_pages_and_store_status(int node,
                 struct list_head *pagelist, int __user *status,
                 int start, int i, unsigned long nr_pages)
  {
@@ -1724,7 +2152,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
         if (list_empty(pagelist))
                 return 0;
  
-       err = do_move_pages_to_node(mm, pagelist, node);
+       err = do_move_pages_to_node(pagelist, node);
         if (err) {
                 /*
                  * Positive err means the number of failed
@@ -1735,7 +2163,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
                  * well.
                  */
                 if (err > 0)
-                       err += nr_pages - i - 1;
+                       err += nr_pages - i;
                 return err;
         }
         return store_status(status, start, node, i - start);
@@ -1751,6 +2179,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                          const int __user *nodes,
                          int __user *status, int flags)
  {
+       compat_uptr_t __user *compat_pages = (void __user *)pages;
         int current_node = NUMA_NO_NODE;
         LIST_HEAD(pagelist);
         int start, i;
@@ -1760,15 +2189,22 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
  
         for (i = start = 0; i < nr_pages; i++) {
                 const void __user *p;
-               unsigned long addr;
                 int node;
  
                 err = -EFAULT;
-               if (get_user(p, pages + i))
-                       goto out_flush;
+               if (in_compat_syscall()) {
+                       compat_uptr_t cp;
+
+                       if (get_user(cp, compat_pages + i))
+                               goto out_flush;
+
+                       p = compat_ptr(cp);
+               } else {
+                       if (get_user(p, pages + i))
+                               goto out_flush;
+               }
                 if (get_user(node, nodes + i))
                         goto out_flush;
-               addr = (unsigned long)untagged_addr(p);
  
                 err = -ENODEV;
                 if (node < 0 || node >= MAX_NUMNODES)
@@ -1784,7 +2220,7 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                         current_node = node;
                         start = i;
                 } else if (node != current_node) {
-                       err = move_pages_and_store_status(mm, current_node,
+                       err = move_pages_and_store_status(current_node,
                                         &pagelist, status, start, i, nr_pages);
                         if (err)
                                 goto out;
@@ -1796,8 +2232,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                  * Errors in the page lookup or isolation are not fatal and we simply
                  * report them via status
                  */
-               err = add_page_for_migration(mm, addr, current_node,
-                               &pagelist, flags & MPOL_MF_MOVE_ALL);
+               err = add_page_for_migration(mm, p, current_node, &pagelist,
+                                            flags & MPOL_MF_MOVE_ALL);
  
                 if (err > 0) {
                         /* The page is successfully queued for migration */
@@ -1819,15 +2255,19 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                 if (err)
                         goto out_flush;
  
-               err = move_pages_and_store_status(mm, current_node, &pagelist,
+               err = move_pages_and_store_status(current_node, &pagelist,
                                 status, start, i, nr_pages);
-               if (err)
+               if (err) {
+                       /* We have accounted for page i */
+                       if (err > 0)
+                               err--;
                         goto out;
+               }
                 current_node = NUMA_NO_NODE;
         }
  out_flush:
         /* Make sure we do not overwrite the existing error */
-       err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+       err1 = move_pages_and_store_status(current_node, &pagelist,
                                 status, start, i, nr_pages);
         if (err >= 0)
                 err = err1;
@@ -1863,12 +2303,14 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
                 if (IS_ERR(page))
                         goto set_status;
  
-               if (page && !is_zone_device_page(page)) {
+               err = -ENOENT;
+               if (!page)
+                       goto set_status;
+
+               if (!is_zone_device_page(page))
                         err = page_to_nid(page);
-                       put_page(page);
-               } else {
-                       err = -ENOENT;
-               }
+
+               put_page(page);
  set_status:
                 *status = err;
  
@@ -2049,13 +2491,12 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
         return false;
  }
  
-static struct page *alloc_misplaced_dst_page(struct page *page,
+static struct folio *alloc_misplaced_dst_folio(struct folio *src,
                                            unsigned long data)
  {
         int nid = (int) data;
-       int order = compound_order(page);
+       int order = folio_order(src);
         gfp_t gfp = __GFP_THISNODE;
-       struct folio *new;
  
         if (order > 0)
                 gfp |= GFP_TRANSHUGE_LIGHT;
@@ -2064,21 +2505,12 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
                         __GFP_NOWARN;
                 gfp &= ~__GFP_RECLAIM;
         }
-       new = __folio_alloc_node(gfp, order, nid);
-
-       return &new->page;
+       return __folio_alloc_node(gfp, order, nid);
  }
  
-static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
+static int numamigrate_isolate_folio(pg_data_t *pgdat, struct folio *folio)
  {
-       int nr_pages = thp_nr_pages(page);
-       int order = compound_order(page);
-
-       VM_BUG_ON_PAGE(order && !PageTransHuge(page), page);
-
-       /* Do not migrate THP mapped by multiple processes */
-       if (PageTransHuge(page) && total_mapcount(page) > 1)
-               return 0;
+       int nr_pages = folio_nr_pages(folio);
  
         /* Avoid migrating to a node that is nearly full */
         if (!migrate_balanced_pgdat(pgdat, nr_pages)) {
@@ -2090,75 +2522,87 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
                         if (managed_zone(pgdat->node_zones + z))
                                 break;
                 }
-               wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE);
+
+               /*
+                * If no managed zone was found (z < 0), there is no valid
+                * zone to pass to wakeup_kswapd(), so do not proceed further.
+                */
+               if (z < 0)
+                       return 0;
+
+               wakeup_kswapd(pgdat->node_zones + z, 0,
+                             folio_order(folio), ZONE_MOVABLE);
                 return 0;
         }
  
-       if (isolate_lru_page(page))
+       if (!folio_isolate_lru(folio))
                 return 0;
  
-       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_is_file_lru(page),
+       node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio),
                             nr_pages);
  
         /*
-        * Isolating the page has taken another reference, so the
-        * caller's reference can be safely dropped without the page
+        * Isolating the folio has taken another reference, so the
+        * caller's reference can be safely dropped without the folio
          * disappearing underneath us during migration.
          */
-       put_page(page);
+       folio_put(folio);
         return 1;
  }
  
  /*
- * Attempt to migrate a misplaced page to the specified destination
+ * Attempt to migrate a misplaced folio to the specified destination
   * node. Caller is expected to have an elevated reference count on
- * the page that will be dropped by this function before returning.
+ * the folio that will be dropped by this function before returning.
   */
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
-                          int node)
+int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
+                           int node)
  {
         pg_data_t *pgdat = NODE_DATA(node);
         int isolated;
         int nr_remaining;
         unsigned int nr_succeeded;
         LIST_HEAD(migratepages);
-       int nr_pages = thp_nr_pages(page);
+       int nr_pages = folio_nr_pages(folio);
  
         /*
-        * Don't migrate file pages that are mapped in multiple processes
+        * Don't migrate file folios that are mapped in multiple processes
          * with execute permissions as they are probably shared libraries.
+        * To check if the folio is shared, ideally we want to make sure
+        * every page is mapped to the same process. Doing that is very
+        * expensive, so check the estimated mapcount of the folio instead.
          */
-       if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
+       if (folio_estimated_sharers(folio) != 1 && folio_is_file_lru(folio) &&
             (vma->vm_flags & VM_EXEC))
                 goto out;
  
         /*
-        * Also do not migrate dirty pages as not all filesystems can move
-        * dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
+        * Also do not migrate dirty folios as not all filesystems can move
+        * dirty folios in MIGRATE_ASYNC mode which is a waste of cycles.
          */
-       if (page_is_file_lru(page) && PageDirty(page))
+       if (folio_is_file_lru(folio) && folio_test_dirty(folio))
                 goto out;
  
-       isolated = numamigrate_isolate_page(pgdat, page);
+       isolated = numamigrate_isolate_folio(pgdat, folio);
         if (!isolated)
                 goto out;
  
-       list_add(&page->lru, &migratepages);
-       nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page,
+       list_add(&folio->lru, &migratepages);
+       nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
                                      NULL, node, MIGRATE_ASYNC,
                                      MR_NUMA_MISPLACED, &nr_succeeded);
         if (nr_remaining) {
                 if (!list_empty(&migratepages)) {
-                       list_del(&page->lru);
-                       mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
-                                       page_is_file_lru(page), -nr_pages);
-                       putback_lru_page(page);
+                       list_del(&folio->lru);
+                       node_stat_mod_folio(folio, NR_ISOLATED_ANON +
+                                       folio_is_file_lru(folio), -nr_pages);
+                       folio_putback_lru(folio);
                 }
                 isolated = 0;
         }
         if (nr_succeeded) {
                 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
-               if (!node_is_toptier(page_to_nid(page)) && node_is_toptier(node))
+               if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node))
                         mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
                                             nr_succeeded);
         }
@@ -2166,460 +2610,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
         return isolated;
  
  out:
-       put_page(page);
+       folio_put(folio);
         return 0;
  }
  #endif /* CONFIG_NUMA_BALANCING */
-
-/*
- * node_demotion[] example:
- *
- * Consider a system with two sockets.  Each socket has
- * three classes of memory attached: fast, medium and slow.
- * Each memory class is placed in its own NUMA node.  The
- * CPUs are placed in the node with the "fast" memory.  The
- * 6 NUMA nodes (0-5) might be split among the sockets like
- * this:
- *
- *     Socket A: 0, 1, 2
- *     Socket B: 3, 4, 5
- *
- * When Node 0 fills up, its memory should be migrated to
- * Node 1.  When Node 1 fills up, it should be migrated to
- * Node 2.  The migration path start on the nodes with the
- * processors (since allocations default to this node) and
- * fast memory, progress through medium and end with the
- * slow memory:
- *
- *     0 -> 1 -> 2 -> stop
- *     3 -> 4 -> 5 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *
- *     {  nr=1, nodes[0]=1 }, // Node 0 migrates to 1
- *     {  nr=1, nodes[0]=2 }, // Node 1 migrates to 2
- *     {  nr=0, nodes[0]=-1 }, // Node 2 does not migrate
- *     {  nr=1, nodes[0]=4 }, // Node 3 migrates to 4
- *     {  nr=1, nodes[0]=5 }, // Node 4 migrates to 5
- *     {  nr=0, nodes[0]=-1 }, // Node 5 does not migrate
- *
- * Moreover some systems may have multiple slow memory nodes.
- * Suppose a system has one socket with 3 memory nodes, node 0
- * is fast memory type, and node 1/2 both are slow memory
- * type, and the distance between fast memory node and slow
- * memory node is same. So the migration path should be:
- *
- *     0 -> 1/2 -> stop
- *
- * This is represented in the node_demotion[] like this:
- *     { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
- *     { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate
- *     { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
- */
-
-/*
- * Writes to this array occur without locking.  Cycles are
- * not allowed: Node X demotes to Y which demotes to X...
- *
- * If multiple reads are performed, a single rcu_read_lock()
- * must be held over all reads to ensure that no cycles are
- * observed.
- */
-#define DEFAULT_DEMOTION_TARGET_NODES 15
-
-#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
-#define DEMOTION_TARGET_NODES  (MAX_NUMNODES - 1)
-#else
-#define DEMOTION_TARGET_NODES  DEFAULT_DEMOTION_TARGET_NODES
-#endif
-
-struct demotion_nodes {
-       unsigned short nr;
-       short nodes[DEMOTION_TARGET_NODES];
-};
-
-static struct demotion_nodes *node_demotion __read_mostly;
-
-/**
- * next_demotion_node() - Get the next node in the demotion path
- * @node: The starting node to lookup the next node
- *
- * Return: node id for next memory node in the demotion path hierarchy
- * from @node; NUMA_NO_NODE if @node is terminal.  This does not keep
- * @node online or guarantee that it *continues* to be the next demotion
- * target.
- */
-int next_demotion_node(int node)
-{
-       struct demotion_nodes *nd;
-       unsigned short target_nr, index;
-       int target;
-
-       if (!node_demotion)
-               return NUMA_NO_NODE;
-
-       nd = &node_demotion[node];
-
-       /*
-        * node_demotion[] is updated without excluding this
-        * function from running.  RCU doesn't provide any
-        * compiler barriers, so the READ_ONCE() is required
-        * to avoid compiler reordering or read merging.
-        *
-        * Make sure to use RCU over entire code blocks if
-        * node_demotion[] reads need to be consistent.
-        */
-       rcu_read_lock();
-       target_nr = READ_ONCE(nd->nr);
-
-       switch (target_nr) {
-       case 0:
-               target = NUMA_NO_NODE;
-               goto out;
-       case 1:
-               index = 0;
-               break;
-       default:
-               /*
-                * If there are multiple target nodes, just select one
-                * target node randomly.
-                *
-                * In addition, we can also use round-robin to select
-                * target node, but we should introduce another variable
-                * for node_demotion[] to record last selected target node,
-                * that may cause cache ping-pong due to the changing of
-                * last target node. Or introducing per-cpu data to avoid
-                * caching issue, which seems more complicated. So selecting
-                * target node randomly seems better until now.
-                */
-               index = get_random_int() % target_nr;
-               break;
-       }
-
-       target = READ_ONCE(nd->nodes[index]);
-
-out:
-       rcu_read_unlock();
-       return target;
-}
-
-/* Disable reclaim-based migration. */
-static void __disable_all_migrate_targets(void)
-{
-       int node, i;
-
-       if (!node_demotion)
-               return;
-
-       for_each_online_node(node) {
-               node_demotion[node].nr = 0;
-               for (i = 0; i < DEMOTION_TARGET_NODES; i++)
-                       node_demotion[node].nodes[i] = NUMA_NO_NODE;
-       }
-}
-
-static void disable_all_migrate_targets(void)
-{
-       __disable_all_migrate_targets();
-
-       /*
-        * Ensure that the "disable" is visible across the system.
-        * Readers will see either a combination of before+disable
-        * state or disable+after.  They will never see before and
-        * after state together.
-        *
-        * The before+after state together might have cycles and
-        * could cause readers to do things like loop until this
-        * function finishes.  This ensures they can only see a
-        * single "bad" read and would, for instance, only loop
-        * once.
-        */
-       synchronize_rcu();
-}
-
-/*
- * Find an automatic demotion target for 'node'.
- * Failing here is OK.  It might just indicate
- * being at the end of a chain.
- */
-static int establish_migrate_target(int node, nodemask_t *used,
-                                   int best_distance)
-{
-       int migration_target, index, val;
-       struct demotion_nodes *nd;
-
-       if (!node_demotion)
-               return NUMA_NO_NODE;
-
-       nd = &node_demotion[node];
-
-       migration_target = find_next_best_node(node, used);
-       if (migration_target == NUMA_NO_NODE)
-               return NUMA_NO_NODE;
-
-       /*
-        * If the node has been set a migration target node before,
-        * which means it's the best distance between them. Still
-        * check if this node can be demoted to other target nodes
-        * if they have a same best distance.
-        */
-       if (best_distance != -1) {
-               val = node_distance(node, migration_target);
-               if (val > best_distance)
-                       goto out_clear;
-       }
-
-       index = nd->nr;
-       if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
-                     "Exceeds maximum demotion target nodes\n"))
-               goto out_clear;
-
-       nd->nodes[index] = migration_target;
-       nd->nr++;
-
-       return migration_target;
-out_clear:
-       node_clear(migration_target, *used);
-       return NUMA_NO_NODE;
-}
-
-/*
- * When memory fills up on a node, memory contents can be
- * automatically migrated to another node instead of
- * discarded at reclaim.
- *
- * Establish a "migration path" which will start at nodes
- * with CPUs and will follow the priorities used to build the
- * page allocator zonelists.
- *
- * The difference here is that cycles must be avoided.  If
- * node0 migrates to node1, then neither node1, nor anything
- * node1 migrates to can migrate to node0. Also one node can
- * be migrated to multiple nodes if the target nodes all have
- * a same best-distance against the source node.
- *
- * This function can run simultaneously with readers of
- * node_demotion[].  However, it can not run simultaneously
- * with itself.  Exclusion is provided by memory hotplug events
- * being single-threaded.
- */
-static void __set_migration_target_nodes(void)
-{
-       nodemask_t next_pass;
-       nodemask_t this_pass;
-       nodemask_t used_targets = NODE_MASK_NONE;
-       int node, best_distance;
-
-       /*
-        * Avoid any oddities like cycles that could occur
-        * from changes in the topology.  This will leave
-        * a momentary gap when migration is disabled.
-        */
-       disable_all_migrate_targets();
-
-       /*
-        * Allocations go close to CPUs, first.  Assume that
-        * the migration path starts at the nodes with CPUs.
-        */
-       next_pass = node_states[N_CPU];
-again:
-       this_pass = next_pass;
-       next_pass = NODE_MASK_NONE;
-       /*
-        * To avoid cycles in the migration "graph", ensure
-        * that migration sources are not future targets by
-        * setting them in 'used_targets'.  Do this only
-        * once per pass so that multiple source nodes can
-        * share a target node.
-        *
-        * 'used_targets' will become unavailable in future
-        * passes.  This limits some opportunities for
-        * multiple source nodes to share a destination.
-        */
-       nodes_or(used_targets, used_targets, this_pass);
-
-       for_each_node_mask(node, this_pass) {
-               best_distance = -1;
-
-               /*
-                * Try to set up the migration path for the node, and the target
-                * migration nodes can be multiple, so doing a loop to find all
-                * the target nodes if they all have a best node distance.
-                */
-               do {
-                       int target_node =
-                               establish_migrate_target(node, &used_targets,
-                                                        best_distance);
-
-                       if (target_node == NUMA_NO_NODE)
-                               break;
-
-                       if (best_distance == -1)
-                               best_distance = node_distance(node, target_node);
-
-                       /*
-                        * Visit targets from this pass in the next pass.
-                        * Eventually, every node will have been part of
-                        * a pass, and will become set in 'used_targets'.
-                        */
-                       node_set(target_node, next_pass);
-               } while (1);
-       }
-       /*
-        * 'next_pass' contains nodes which became migration
-        * targets in this pass.  Make additional passes until
-        * no more migrations targets are available.
-        */
-       if (!nodes_empty(next_pass))
-               goto again;
-}
-
-/*
- * For callers that do not hold get_online_mems() already.
- */
-void set_migration_target_nodes(void)
-{
-       get_online_mems();
-       __set_migration_target_nodes();
-       put_online_mems();
-}
-
-/*
- * This leaves migrate-on-reclaim transiently disabled between
- * the MEM_GOING_OFFLINE and MEM_OFFLINE events.  This runs
- * whether reclaim-based migration is enabled or not, which
- * ensures that the user can turn reclaim-based migration at
- * any time without needing to recalculate migration targets.
- *
- * These callbacks already hold get_online_mems().  That is why
- * __set_migration_target_nodes() can be used as opposed to
- * set_migration_target_nodes().
- */
-#ifdef CONFIG_MEMORY_HOTPLUG
-static int __meminit migrate_on_reclaim_callback(struct notifier_block *self,
-                                                unsigned long action, void *_arg)
-{
-       struct memory_notify *arg = _arg;
-
-       /*
-        * Only update the node migration order when a node is
-        * changing status, like online->offline.  This avoids
-        * the overhead of synchronize_rcu() in most cases.
-        */
-       if (arg->status_change_nid < 0)
-               return notifier_from_errno(0);
-
-       switch (action) {
-       case MEM_GOING_OFFLINE:
-               /*
-                * Make sure there are not transient states where
-                * an offline node is a migration target.  This
-                * will leave migration disabled until the offline
-                * completes and the MEM_OFFLINE case below runs.
-                */
-               disable_all_migrate_targets();
-               break;
-       case MEM_OFFLINE:
-       case MEM_ONLINE:
-               /*
-                * Recalculate the target nodes once the node
-                * reaches its final state (online or offline).
-                */
-               __set_migration_target_nodes();
-               break;
-       case MEM_CANCEL_OFFLINE:
-               /*
-                * MEM_GOING_OFFLINE disabled all the migration
-                * targets.  Reenable them.
-                */
-               __set_migration_target_nodes();
-               break;
-       case MEM_GOING_ONLINE:
-       case MEM_CANCEL_ONLINE:
-               break;
-       }
-
-       return notifier_from_errno(0);
-}
-#endif
-
-void __init migrate_on_reclaim_init(void)
-{
-       node_demotion = kcalloc(nr_node_ids,
-                               sizeof(struct demotion_nodes),
-                               GFP_KERNEL);
-       WARN_ON(!node_demotion);
-#ifdef CONFIG_MEMORY_HOTPLUG
-       hotplug_memory_notifier(migrate_on_reclaim_callback, 100);
-#endif
-       /*
-        * At this point, all numa nodes with memory/CPus have their state
-        * properly set, so we can build the demotion order now.
-        * Let us hold the cpu_hotplug lock just, as we could possibily have
-        * CPU hotplug events during boot.
-        */
-       cpus_read_lock();
-       set_migration_target_nodes();
-       cpus_read_unlock();
-}
-
-bool numa_demotion_enabled = false;
-
-#ifdef CONFIG_SYSFS
-static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
-                                         struct kobj_attribute *attr, char *buf)
-{
-       return sysfs_emit(buf, "%s\n",
-                         numa_demotion_enabled ? "true" : "false");
-}
-
-static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
-                                          struct kobj_attribute *attr,
-                                          const char *buf, size_t count)
-{
-       ssize_t ret;
-
-       ret = kstrtobool(buf, &numa_demotion_enabled);
-       if (ret)
-               return ret;
-
-       return count;
-}
-
-static struct kobj_attribute numa_demotion_enabled_attr =
-       __ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
-              numa_demotion_enabled_store);
-
-static struct attribute *numa_attrs[] = {
-       &numa_demotion_enabled_attr.attr,
-       NULL,
-};
-
-static const struct attribute_group numa_attr_group = {
-       .attrs = numa_attrs,
-};
-
-static int __init numa_init_sysfs(void)
-{
-       int err;
-       struct kobject *numa_kobj;
-
-       numa_kobj = kobject_create_and_add("numa", mm_kobj);
-       if (!numa_kobj) {
-               pr_err("failed to create numa kobject\n");
-               return -ENOMEM;
-       }
-       err = sysfs_create_group(numa_kobj, &numa_attr_group);
-       if (err) {
-               pr_err("failed to register numa group\n");
-               goto delete_obj;
-       }
-       return 0;
-
-delete_obj:
-       kobject_put(numa_kobj);
-       return err;
-}
-subsys_initcall(numa_init_sysfs);
-#endif /* CONFIG_SYSFS */
  #endif /* CONFIG_NUMA */