Linux 6.9-rc1
[linux-2.6-microblaze.git] / mm/memory.c
index 0bfc8b0..f2bc6dd 100644
@@ -806,9 +806,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
-               page = pfn_swap_entry_to_page(entry);
+               folio = pfn_swap_entry_folio(entry);
 
-               rss[mm_counter(page)]++;
+               rss[mm_counter(folio)]++;
 
                if (!is_readable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
@@ -840,7 +840,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 * keep things as they are.
                 */
                folio_get(folio);
-               rss[mm_counter(page)]++;
+               rss[mm_counter(folio)]++;
                /* Cannot fail as these pages cannot get pinned. */
                folio_try_dup_anon_rmap_pte(folio, page, src_vma);
 
@@ -930,68 +930,111 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
        return 0;
 }
 
+static __always_inline void __copy_present_ptes(struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte,
+               pte_t pte, unsigned long addr, int nr)
+{
+       struct mm_struct *src_mm = src_vma->vm_mm;
+
+       /* If it's a COW mapping, write protect it in both processes. */
+       if (is_cow_mapping(src_vma->vm_flags) && pte_write(pte)) {
+               wrprotect_ptes(src_mm, addr, src_pte, nr);
+               pte = pte_wrprotect(pte);
+       }
+
+       /* If it's a shared mapping, mark it clean in the child. */
+       if (src_vma->vm_flags & VM_SHARED)
+               pte = pte_mkclean(pte);
+       pte = pte_mkold(pte);
+
+       if (!userfaultfd_wp(dst_vma))
+               pte = pte_clear_uffd_wp(pte);
+
+       set_ptes(dst_vma->vm_mm, addr, dst_pte, pte, nr);
+}
+
 /*
- * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
- * is required to copy this pte.
+ * Copy one present PTE, trying to batch-process subsequent PTEs that map
+ * consecutive pages of the same folio by copying them as well.
+ *
+ * Returns -EAGAIN if one preallocated page is required to copy the next PTE.
+ * Otherwise, returns the number of copied PTEs (at least 1).
  */
 static inline int
-copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
-                pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
-                struct folio **prealloc)
+copy_present_ptes(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
+                pte_t *dst_pte, pte_t *src_pte, pte_t pte, unsigned long addr,
+                int max_nr, int *rss, struct folio **prealloc)
 {
-       struct mm_struct *src_mm = src_vma->vm_mm;
-       unsigned long vm_flags = src_vma->vm_flags;
-       pte_t pte = ptep_get(src_pte);
        struct page *page;
        struct folio *folio;
+       bool any_writable;
+       fpb_t flags = 0;
+       int err, nr;
 
        page = vm_normal_page(src_vma, addr, pte);
-       if (page)
-               folio = page_folio(page);
-       if (page && folio_test_anon(folio)) {
+       if (unlikely(!page))
+               goto copy_pte;
+
+       folio = page_folio(page);
+
+       /*
+        * If we likely have to copy, just don't bother with batching. Make
+        * sure that the common "small folio" case is as fast as possible
+        * by keeping the batching logic separate.
+        */
+       if (unlikely(!*prealloc && folio_test_large(folio) && max_nr != 1)) {
+               if (src_vma->vm_flags & VM_SHARED)
+                       flags |= FPB_IGNORE_DIRTY;
+               if (!vma_soft_dirty_enabled(src_vma))
+                       flags |= FPB_IGNORE_SOFT_DIRTY;
+
+               nr = folio_pte_batch(folio, addr, src_pte, pte, max_nr, flags,
+                                    &any_writable);
+               folio_ref_add(folio, nr);
+               if (folio_test_anon(folio)) {
+                       if (unlikely(folio_try_dup_anon_rmap_ptes(folio, page,
+                                                                 nr, src_vma))) {
+                               folio_ref_sub(folio, nr);
+                               return -EAGAIN;
+                       }
+                       rss[MM_ANONPAGES] += nr;
+                       VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+               } else {
+                       folio_dup_file_rmap_ptes(folio, page, nr);
+                       rss[mm_counter_file(folio)] += nr;
+               }
+               if (any_writable)
+                       pte = pte_mkwrite(pte, src_vma);
+               __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte,
+                                   addr, nr);
+               return nr;
+       }
+
+       folio_get(folio);
+       if (folio_test_anon(folio)) {
                /*
                 * If this page may have been pinned by the parent process,
                 * copy the page immediately for the child so that we'll always
                 * guarantee the pinned page won't be randomly replaced in the
                 * future.
                 */
-               folio_get(folio);
                if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma))) {
                        /* Page may be pinned, we have to copy. */
                        folio_put(folio);
-                       return copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
-                                                addr, rss, prealloc, page);
+                       err = copy_present_page(dst_vma, src_vma, dst_pte, src_pte,
+                                               addr, rss, prealloc, page);
+                       return err ? err : 1;
                }
                rss[MM_ANONPAGES]++;
-       } else if (page) {
-               folio_get(folio);
+               VM_WARN_ON_FOLIO(PageAnonExclusive(page), folio);
+       } else {
                folio_dup_file_rmap_pte(folio, page);
-               rss[mm_counter_file(page)]++;
-       }
-
-       /*
-        * If it's a COW mapping, write protect it both
-        * in the parent and the child
-        */
-       if (is_cow_mapping(vm_flags) && pte_write(pte)) {
-               ptep_set_wrprotect(src_mm, addr, src_pte);
-               pte = pte_wrprotect(pte);
+               rss[mm_counter_file(folio)]++;
        }
-       VM_BUG_ON(page && folio_test_anon(folio) && PageAnonExclusive(page));
 
-       /*
-        * If it's a shared mapping, mark it clean in
-        * the child
-        */
-       if (vm_flags & VM_SHARED)
-               pte = pte_mkclean(pte);
-       pte = pte_mkold(pte);
-
-       if (!userfaultfd_wp(dst_vma))
-               pte = pte_clear_uffd_wp(pte);
-
-       set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
-       return 0;
+copy_pte:
+       __copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte, pte, addr, 1);
+       return 1;
 }
 
 static inline struct folio *folio_prealloc(struct mm_struct *src_mm,
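
The batching in copy_present_ptes() relies on folio_pte_batch() (defined outside this file) to report how many of the following PTEs map consecutive pages of the same large folio, so references, rss counters and the destination PTEs can all be handled for the whole run at once. Below is a rough user-space analogue of that run-detection idea only, with hypothetical names (count_consecutive_run) and plain PFNs standing in for PTEs; it is a sketch, not the kernel implementation.

/*
 * Illustrative only: a user-space analogue of the run detection that
 * folio_pte_batch() performs on real PTEs. Names and types here are
 * hypothetical, not kernel API.
 */
#include <stdio.h>

/*
 * Count how many entries, starting at pfns[0], form a run of consecutive
 * page-frame numbers, capped at max_nr (always returns at least 1).
 */
static int count_consecutive_run(const unsigned long *pfns, int max_nr)
{
	int nr = 1;

	while (nr < max_nr && pfns[nr] == pfns[0] + nr)
		nr++;
	return nr;
}

int main(void)
{
	unsigned long pfns[] = { 100, 101, 102, 200, 201 };

	/* Prints 3: entries 100..102 can be processed as one batch. */
	printf("%d\n", count_consecutive_run(pfns, 5));
	return 0;
}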
@@ -1028,10 +1071,11 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        pte_t *src_pte, *dst_pte;
        pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
-       int progress, ret = 0;
+       int progress, max_nr, ret = 0;
        int rss[NR_MM_COUNTERS];
        swp_entry_t entry = (swp_entry_t){0};
        struct folio *prealloc = NULL;
+       int nr;
 
 again:
        progress = 0;
@@ -1062,6 +1106,8 @@ again:
        arch_enter_lazy_mmu_mode();
 
        do {
+               nr = 1;
+
                /*
                 * We are holding two locks at this point - either of them
                 * could generate latencies in another task on another CPU.
@@ -1091,6 +1137,8 @@ again:
                                progress += 8;
                                continue;
                        }
+                       ptent = ptep_get(src_pte);
+                       VM_WARN_ON_ONCE(!pte_present(ptent));
 
                        /*
                         * Device exclusive entry restored, continue by copying
@@ -1098,9 +1146,10 @@ again:
                         */
                        WARN_ON_ONCE(ret != -ENOENT);
                }
-               /* copy_present_pte() will clear `*prealloc' if consumed */
-               ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
-                                      addr, rss, &prealloc);
+               /* copy_present_ptes() will clear `*prealloc' if consumed */
+               max_nr = (end - addr) / PAGE_SIZE;
+               ret = copy_present_ptes(dst_vma, src_vma, dst_pte, src_pte,
+                                       ptent, addr, max_nr, rss, &prealloc);
                /*
                 * If we need a pre-allocated page for this pte, drop the
                 * locks, allocate, and try again.
@@ -1117,8 +1166,10 @@ again:
                        folio_put(prealloc);
                        prealloc = NULL;
                }
-               progress += 8;
-       } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
+               nr = ret;
+               progress += 8 * nr;
+       } while (dst_pte += nr, src_pte += nr, addr += PAGE_SIZE * nr,
+                addr != end);
 
        arch_leave_lazy_mmu_mode();
        pte_unmap_unlock(orig_src_pte, src_ptl);
@@ -1139,7 +1190,7 @@ again:
                prealloc = folio_prealloc(src_mm, src_vma, addr, false);
                if (!prealloc)
                        return -ENOMEM;
-       } else if (ret) {
+       } else if (ret < 0) {
                VM_WARN_ON_ONCE(1);
        }
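
The copy helper's return convention also changed from "0 on success" to "number of PTEs handled (at least 1), or a negative errno such as -EAGAIN", which is why copy_pte_range() now advances dst_pte, src_pte and addr by the returned count and only treats negative values as errors. A minimal sketch of that convention under assumed, hypothetical names (copy_step), not kernel code:

/*
 * Illustrative only: the caller-side convention where a step function
 * returns either a negative error or the number of items it consumed
 * (at least 1).
 */
#include <stdio.h>

static int copy_step(const int *src, int *dst, int max_nr)
{
	int nr = (max_nr >= 4) ? 4 : 1;	/* pretend we can batch up to 4 */

	for (int i = 0; i < nr; i++)
		dst[i] = src[i];
	return nr;
}

int main(void)
{
	int src[10] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, dst[10] = { 0 };
	int i = 0, ret;

	while (i < 10) {
		ret = copy_step(&src[i], &dst[i], 10 - i);
		if (ret < 0)		/* only negative values are errors */
			return -ret;
		i += ret;		/* advance by however many were handled */
	}
	printf("copied %d entries\n", i);
	return 0;
}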
 
@@ -1369,19 +1420,16 @@ static inline bool should_zap_cows(struct zap_details *details)
        return details->even_cows;
 }
 
-/* Decides whether we should zap this page with the page pointer specified */
-static inline bool should_zap_page(struct zap_details *details, struct page *page)
+/* Decides whether we should zap this folio with the folio pointer specified */
+static inline bool should_zap_folio(struct zap_details *details,
+                                   struct folio *folio)
 {
-       /* If we can make a decision without *page.. */
+       /* If we can make a decision without *folio.. */
        if (should_zap_cows(details))
                return true;
 
-       /* E.g. the caller passes NULL for the case of a zero page */
-       if (!page)
-               return true;
-
-       /* Otherwise we should only zap non-anon pages */
-       return !PageAnon(page);
+       /* Otherwise we should only zap non-anon folios */
+       return !folio_test_anon(folio);
 }
 
 static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
@@ -1398,7 +1446,7 @@ static inline bool zap_drop_file_uffd_wp(struct zap_details *details)
  */
 static inline void
 zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
-                             unsigned long addr, pte_t *pte,
+                             unsigned long addr, pte_t *pte, int nr,
                              struct zap_details *details, pte_t pteval)
 {
        /* Zap on anonymous always means dropping everything */
@@ -1408,7 +1456,111 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma,
        if (zap_drop_file_uffd_wp(details))
                return;
 
-       pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+       for (;;) {
+               /* the PFN in the PTE is irrelevant. */
+               pte_install_uffd_wp_if_needed(vma, addr, pte, pteval);
+               if (--nr == 0)
+                       break;
+               pte++;
+               addr += PAGE_SIZE;
+       }
+}
+
+static __always_inline void zap_present_folio_ptes(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, struct folio *folio,
+               struct page *page, pte_t *pte, pte_t ptent, unsigned int nr,
+               unsigned long addr, struct zap_details *details, int *rss,
+               bool *force_flush, bool *force_break)
+{
+       struct mm_struct *mm = tlb->mm;
+       bool delay_rmap = false;
+
+       if (!folio_test_anon(folio)) {
+               ptent = get_and_clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+               if (pte_dirty(ptent)) {
+                       folio_mark_dirty(folio);
+                       if (tlb_delay_rmap(tlb)) {
+                               delay_rmap = true;
+                               *force_flush = true;
+                       }
+               }
+               if (pte_young(ptent) && likely(vma_has_recency(vma)))
+                       folio_mark_accessed(folio);
+               rss[mm_counter(folio)] -= nr;
+       } else {
+               /* We don't need up-to-date accessed/dirty bits. */
+               clear_full_ptes(mm, addr, pte, nr, tlb->fullmm);
+               rss[MM_ANONPAGES] -= nr;
+       }
+       /* Checking a single PTE in a batch is sufficient. */
+       arch_check_zapped_pte(vma, ptent);
+       tlb_remove_tlb_entries(tlb, pte, nr, addr);
+       if (unlikely(userfaultfd_pte_wp(vma, ptent)))
+               zap_install_uffd_wp_if_needed(vma, addr, pte, nr, details,
+                                             ptent);
+
+       if (!delay_rmap) {
+               folio_remove_rmap_ptes(folio, page, nr, vma);
+
+               /* Only sanity-check the first page in a batch. */
+               if (unlikely(page_mapcount(page) < 0))
+                       print_bad_pte(vma, addr, ptent, page);
+       }
+       if (unlikely(__tlb_remove_folio_pages(tlb, page, nr, delay_rmap))) {
+               *force_flush = true;
+               *force_break = true;
+       }
+}
+
+/*
+ * Zap or skip at least one present PTE, trying to batch-process subsequent
+ * PTEs that map consecutive pages of the same folio.
+ *
+ * Returns the number of processed (skipped or zapped) PTEs (at least 1).
+ */
+static inline int zap_present_ptes(struct mmu_gather *tlb,
+               struct vm_area_struct *vma, pte_t *pte, pte_t ptent,
+               unsigned int max_nr, unsigned long addr,
+               struct zap_details *details, int *rss, bool *force_flush,
+               bool *force_break)
+{
+       const fpb_t fpb_flags = FPB_IGNORE_DIRTY | FPB_IGNORE_SOFT_DIRTY;
+       struct mm_struct *mm = tlb->mm;
+       struct folio *folio;
+       struct page *page;
+       int nr;
+
+       page = vm_normal_page(vma, addr, ptent);
+       if (!page) {
+               /* We don't need up-to-date accessed/dirty bits. */
+               ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm);
+               arch_check_zapped_pte(vma, ptent);
+               tlb_remove_tlb_entry(tlb, pte, addr);
+               VM_WARN_ON_ONCE(userfaultfd_wp(vma));
+               ksm_might_unmap_zero_page(mm, ptent);
+               return 1;
+       }
+
+       folio = page_folio(page);
+       if (unlikely(!should_zap_folio(details, folio)))
+               return 1;
+
+       /*
+        * Make sure that the common "small folio" case is as fast as possible
+        * by keeping the batching logic separate.
+        */
+       if (unlikely(folio_test_large(folio) && max_nr != 1)) {
+               nr = folio_pte_batch(folio, addr, pte, ptent, max_nr, fpb_flags,
+                                    NULL);
+
+               zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, nr,
+                                      addr, details, rss, force_flush,
+                                      force_break);
+               return nr;
+       }
+       zap_present_folio_ptes(tlb, vma, folio, page, pte, ptent, 1, addr,
+                              details, rss, force_flush, force_break);
+       return 1;
 }
 
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
@@ -1416,13 +1568,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                unsigned long addr, unsigned long end,
                                struct zap_details *details)
 {
+       bool force_flush = false, force_break = false;
        struct mm_struct *mm = tlb->mm;
-       int force_flush = 0;
        int rss[NR_MM_COUNTERS];
        spinlock_t *ptl;
        pte_t *start_pte;
        pte_t *pte;
        swp_entry_t entry;
+       int nr;
 
        tlb_change_page_size(tlb, PAGE_SIZE);
        init_rss_vec(rss);
@@ -1436,7 +1589,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                pte_t ptent = ptep_get(pte);
                struct folio *folio;
                struct page *page;
+               int max_nr;
 
+               nr = 1;
                if (pte_none(ptent))
                        continue;
 
@@ -1444,44 +1599,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        break;
 
                if (pte_present(ptent)) {
-                       unsigned int delay_rmap;
-
-                       page = vm_normal_page(vma, addr, ptent);
-                       if (unlikely(!should_zap_page(details, page)))
-                               continue;
-                       ptent = ptep_get_and_clear_full(mm, addr, pte,
-                                                       tlb->fullmm);
-                       arch_check_zapped_pte(vma, ptent);
-                       tlb_remove_tlb_entry(tlb, pte, addr);
-                       zap_install_uffd_wp_if_needed(vma, addr, pte, details,
-                                                     ptent);
-                       if (unlikely(!page)) {
-                               ksm_might_unmap_zero_page(mm, ptent);
-                               continue;
-                       }
-
-                       folio = page_folio(page);
-                       delay_rmap = 0;
-                       if (!folio_test_anon(folio)) {
-                               if (pte_dirty(ptent)) {
-                                       folio_mark_dirty(folio);
-                                       if (tlb_delay_rmap(tlb)) {
-                                               delay_rmap = 1;
-                                               force_flush = 1;
-                                       }
-                               }
-                               if (pte_young(ptent) && likely(vma_has_recency(vma)))
-                                       folio_mark_accessed(folio);
-                       }
-                       rss[mm_counter(page)]--;
-                       if (!delay_rmap) {
-                               folio_remove_rmap_pte(folio, page, vma);
-                               if (unlikely(page_mapcount(page) < 0))
-                                       print_bad_pte(vma, addr, ptent, page);
-                       }
-                       if (unlikely(__tlb_remove_page(tlb, page, delay_rmap))) {
-                               force_flush = 1;
-                               addr += PAGE_SIZE;
+                       max_nr = (end - addr) / PAGE_SIZE;
+                       nr = zap_present_ptes(tlb, vma, pte, ptent, max_nr,
+                                             addr, details, rss, &force_flush,
+                                             &force_break);
+                       if (unlikely(force_break)) {
+                               addr += nr * PAGE_SIZE;
                                break;
                        }
                        continue;
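
The zap loop follows the same shape: each iteration consumes nr entries, and when zap_present_ptes() requests an early stop via force_break, addr is advanced by the zapped amount before breaking so the range already processed is not lost. A small standalone sketch of that "record progress, then break" pattern, with hypothetical names and no kernel APIs:

/*
 * Illustrative only: handle work in chunks and record progress before an
 * early break, mirroring the force_break handling in zap_pte_range().
 */
#include <stdbool.h>
#include <stdio.h>

/* Handle up to 'remaining' entries; ask the caller to stop via *stop. */
static int handle_some(int remaining, bool *stop)
{
	int nr = remaining > 3 ? 3 : remaining;

	if (nr == 3)
		*stop = true;	/* e.g. the TLB batch filled up */
	return nr;
}

int main(void)
{
	int total = 10, done = 0;

	while (done < total) {
		bool stop = false;
		int nr = handle_some(total - done, &stop);

		done += nr;	/* count the work even when breaking early */
		if (stop)
			break;
	}
	printf("stopped after %d of %d entries\n", done, total);
	return 0;
}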
@@ -1492,7 +1615,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                    is_device_exclusive_entry(entry)) {
                        page = pfn_swap_entry_to_page(entry);
                        folio = page_folio(page);
-                       if (unlikely(!should_zap_page(details, page)))
+                       if (unlikely(!should_zap_folio(details, folio)))
                                continue;
                        /*
                         * Both device private/exclusive mappings should only
@@ -1501,7 +1624,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         * see zap_install_uffd_wp_if_needed().
                         */
                        WARN_ON_ONCE(!vma_is_anonymous(vma));
-                       rss[mm_counter(page)]--;
+                       rss[mm_counter(folio)]--;
                        if (is_device_private_entry(entry))
                                folio_remove_rmap_pte(folio, page, vma);
                        folio_put(folio);
@@ -1513,10 +1636,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        if (unlikely(!free_swap_and_cache(entry)))
                                print_bad_pte(vma, addr, ptent, NULL);
                } else if (is_migration_entry(entry)) {
-                       page = pfn_swap_entry_to_page(entry);
-                       if (!should_zap_page(details, page))
+                       folio = pfn_swap_entry_folio(entry);
+                       if (!should_zap_folio(details, folio))
                                continue;
-                       rss[mm_counter(page)]--;
+                       rss[mm_counter(folio)]--;
                } else if (pte_marker_entry_uffd_wp(entry)) {
                        /*
                         * For anon: always drop the marker; for file: only
@@ -1535,8 +1658,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                        WARN_ON_ONCE(1);
                }
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-               zap_install_uffd_wp_if_needed(vma, addr, pte, details, ptent);
-       } while (pte++, addr += PAGE_SIZE, addr != end);
+               zap_install_uffd_wp_if_needed(vma, addr, pte, 1, details, ptent);
+       } while (pte += nr, addr += PAGE_SIZE * nr, addr != end);
 
        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();
@@ -1870,7 +1993,7 @@ static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                return -EBUSY;
        /* Ok, finally just insert the thing.. */
        folio_get(folio);
-       inc_mm_counter(vma->vm_mm, mm_counter_file(page));
+       inc_mm_counter(vma->vm_mm, mm_counter_file(folio));
        folio_add_file_rmap_pte(folio, page, vma);
        set_pte_at(vma->vm_mm, addr, pte, mk_pte(page, prot));
        return 0;
@@ -3081,7 +3204,7 @@ static inline vm_fault_t vmf_can_call_fault(const struct vm_fault *vmf)
        return VM_FAULT_RETRY;
 }
 
-static vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
+vm_fault_t vmf_anon_prepare(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
 
@@ -3175,7 +3298,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
        if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
-                               dec_mm_counter(mm, mm_counter_file(&old_folio->page));
+                               dec_mm_counter(mm, mm_counter_file(old_folio));
                                inc_mm_counter(mm, MM_ANONPAGES);
                        }
                } else {
@@ -3253,7 +3376,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
-                       free_swap_cache(&old_folio->page);
+                       free_swap_cache(old_folio);
                folio_put(old_folio);
        }
 
@@ -3375,6 +3498,16 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf, struct folio *folio)
 static bool wp_can_reuse_anon_folio(struct folio *folio,
                                    struct vm_area_struct *vma)
 {
+       /*
+        * We could currently only reuse a subpage of a large folio if no
+        * other subpages of the large folio are still mapped. However,
+        * let's just consistently not reuse subpages even if we could
+        * reuse in that scenario, and give back a large folio a bit
+        * sooner.
+        */
+       if (folio_test_large(folio))
+               return false;
+
        /*
         * We have to verify under folio lock: these early checks are
         * just an optimization to avoid locking the folio and freeing
@@ -4170,8 +4303,8 @@ static bool pte_range_none(pte_t *pte, int nr_pages)
 
 static struct folio *alloc_anon_folio(struct vm_fault *vmf)
 {
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        struct vm_area_struct *vma = vmf->vma;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        unsigned long orders;
        struct folio *folio;
        unsigned long addr;
@@ -4223,15 +4356,21 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf)
                addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
                folio = vma_alloc_folio(gfp, order, vma, addr, true);
                if (folio) {
+                       if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) {
+                               folio_put(folio);
+                               goto next;
+                       }
+                       folio_throttle_swaprate(folio, gfp);
                        clear_huge_page(&folio->page, vmf->address, 1 << order);
                        return folio;
                }
+next:
                order = next_order(&orders, order);
        }
 
 fallback:
 #endif
-       return vma_alloc_zeroed_movable_folio(vmf->vma, vmf->address);
+       return folio_prealloc(vma->vm_mm, vma, vmf->address, true);
 }
 
 /*
@@ -4298,10 +4437,6 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        nr_pages = folio_nr_pages(folio);
        addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
 
-       if (mem_cgroup_charge(folio, vma->vm_mm, GFP_KERNEL))
-               goto oom_free_page;
-       folio_throttle_swaprate(folio, GFP_KERNEL);
-
        /*
         * The memory barrier inside __folio_mark_uptodate makes sure that
         * preceding stores to the page contents become visible before
@@ -4355,8 +4490,6 @@ unlock:
 release:
        folio_put(folio);
        goto unlock;
-oom_free_page:
-       folio_put(folio);
 oom:
        return VM_FAULT_OOM;
 }
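
With the two hunks above, the memcg charge happens inside alloc_anon_folio()'s order-fallback loop rather than afterwards in do_anonymous_page(), so a failed charge for a large order falls back to a smaller allocation instead of failing the fault. A toy sketch of that retry shape, using hypothetical try_alloc()/try_charge() helpers rather than kernel API:

/*
 * Illustrative only: charge inside the order-fallback loop; on charge
 * failure release the allocation and retry with the next smaller order.
 */
#include <stdio.h>
#include <stdlib.h>

static void *try_alloc(int order)
{
	return malloc(4096UL << order);	/* stand-in for vma_alloc_folio() */
}

static int try_charge(int order)
{
	return order > 2 ? -1 : 0;	/* pretend the limit allows <= 4 pages */
}

int main(void)
{
	for (int order = 4; order >= 0; order--) {
		void *p = try_alloc(order);

		if (!p)
			continue;	/* allocation failed, try smaller */
		if (try_charge(order)) {
			free(p);	/* charge failed: give it back ... */
			continue;	/* ... and fall back to a smaller order */
		}
		printf("allocated and charged order %d\n", order);
		free(p);
		return 0;
	}
	return 1;			/* nothing worked */
}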
@@ -4480,7 +4613,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
        if (write)
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 
-       add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR);
+       add_mm_counter(vma->vm_mm, mm_counter_file(folio), HPAGE_PMD_NR);
        folio_add_file_rmap_pmd(folio, page, vma);
 
        /*
@@ -4543,7 +4676,7 @@ void set_pte_range(struct vm_fault *vmf, struct folio *folio,
                folio_add_new_anon_rmap(folio, vma, addr);
                folio_add_lru_vma(folio, vma);
        } else {
-               add_mm_counter(vma->vm_mm, mm_counter_file(page), nr);
+               add_mm_counter(vma->vm_mm, mm_counter_file(folio), nr);
                folio_add_file_rmap_ptes(folio, page, nr, vma);
        }
        set_ptes(vma->vm_mm, addr, vmf->pte, entry, nr);
@@ -4653,7 +4786,8 @@ static int fault_around_bytes_set(void *data, u64 val)
         * The minimum value is 1 page, however this results in no fault-around
         * at all. See should_fault_around().
         */
-       fault_around_pages = max(rounddown_pow_of_two(val) >> PAGE_SHIFT, 1UL);
+       val = max(val, PAGE_SIZE);
+       fault_around_pages = rounddown_pow_of_two(val) >> PAGE_SHIFT;
 
        return 0;
 }
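
The fault_around_bytes change clamps the written value to at least PAGE_SIZE before rounding, presumably so that a value below one page (including 0, for which rounddown_pow_of_two() is undefined) can no longer reach the rounding helper, and fault_around_pages always comes out as at least 1. A standalone illustration of the new arithmetic, assuming a 4 KiB page size and a plain reimplementation of the rounding helper:

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

/* Mimics the kernel helper for nonzero inputs: largest power of two <= x. */
static unsigned long rounddown_pow_of_two(unsigned long x)
{
	unsigned long p = 1;

	while (p * 2 <= x)
		p *= 2;
	return p;
}

int main(void)
{
	unsigned long val = 0;	/* e.g. userspace writes 0 to the debugfs file */

	/* Clamp first, so even a written value of 0 yields at least one page. */
	val = (val > PAGE_SIZE) ? val : PAGE_SIZE;
	printf("fault_around_pages = %lu\n",
	       rounddown_pow_of_two(val) >> PAGE_SHIFT);	/* prints 1 */
	return 0;
}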
@@ -4928,18 +5062,18 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        int flags = 0;
 
        /*
-        * The "pte" at this point cannot be used safely without
-        * validation through pte_unmap_same(). It's of NUMA type but
-        * the pfn may be screwed if the read is non atomic.
+        * The pte cannot be used safely until we verify, while holding the page
+        * table lock, that its contents have not changed during fault handling.
         */
        spin_lock(vmf->ptl);
-       if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+       /* Read the live PTE from the page tables: */
+       old_pte = ptep_get(vmf->pte);
+
+       if (unlikely(!pte_same(old_pte, vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }
 
-       /* Get the normal PTE  */
-       old_pte = ptep_get(vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
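
do_numa_page() now re-reads the live PTE only after taking the page-table lock and compares it against the snapshot in vmf->orig_pte, bailing out if anything changed in between. The same "validate the snapshot under the lock" pattern in a self-contained user-space form, with hypothetical names (compile with -pthread); a sketch of the idea, not the kernel code:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long shared_entry = 42;	/* stands in for the PTE */

static bool try_handle(unsigned long snapshot)
{
	bool ok;

	pthread_mutex_lock(&lock);
	ok = (shared_entry == snapshot);	/* like pte_same(live, orig) */
	if (ok) {
		/* safe to act on the entry: nobody changed it in between */
	}
	pthread_mutex_unlock(&lock);
	return ok;
}

int main(void)
{
	unsigned long snapshot = shared_entry;	/* unlocked read, like orig_pte */

	printf(try_handle(snapshot) ? "handled\n" : "changed, retry\n");
	return 0;
}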
 
        /*
@@ -6163,7 +6297,7 @@ static int clear_subpage(unsigned long addr, int idx, void *arg)
 {
        struct page *page = arg;
 
-       clear_user_highpage(page + idx, addr);
+       clear_user_highpage(nth_page(page, idx), addr);
        return 0;
 }
 
@@ -6213,10 +6347,11 @@ struct copy_subpage_arg {
 static int copy_subpage(unsigned long addr, int idx, void *arg)
 {
        struct copy_subpage_arg *copy_arg = arg;
+       struct page *dst = nth_page(copy_arg->dst, idx);
+       struct page *src = nth_page(copy_arg->src, idx);
 
-       if (copy_mc_user_highpage(copy_arg->dst + idx, copy_arg->src + idx,
-                                 addr, copy_arg->vma)) {
-               memory_failure_queue(page_to_pfn(copy_arg->src + idx), 0);
+       if (copy_mc_user_highpage(dst, src, addr, copy_arg->vma)) {
+               memory_failure_queue(page_to_pfn(src), 0);
                return -EHWPOISON;
        }
        return 0;