Merge branch 'expand-stack'
diff --git a/mm/memory.c b/mm/memory.c
index 5ce82a7..d8a9a77 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -77,6 +77,7 @@
 #include <linux/ptrace.h>
 #include <linux/vmalloc.h>
 #include <linux/sched/sysctl.h>
+#include <linux/net_mm.h>
 
 #include <trace/events/kmem.h>
 
@@ -699,15 +700,17 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
                                  struct page *page, unsigned long address,
                                  pte_t *ptep)
 {
+       pte_t orig_pte;
        pte_t pte;
        swp_entry_t entry;
 
+       orig_pte = ptep_get(ptep);
        pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
-       if (pte_swp_soft_dirty(*ptep))
+       if (pte_swp_soft_dirty(orig_pte))
                pte = pte_mksoft_dirty(pte);
 
-       entry = pte_to_swp_entry(*ptep);
-       if (pte_swp_uffd_wp(*ptep))
+       entry = pte_to_swp_entry(orig_pte);
+       if (pte_swp_uffd_wp(orig_pte))
                pte = pte_mkuffd_wp(pte);
        else if (is_writable_device_exclusive_entry(entry))
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
@@ -744,7 +747,7 @@ static int
 try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
                        unsigned long addr)
 {
-       swp_entry_t entry = pte_to_swp_entry(*src_pte);
+       swp_entry_t entry = pte_to_swp_entry(ptep_get(src_pte));
        struct page *page = pfn_swap_entry_to_page(entry);
 
        if (trylock_page(page)) {
@@ -768,9 +771,10 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
        unsigned long vm_flags = dst_vma->vm_flags;
-       pte_t pte = *src_pte;
+       pte_t orig_pte = ptep_get(src_pte);
+       pte_t pte = orig_pte;
        struct page *page;
-       swp_entry_t entry = pte_to_swp_entry(pte);
+       swp_entry_t entry = pte_to_swp_entry(orig_pte);
 
        if (likely(!non_swap_entry(entry))) {
                if (swap_duplicate(entry) < 0)
@@ -785,8 +789,8 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        spin_unlock(&mmlist_lock);
                }
                /* Mark the swap entry as shared. */
-               if (pte_swp_exclusive(*src_pte)) {
-                       pte = pte_swp_clear_exclusive(*src_pte);
+               if (pte_swp_exclusive(orig_pte)) {
+                       pte = pte_swp_clear_exclusive(orig_pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
                rss[MM_SWAPENTS]++;
@@ -805,9 +809,9 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        entry = make_readable_migration_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
-                       if (pte_swp_soft_dirty(*src_pte))
+                       if (pte_swp_soft_dirty(orig_pte))
                                pte = pte_swp_mksoft_dirty(pte);
-                       if (pte_swp_uffd_wp(*src_pte))
+                       if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
@@ -840,7 +844,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        entry = make_readable_device_private_entry(
                                                        swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
-                       if (pte_swp_uffd_wp(*src_pte))
+                       if (pte_swp_uffd_wp(orig_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
@@ -904,7 +908,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
        /* All done, just insert the new page copy in the child */
        pte = mk_pte(&new_folio->page, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
-       if (userfaultfd_pte_wp(dst_vma, *src_pte))
+       if (userfaultfd_pte_wp(dst_vma, ptep_get(src_pte)))
                /* Uffd-wp needs to be delivered to dest pte as well */
                pte = pte_mkuffd_wp(pte);
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -922,7 +926,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 {
        struct mm_struct *src_mm = src_vma->vm_mm;
        unsigned long vm_flags = src_vma->vm_flags;
-       pte_t pte = *src_pte;
+       pte_t pte = ptep_get(src_pte);
        struct page *page;
        struct folio *folio;
 
@@ -1002,6 +1006,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
        struct mm_struct *src_mm = src_vma->vm_mm;
        pte_t *orig_src_pte, *orig_dst_pte;
        pte_t *src_pte, *dst_pte;
+       pte_t ptent;
        spinlock_t *src_ptl, *dst_ptl;
        int progress, ret = 0;
        int rss[NR_MM_COUNTERS];
@@ -1012,13 +1017,25 @@ again:
        progress = 0;
        init_rss_vec(rss);
 
+       /*
+        * copy_pmd_range()'s prior pmd_none_or_clear_bad(src_pmd), and the
+        * error handling here, assume that exclusive mmap_lock on dst and src
+        * protects anon from unexpected THP transitions; with shmem and file
+        * protected by mmap_lock-less collapse skipping areas with anon_vma
+        * (whereas vma_needs_copy() skips areas without anon_vma).  A rework
+        * can remove such assumptions later, but this is good enough for now.
+        */
        dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
        if (!dst_pte) {
                ret = -ENOMEM;
                goto out;
        }
-       src_pte = pte_offset_map(src_pmd, addr);
-       src_ptl = pte_lockptr(src_mm, src_pmd);
+       src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl);
+       if (!src_pte) {
+               pte_unmap_unlock(dst_pte, dst_ptl);
+               /* ret == 0 */
+               goto out;
+       }
        spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
        orig_src_pte = src_pte;
        orig_dst_pte = dst_pte;
@@ -1035,17 +1052,18 @@ again:
                            spin_needbreak(src_ptl) || spin_needbreak(dst_ptl))
                                break;
                }
-               if (pte_none(*src_pte)) {
+               ptent = ptep_get(src_pte);
+               if (pte_none(ptent)) {
                        progress++;
                        continue;
                }
-               if (unlikely(!pte_present(*src_pte))) {
+               if (unlikely(!pte_present(ptent))) {
                        ret = copy_nonpresent_pte(dst_mm, src_mm,
                                                  dst_pte, src_pte,
                                                  dst_vma, src_vma,
                                                  addr, rss);
                        if (ret == -EIO) {
-                               entry = pte_to_swp_entry(*src_pte);
+                               entry = pte_to_swp_entry(ptep_get(src_pte));
                                break;
                        } else if (ret == -EBUSY) {
                                break;
@@ -1083,8 +1101,7 @@ again:
        } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
        arch_leave_lazy_mmu_mode();
-       spin_unlock(src_ptl);
-       pte_unmap(orig_src_pte);
+       pte_unmap_unlock(orig_src_pte, src_ptl);
        add_mm_rss_vec(dst_mm, rss);
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
@@ -1388,14 +1405,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
        swp_entry_t entry;
 
        tlb_change_page_size(tlb, PAGE_SIZE);
-again:
        init_rss_vec(rss);
-       start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
-       pte = start_pte;
+       start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       if (!pte)
+               return addr;
+
        flush_tlb_batched_pending(mm);
        arch_enter_lazy_mmu_mode();
        do {
-               pte_t ptent = *pte;
+               pte_t ptent = ptep_get(pte);
                struct page *page;
 
                if (pte_none(ptent))
@@ -1507,17 +1525,10 @@ again:
         * If we forced a TLB flush (either due to running out of
         * batch buffers or because we needed to flush dirty TLB
         * entries before releasing the ptl), free the batched
-        * memory too. Restart if we didn't do everything.
+        * memory too. Come back again if we didn't do everything.
         */
-       if (force_flush) {
-               force_flush = 0;
+       if (force_flush)
                tlb_flush_mmu(tlb);
-       }
-
-       if (addr != end) {
-               cond_resched();
-               goto again;
-       }
 
        return addr;
 }
@@ -1536,8 +1547,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE)
                                __split_huge_pmd(vma, pmd, addr, false, NULL);
-                       else if (zap_huge_pmd(tlb, vma, pmd, addr))
-                               goto next;
+                       else if (zap_huge_pmd(tlb, vma, pmd, addr)) {
+                               addr = next;
+                               continue;
+                       }
                        /* fall through */
                } else if (details && details->single_folio &&
                           folio_test_pmd_mappable(details->single_folio) &&
@@ -1550,20 +1563,14 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                         */
                        spin_unlock(ptl);
                }
-
-               /*
-                * Here there can be other concurrent MADV_DONTNEED or
-                * trans huge page faults running, and if the pmd is
-                * none or trans huge it can change under us. This is
-                * because MADV_DONTNEED holds the mmap_lock in read
-                * mode.
-                */
-               if (pmd_none_or_trans_huge_or_clear_bad(pmd))
-                       goto next;
-               next = zap_pte_range(tlb, vma, pmd, addr, next, details);
-next:
-               cond_resched();
-       } while (pmd++, addr = next, addr != end);
+               if (pmd_none(*pmd)) {
+                       addr = next;
+                       continue;
+               }
+               addr = zap_pte_range(tlb, vma, pmd, addr, next, details);
+               if (addr != next)
+                       pmd--;
+       } while (pmd++, cond_resched(), addr != end);
 
        return addr;
 }
@@ -1821,7 +1828,7 @@ static int validate_page_before_insert(struct page *page)
 static int insert_page_into_pte_locked(struct vm_area_struct *vma, pte_t *pte,
                        unsigned long addr, struct page *page, pgprot_t prot)
 {
-       if (!pte_none(*pte))
+       if (!pte_none(ptep_get(pte)))
                return -EBUSY;
        /* Ok, finally just insert the thing.. */
        get_page(page);
@@ -1905,6 +1912,10 @@ more:
                const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
 
                start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+               if (!start_pte) {
+                       ret = -EFAULT;
+                       goto out;
+               }
                for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
                        int err = insert_page_in_batch_locked(vma, pte,
                                addr, pages[curr_page_idx], prot);
@@ -2111,7 +2122,8 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
        pte = get_locked_pte(mm, addr, &ptl);
        if (!pte)
                return VM_FAULT_OOM;
-       if (!pte_none(*pte)) {
+       entry = ptep_get(pte);
+       if (!pte_none(entry)) {
                if (mkwrite) {
                        /*
                         * For read faults on private mappings the PFN passed
@@ -2123,11 +2135,11 @@ static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         * allocation and mapping invalidation so just skip the
                         * update.
                         */
-                       if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
-                               WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
+                       if (pte_pfn(entry) != pfn_t_to_pfn(pfn)) {
+                               WARN_ON_ONCE(!is_zero_pfn(pte_pfn(entry)));
                                goto out_unlock;
                        }
-                       entry = pte_mkyoung(*pte);
+                       entry = pte_mkyoung(entry);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        if (ptep_set_access_flags(vma, addr, pte, entry, 1))
                                update_mmu_cache(vma, addr, pte);
@@ -2339,7 +2351,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                return -ENOMEM;
        arch_enter_lazy_mmu_mode();
        do {
-               BUG_ON(!pte_none(*pte));
+               BUG_ON(!pte_none(ptep_get(pte)));
                if (!pfn_modify_allowed(pfn, prot)) {
                        err = -EACCES;
                        break;
@@ -2572,15 +2584,15 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
                mapped_pte = pte = (mm == &init_mm) ?
                        pte_offset_kernel(pmd, addr) :
                        pte_offset_map_lock(mm, pmd, addr, &ptl);
+               if (!pte)
+                       return -EINVAL;
        }
 
-       BUG_ON(pmd_huge(*pmd));
-
        arch_enter_lazy_mmu_mode();
 
        if (fn) {
                do {
-                       if (create || !pte_none(*pte)) {
+                       if (create || !pte_none(ptep_get(pte))) {
                                err = fn(pte++, addr, data);
                                if (err)
                                        break;
@@ -2781,10 +2793,9 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
        int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
        if (sizeof(pte_t) > sizeof(unsigned long)) {
-               spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
-               spin_lock(ptl);
-               same = pte_same(*vmf->pte, vmf->orig_pte);
-               spin_unlock(ptl);
+               spin_lock(vmf->ptl);
+               same = pte_same(ptep_get(vmf->pte), vmf->orig_pte);
+               spin_unlock(vmf->ptl);
        }
 #endif
        pte_unmap(vmf->pte);
@@ -2804,7 +2815,6 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
        int ret;
        void *kaddr;
        void __user *uaddr;
-       bool locked = false;
        struct vm_area_struct *vma = vmf->vma;
        struct mm_struct *mm = vma->vm_mm;
        unsigned long addr = vmf->address;
@@ -2830,17 +2840,18 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
         * On architectures with software "accessed" bits, we would
         * take a double page fault, so mark it accessed here.
         */
+       vmf->pte = NULL;
        if (!arch_has_hw_pte_young() && !pte_young(vmf->orig_pte)) {
                pte_t entry;
 
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-               locked = true;
-               if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+               if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /*
                         * Other thread has already handled the fault
                         * and update local tlb only
                         */
-                       update_mmu_tlb(vma, addr, vmf->pte);
+                       if (vmf->pte)
+                               update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }
@@ -2857,15 +2868,15 @@ static inline int __wp_page_copy_user(struct page *dst, struct page *src,
         * zeroes.
         */
        if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
-               if (locked)
+               if (vmf->pte)
                        goto warn;
 
                /* Re-validate under PTL if the page is still mapped */
                vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
-               locked = true;
-               if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+               if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        /* The PTE changed under us, update local tlb */
-                       update_mmu_tlb(vma, addr, vmf->pte);
+                       if (vmf->pte)
+                               update_mmu_tlb(vma, addr, vmf->pte);
                        ret = -EAGAIN;
                        goto pte_unlock;
                }
@@ -2888,7 +2899,7 @@ warn:
        ret = 0;
 
 pte_unlock:
-       if (locked)
+       if (vmf->pte)
                pte_unmap_unlock(vmf->pte, vmf->ptl);
        kunmap_atomic(kaddr);
        flush_dcache_page(dst);
@@ -3110,7 +3121,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
         * Re-check the pte - we dropped the lock
         */
        vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl);
-       if (likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+       if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                if (old_folio) {
                        if (!folio_test_anon(old_folio)) {
                                dec_mm_counter(mm, mm_counter_file(&old_folio->page));
@@ -3178,19 +3189,20 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                /* Free the old page.. */
                new_folio = old_folio;
                page_copied = 1;
-       } else {
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
+       } else if (vmf->pte) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        }
 
-       if (new_folio)
-               folio_put(new_folio);
-
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback as
         * the above ptep_clear_flush_notify() did already call it.
         */
        mmu_notifier_invalidate_range_only_end(&range);
+
+       if (new_folio)
+               folio_put(new_folio);
        if (old_folio) {
                if (page_copied)
                        free_swap_cache(&old_folio->page);
@@ -3230,11 +3242,13 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
        WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED));
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address,
                                       &vmf->ptl);
+       if (!vmf->pte)
+               return VM_FAULT_NOPAGE;
        /*
         * We might have raced with another page fault while we released the
         * pte_offset_map_lock.
         */
-       if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+       if (!pte_same(ptep_get(vmf->pte), vmf->orig_pte)) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                return VM_FAULT_NOPAGE;
@@ -3329,7 +3343,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
        struct folio *folio = NULL;
 
        if (likely(!unshare)) {
-               if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+               if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        return handle_userfault(vmf, VM_UFFD_WP);
                }
@@ -3388,8 +3402,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                        goto copy;
                if (!folio_test_lru(folio))
                        /*
-                        * Note: We cannot easily detect+handle references from
-                        * remote LRU pagevecs or references to LRU folios.
+                        * We cannot easily detect+handle references from
+                        * remote LRU caches or references to LRU folios.
                         */
                        lru_add_drain();
                if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio))
@@ -3591,10 +3605,11 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                                &vmf->ptl);
-       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+       if (likely(vmf->pte && pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte);
 
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        folio_unlock(folio);
        folio_put(folio);
 
@@ -3625,6 +3640,8 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
 {
        vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
                                       vmf->address, &vmf->ptl);
+       if (!vmf->pte)
+               return 0;
        /*
         * Be careful so that we will only recover a special uffd-wp pte into a
         * none pte.  Otherwise it means the pte could have changed, so retry.
@@ -3633,7 +3650,7 @@ static vm_fault_t pte_marker_clear(struct vm_fault *vmf)
         * quickly from a PTE_MARKER_UFFD_WP into PTE_MARKER_SWAPIN_ERROR.
         * So is_pte_marker() check is not enough to safely drop the pte.
         */
-       if (pte_same(vmf->orig_pte, *vmf->pte))
+       if (pte_same(vmf->orig_pte, ptep_get(vmf->pte)))
                pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
        pte_unmap_unlock(vmf->pte, vmf->ptl);
        return 0;
@@ -3728,10 +3745,10 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        vmf->page = pfn_swap_entry_to_page(entry);
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
-                       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
-                               spin_unlock(vmf->ptl);
-                               goto out;
-                       }
+                       if (unlikely(!vmf->pte ||
+                                    !pte_same(ptep_get(vmf->pte),
+                                                       vmf->orig_pte)))
+                               goto unlock;
 
                        /*
                         * Get a page reference while we know the page can't be
@@ -3807,7 +3824,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                         */
                        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                        vmf->address, &vmf->ptl);
-                       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+                       if (likely(vmf->pte &&
+                                  pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
                        goto unlock;
                }
@@ -3863,7 +3881,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * If we want to map a page that's in the swapcache writable, we
                 * have to detect via the refcount if we're really the exclusive
                 * owner. Try removing the extra reference from the local LRU
-                * pagevecs if required.
+                * caches if required.
                 */
                if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache &&
                    !folio_test_ksm(folio) && !folio_test_lru(folio))
@@ -3877,7 +3895,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
         */
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
-       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte)))
+       if (unlikely(!vmf->pte || !pte_same(ptep_get(vmf->pte), vmf->orig_pte)))
                goto out_nomap;
 
        if (unlikely(!folio_test_uptodate(folio))) {
@@ -4003,13 +4021,15 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
        if (si)
                put_swap_device(si);
        return ret;
 out_nomap:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
 out_page:
        folio_unlock(folio);
 out_release:
@@ -4041,22 +4061,12 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                return VM_FAULT_SIGBUS;
 
        /*
-        * Use pte_alloc() instead of pte_alloc_map().  We can't run
-        * pte_offset_map() on pmds where a huge pmd might be created
-        * from a different thread.
-        *
-        * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
-        * parallel threads are excluded by other means.
-        *
-        * Here we only have mmap_read_lock(mm).
+        * Use pte_alloc() instead of pte_alloc_map(), so that OOM can
+        * be distinguished from a transient failure of pte_offset_map().
         */
        if (pte_alloc(vma->vm_mm, vmf->pmd))
                return VM_FAULT_OOM;
 
-       /* See comment in handle_pte_fault() */
-       if (unlikely(pmd_trans_unstable(vmf->pmd)))
-               return 0;
-
        /* Use the zero-page for reads */
        if (!(vmf->flags & FAULT_FLAG_WRITE) &&
                        !mm_forbids_zeropage(vma->vm_mm)) {
@@ -4064,6 +4074,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
                                                vma->vm_page_prot));
                vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                vmf->address, &vmf->ptl);
+               if (!vmf->pte)
+                       goto unlock;
                if (vmf_pte_changed(vmf)) {
                        update_mmu_tlb(vma, vmf->address, vmf->pte);
                        goto unlock;
@@ -4104,6 +4116,8 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
 
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
                        &vmf->ptl);
+       if (!vmf->pte)
+               goto release;
        if (vmf_pte_changed(vmf)) {
                update_mmu_tlb(vma, vmf->address, vmf->pte);
                goto release;
@@ -4131,7 +4145,8 @@ setpte:
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       if (vmf->pte)
+               pte_unmap_unlock(vmf->pte, vmf->ptl);
        return ret;
 release:
        folio_put(folio);
@@ -4325,9 +4340,9 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 static bool vmf_pte_changed(struct vm_fault *vmf)
 {
        if (vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)
-               return !pte_same(*vmf->pte, vmf->orig_pte);
+               return !pte_same(ptep_get(vmf->pte), vmf->orig_pte);
 
-       return !pte_none(*vmf->pte);
+       return !pte_none(ptep_get(vmf->pte));
 }
 
 /**
@@ -4380,15 +4395,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
                        return VM_FAULT_OOM;
        }
 
-       /*
-        * See comment in handle_pte_fault() for how this scenario happens, we
-        * need to return NOPAGE so that we drop this page.
-        */
-       if (pmd_devmap_trans_unstable(vmf->pmd))
-               return VM_FAULT_NOPAGE;
-
        vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
                                      vmf->address, &vmf->ptl);
+       if (!vmf->pte)
+               return VM_FAULT_NOPAGE;
 
        /* Re-check under ptl */
        if (likely(!vmf_pte_changed(vmf))) {
@@ -4630,17 +4640,11 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
         * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
         */
        if (!vma->vm_ops->fault) {
-               /*
-                * If we find a migration pmd entry or a none pmd entry, which
-                * should never happen, return SIGBUS
-                */
-               if (unlikely(!pmd_present(*vmf->pmd)))
+               vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+                                              vmf->address, &vmf->ptl);
+               if (unlikely(!vmf->pte))
                        ret = VM_FAULT_SIGBUS;
                else {
-                       vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm,
-                                                      vmf->pmd,
-                                                      vmf->address,
-                                                      &vmf->ptl);
                        /*
                         * Make sure this is not a temporary clearing of pte
                         * by holding ptl and checking again. A R/M/W update
@@ -4648,7 +4652,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
                         * we don't have concurrent modification by hardware
                         * followed by an update.
                         */
-                       if (unlikely(pte_none(*vmf->pte)))
+                       if (unlikely(pte_none(ptep_get(vmf->pte))))
                                ret = VM_FAULT_SIGBUS;
                        else
                                ret = VM_FAULT_NOPAGE;
@@ -4703,9 +4707,8 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         * validation through pte_unmap_same(). It's of NUMA type but
         * the pfn may be screwed if the read is non atomic.
         */
-       vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
-       if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+       if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                pte_unmap_unlock(vmf->pte, vmf->ptl);
                goto out;
        }
@@ -4774,9 +4777,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                flags |= TNF_MIGRATED;
        } else {
                flags |= TNF_MIGRATE_FAIL;
-               vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
-               spin_lock(vmf->ptl);
-               if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+               vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
+                                              vmf->address, &vmf->ptl);
+               if (unlikely(!vmf->pte))
+                       goto out;
+               if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
                        pte_unmap_unlock(vmf->pte, vmf->ptl);
                        goto out;
                }
@@ -4904,39 +4909,19 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
                vmf->pte = NULL;
                vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID;
        } else {
-               /*
-                * If a huge pmd materialized under us just retry later.  Use
-                * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead
-                * of pmd_trans_huge() to ensure the pmd didn't become
-                * pmd_trans_huge under us and then back to pmd_none, as a
-                * result of MADV_DONTNEED running immediately after a huge pmd
-                * fault in a different thread of this mm, in turn leading to a
-                * misleading pmd_trans_huge() retval. All we have to ensure is
-                * that it is a regular pmd that we can walk with
-                * pte_offset_map() and we can do that through an atomic read
-                * in C, which is what pmd_trans_unstable() provides.
-                */
-               if (pmd_devmap_trans_unstable(vmf->pmd))
-                       return 0;
                /*
                 * A regular pmd is established and it can't morph into a huge
-                * pmd from under us anymore at this point because we hold the
-                * mmap_lock read mode and khugepaged takes it in write mode.
-                * So now it's safe to run pte_offset_map().
+                * pmd by anon khugepaged, since that takes mmap_lock in write
+                * mode; but shmem or file collapse to THP could still morph
+                * it into a huge pmd: just retry later if so.
                 */
-               vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
-               vmf->orig_pte = *vmf->pte;
+               vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd,
+                                                vmf->address, &vmf->ptl);
+               if (unlikely(!vmf->pte))
+                       return 0;
+               vmf->orig_pte = ptep_get_lockless(vmf->pte);
                vmf->flags |= FAULT_FLAG_ORIG_PTE_VALID;
 
-               /*
-                * some architectures can have larger ptes than wordsize,
-                * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and
-                * CONFIG_32BIT=y, so READ_ONCE cannot guarantee atomic
-                * accesses.  The code below just needs a consistent view
-                * for the ifs and we later double check anyway with the
-                * ptl lock held. So here a barrier will do.
-                */
-               barrier();
                if (pte_none(vmf->orig_pte)) {
                        pte_unmap(vmf->pte);
                        vmf->pte = NULL;
@@ -4952,10 +4937,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
        if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
                return do_numa_page(vmf);
 
-       vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
        spin_lock(vmf->ptl);
        entry = vmf->orig_pte;
-       if (unlikely(!pte_same(*vmf->pte, entry))) {
+       if (unlikely(!pte_same(ptep_get(vmf->pte), entry))) {
                update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
                goto unlock;
        }
@@ -5060,9 +5044,8 @@ retry_pud:
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
-               vmf.orig_pmd = *vmf.pmd;
+               vmf.orig_pmd = pmdp_get_lockless(vmf.pmd);
 
-               barrier();
                if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
                                          !is_pmd_migration_entry(vmf.orig_pmd));
@@ -5399,12 +5382,12 @@ retry:
        if (!vma)
                goto inval;
 
-       /* Only anonymous vmas are supported for now */
-       if (!vma_is_anonymous(vma))
+       /* Only anonymous and tcp vmas are supported for now */
+       if (!vma_is_anonymous(vma) && !vma_is_tcp(vma))
                goto inval;
 
        /* find_mergeable_anon_vma uses adjacent vmas which are not locked */
-       if (!vma->anon_vma)
+       if (!vma->anon_vma && !vma_is_tcp(vma))
                goto inval;
 
        if (!vma_start_read(vma))
@@ -5558,11 +5541,10 @@ int follow_pte(struct mm_struct *mm, unsigned long address,
        pmd = pmd_offset(pud, address);
        VM_BUG_ON(pmd_trans_huge(*pmd));
 
-       if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
-               goto out;
-
        ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
-       if (!pte_present(*ptep))
+       if (!ptep)
+               goto out;
+       if (!pte_present(ptep_get(ptep)))
                goto unlock;
        *ptepp = ptep;
        return 0;
@@ -5599,7 +5581,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
        ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
        if (ret)
                return ret;
-       *pfn = pte_pfn(*ptep);
+       *pfn = pte_pfn(ptep_get(ptep));
        pte_unmap_unlock(ptep, ptl);
        return 0;
 }
@@ -5619,7 +5601,7 @@ int follow_phys(struct vm_area_struct *vma,
 
        if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
                goto out;
-       pte = *ptep;
+       pte = ptep_get(ptep);
 
        if ((flags & FOLL_WRITE) && !pte_write(pte))
                goto unlock;
@@ -5663,7 +5645,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
 retry:
        if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
                return -EINVAL;
-       pte = *ptep;
+       pte = ptep_get(ptep);
        pte_unmap_unlock(ptep, ptl);
 
        prot = pgprot_val(pte_pgprot(pte));
@@ -5679,7 +5661,7 @@ retry:
        if (follow_pte(vma->vm_mm, addr, &ptep, &ptl))
                goto out_unmap;
 
-       if (!pte_same(pte, *ptep)) {
+       if (!pte_same(pte, ptep_get(ptep))) {
                pte_unmap_unlock(ptep, ptl);
                iounmap(maddr);
 
@@ -5706,47 +5688,47 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
 int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
                       int len, unsigned int gup_flags)
 {
-       struct vm_area_struct *vma;
        void *old_buf = buf;
        int write = gup_flags & FOLL_WRITE;
 
        if (mmap_read_lock_killable(mm))
                return 0;
 
-       /* We might need to expand the stack to access it */
-       vma = vma_lookup(mm, addr);
-       if (!vma) {
-               vma = expand_stack(mm, addr);
-               if (!vma)
-                       return 0;
-       }
-
        /* ignore errors, just check how much was successfully transferred */
        while (len) {
-               int bytes, ret, offset;
+               int bytes, offset;
                void *maddr;
-               struct page *page = NULL;
+               struct vm_area_struct *vma = NULL;
+               struct page *page = get_user_page_vma_remote(mm, addr,
+                                                            gup_flags, &vma);
+
+               if (IS_ERR_OR_NULL(page)) {
+                       /* We might need to expand the stack to access it */
+                       vma = vma_lookup(mm, addr);
+                       if (!vma) {
+                               vma = expand_stack(mm, addr);
+
+                               /* mmap_lock was dropped on failure */
+                               if (!vma)
+                                       return buf - old_buf;
+
+                               /* Try again if stack expansion worked */
+                               continue;
+                       }
+
 
-               ret = get_user_pages_remote(mm, addr, 1,
-                               gup_flags, &page, &vma, NULL);
-               if (ret <= 0) {
-#ifndef CONFIG_HAVE_IOREMAP_PROT
-                       break;
-#else
                        /*
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
-                       vma = vma_lookup(mm, addr);
-                       if (!vma)
-                               break;
+                       bytes = 0;
+#ifdef CONFIG_HAVE_IOREMAP_PROT
                        if (vma->vm_ops && vma->vm_ops->access)
-                               ret = vma->vm_ops->access(vma, addr, buf,
-                                                         len, write);
-                       if (ret <= 0)
-                               break;
-                       bytes = ret;
+                               bytes = vma->vm_ops->access(vma, addr, buf,
+                                                           len, write);
 #endif
+                       if (bytes <= 0)
+                               break;
                } else {
                        bytes = len;
                        offset = addr & (PAGE_SIZE-1);