mm/gup: retire follow_hugetlb_page()
author Peter Xu <peterx@redhat.com>
Wed, 28 Jun 2023 21:53:08 +0000 (17:53 -0400)
committer Andrew Morton <akpm@linux-foundation.org>
Fri, 18 Aug 2023 17:12:04 +0000 (10:12 -0700)
__get_user_pages() should now be well prepared to handle THP completely, as
well as hugetlb GUP requests, even without hugetlb's special path.

Time to retire follow_hugetlb_page().

Tweak misc comments to reflect the removal of follow_hugetlb_page().
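
For context, a minimal, illustrative sketch of the resulting dispatch (not
taken verbatim from mm/gup.c; the function name below is hypothetical and the
real follow_page_mask() carries a follow_page_context rather than a bare
page_mask pointer): hugetlb VMAs no longer take a dedicated branch in
__get_user_pages(), they simply flow through follow_page_mask(), which hands
them to hugetlb_follow_page_mask().

	/*
	 * Illustrative sketch only -- heavily simplified, not the actual
	 * mm/gup.c code.
	 */
	static struct page *follow_page_mask_sketch(struct vm_area_struct *vma,
						    unsigned long address,
						    unsigned int flags,
						    unsigned int *page_mask)
	{
		/*
		 * hugetlb VMAs use the hugetlb page table walker for both
		 * follow_page() and ordinary GUP; there is no separate
		 * follow_hugetlb_page() loop any more.
		 */
		if (is_vm_hugetlb_page(vma))
			return hugetlb_follow_page_mask(vma, address, flags,
							page_mask);

		/* ...regular pgd/p4d/pud/pmd walk for everything else... */
		return NULL;	/* placeholder for the rest of the walk */
	}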

Link: https://lkml.kernel.org/r/20230628215310.73782-7-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: Kirill A . Shutemov <kirill@shutemov.name>
Cc: Lorenzo Stoakes <lstoakes@gmail.com>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Mike Rapoport (IBM) <rppt@kernel.org>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
fs/userfaultfd.c
include/linux/hugetlb.h
mm/gup.c
mm/hugetlb.c

diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 7cecd49..ae711f1 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -427,7 +427,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
         *
         * We also don't do userfault handling during
         * coredumping. hugetlbfs has the special
-        * follow_hugetlb_page() to skip missing pages in the
+        * hugetlb_follow_page_mask() to skip missing pages in the
         * FOLL_DUMP case, anon memory also checks for FOLL_DUMP with
         * the no_page_table() helper in follow_page_mask(), but the
         * shmem_vm_ops->fault method is invoked even during
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index 9f282f3..9bc3c2d 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -133,9 +133,6 @@ int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                                      unsigned long address, unsigned int flags,
                                      unsigned int *page_mask);
-long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
-                        struct page **, unsigned long *, unsigned long *,
-                        long, unsigned int, int *);
 void unmap_hugepage_range(struct vm_area_struct *,
                          unsigned long, unsigned long, struct page *,
                          zap_flags_t);
@@ -305,15 +302,6 @@ static inline struct page *hugetlb_follow_page_mask(
        BUILD_BUG(); /* should never be compiled in if !CONFIG_HUGETLB_PAGE*/
 }
 
-static inline long follow_hugetlb_page(struct mm_struct *mm,
-                       struct vm_area_struct *vma, struct page **pages,
-                       unsigned long *position, unsigned long *nr_pages,
-                       long i, unsigned int flags, int *nonblocking)
-{
-       BUG();
-       return 0;
-}
-
 static inline int copy_hugetlb_page_range(struct mm_struct *dst,
                                          struct mm_struct *src,
                                          struct vm_area_struct *dst_vma,
diff --git a/mm/gup.c b/mm/gup.c
index 59e1826..2493ffa 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -819,9 +819,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
         * Call hugetlb_follow_page_mask for hugetlb vmas as it will use
         * special hugetlb page table walking code.  This eliminates the
         * need to check for hugetlb entries in the general walking code.
-        *
-        * hugetlb_follow_page_mask is only for follow_page() handling here.
-        * Ordinary GUP uses follow_hugetlb_page for hugetlb processing.
         */
        if (is_vm_hugetlb_page(vma))
                return hugetlb_follow_page_mask(vma, address, flags,
@@ -1221,22 +1218,6 @@ static long __get_user_pages(struct mm_struct *mm,
                        ret = check_vma_flags(vma, gup_flags);
                        if (ret)
                                goto out;
-
-                       if (is_vm_hugetlb_page(vma)) {
-                               i = follow_hugetlb_page(mm, vma, pages,
-                                                       &start, &nr_pages, i,
-                                                       gup_flags, locked);
-                               if (!*locked) {
-                                       /*
-                                        * We've got a VM_FAULT_RETRY
-                                        * and we've lost mmap_lock.
-                                        * We must stop here.
-                                        */
-                                       BUG_ON(gup_flags & FOLL_NOWAIT);
-                                       goto out;
-                               }
-                               continue;
-                       }
                }
 retry:
                /*
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index ab52214..e3839ee 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5721,7 +5721,6 @@ out_release_old:
 
 /*
  * Return whether there is a pagecache page to back given address within VMA.
- * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
  */
 static bool hugetlbfs_pagecache_present(struct hstate *h,
                        struct vm_area_struct *vma, unsigned long address)
@@ -6422,37 +6421,6 @@ out_release_nounlock:
 }
 #endif /* CONFIG_USERFAULTFD */
 
-static void record_subpages(struct page *page, struct vm_area_struct *vma,
-                           int refs, struct page **pages)
-{
-       int nr;
-
-       for (nr = 0; nr < refs; nr++) {
-               if (likely(pages))
-                       pages[nr] = nth_page(page, nr);
-       }
-}
-
-static inline bool __follow_hugetlb_must_fault(struct vm_area_struct *vma,
-                                              unsigned int flags, pte_t *pte,
-                                              bool *unshare)
-{
-       pte_t pteval = huge_ptep_get(pte);
-
-       *unshare = false;
-       if (is_swap_pte(pteval))
-               return true;
-       if (huge_pte_write(pteval))
-               return false;
-       if (flags & FOLL_WRITE)
-               return true;
-       if (gup_must_unshare(vma, flags, pte_page(pteval))) {
-               *unshare = true;
-               return true;
-       }
-       return false;
-}
-
 struct page *hugetlb_follow_page_mask(struct vm_area_struct *vma,
                                      unsigned long address, unsigned int flags,
                                      unsigned int *page_mask)
@@ -6524,198 +6492,6 @@ out_unlock:
        return page;
 }
 
-long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
-                        struct page **pages, unsigned long *position,
-                        unsigned long *nr_pages, long i, unsigned int flags,
-                        int *locked)
-{
-       unsigned long pfn_offset;
-       unsigned long vaddr = *position;
-       unsigned long remainder = *nr_pages;
-       struct hstate *h = hstate_vma(vma);
-       int err = -EFAULT, refs;
-
-       while (vaddr < vma->vm_end && remainder) {
-               pte_t *pte;
-               spinlock_t *ptl = NULL;
-               bool unshare = false;
-               int absent;
-               struct page *page;
-
-               /*
-                * If we have a pending SIGKILL, don't keep faulting pages and
-                * potentially allocating memory.
-                */
-               if (fatal_signal_pending(current)) {
-                       remainder = 0;
-                       break;
-               }
-
-               hugetlb_vma_lock_read(vma);
-               /*
-                * Some archs (sparc64, sh*) have multiple pte_ts to
-                * each hugepage.  We have to make sure we get the
-                * first, for the page indexing below to work.
-                *
-                * Note that page table lock is not held when pte is null.
-                */
-               pte = hugetlb_walk(vma, vaddr & huge_page_mask(h),
-                                  huge_page_size(h));
-               if (pte)
-                       ptl = huge_pte_lock(h, mm, pte);
-               absent = !pte || huge_pte_none(huge_ptep_get(pte));
-
-               /*
-                * When coredumping, it suits get_dump_page if we just return
-                * an error where there's an empty slot with no huge pagecache
-                * to back it.  This way, we avoid allocating a hugepage, and
-                * the sparse dumpfile avoids allocating disk blocks, but its
-                * huge holes still show up with zeroes where they need to be.
-                */
-               if (absent && (flags & FOLL_DUMP) &&
-                   !hugetlbfs_pagecache_present(h, vma, vaddr)) {
-                       if (pte)
-                               spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
-                       remainder = 0;
-                       break;
-               }
-
-               /*
-                * We need call hugetlb_fault for both hugepages under migration
-                * (in which case hugetlb_fault waits for the migration,) and
-                * hwpoisoned hugepages (in which case we need to prevent the
-                * caller from accessing to them.) In order to do this, we use
-                * here is_swap_pte instead of is_hugetlb_entry_migration and
-                * is_hugetlb_entry_hwpoisoned. This is because it simply covers
-                * both cases, and because we can't follow correct pages
-                * directly from any kind of swap entries.
-                */
-               if (absent ||
-                   __follow_hugetlb_must_fault(vma, flags, pte, &unshare)) {
-                       vm_fault_t ret;
-                       unsigned int fault_flags = 0;
-
-                       if (pte)
-                               spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
-
-                       if (flags & FOLL_WRITE)
-                               fault_flags |= FAULT_FLAG_WRITE;
-                       else if (unshare)
-                               fault_flags |= FAULT_FLAG_UNSHARE;
-                       if (locked) {
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-                                       FAULT_FLAG_KILLABLE;
-                               if (flags & FOLL_INTERRUPTIBLE)
-                                       fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
-                       }
-                       if (flags & FOLL_NOWAIT)
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
-                                       FAULT_FLAG_RETRY_NOWAIT;
-                       if (flags & FOLL_TRIED) {
-                               /*
-                                * Note: FAULT_FLAG_ALLOW_RETRY and
-                                * FAULT_FLAG_TRIED can co-exist
-                                */
-                               fault_flags |= FAULT_FLAG_TRIED;
-                       }
-                       ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
-                       if (ret & VM_FAULT_ERROR) {
-                               err = vm_fault_to_errno(ret, flags);
-                               remainder = 0;
-                               break;
-                       }
-                       if (ret & VM_FAULT_RETRY) {
-                               if (locked &&
-                                   !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-                                       *locked = 0;
-                               *nr_pages = 0;
-                               /*
-                                * VM_FAULT_RETRY must not return an
-                                * error, it will return zero
-                                * instead.
-                                *
-                                * No need to update "position" as the
-                                * caller will not check it after
-                                * *nr_pages is set to 0.
-                                */
-                               return i;
-                       }
-                       continue;
-               }
-
-               pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
-               page = pte_page(huge_ptep_get(pte));
-
-               VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
-                              !PageAnonExclusive(page), page);
-
-               /*
-                * If subpage information not requested, update counters
-                * and skip the same_page loop below.
-                */
-               if (!pages && !pfn_offset &&
-                   (vaddr + huge_page_size(h) < vma->vm_end) &&
-                   (remainder >= pages_per_huge_page(h))) {
-                       vaddr += huge_page_size(h);
-                       remainder -= pages_per_huge_page(h);
-                       i += pages_per_huge_page(h);
-                       spin_unlock(ptl);
-                       hugetlb_vma_unlock_read(vma);
-                       continue;
-               }
-
-               /* vaddr may not be aligned to PAGE_SIZE */
-               refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
-                   (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
-
-               if (pages)
-                       record_subpages(nth_page(page, pfn_offset),
-                                       vma, refs,
-                                       likely(pages) ? pages + i : NULL);
-
-               if (pages) {
-                       /*
-                        * try_grab_folio() should always succeed here,
-                        * because: a) we hold the ptl lock, and b) we've just
-                        * checked that the huge page is present in the page
-                        * tables. If the huge page is present, then the tail
-                        * pages must also be present. The ptl prevents the
-                        * head page and tail pages from being rearranged in
-                        * any way. As this is hugetlb, the pages will never
-                        * be p2pdma or not longterm pinable. So this page
-                        * must be available at this point, unless the page
-                        * refcount overflowed:
-                        */
-                       if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
-                                                        flags))) {
-                               spin_unlock(ptl);
-                               hugetlb_vma_unlock_read(vma);
-                               remainder = 0;
-                               err = -ENOMEM;
-                               break;
-                       }
-               }
-
-               vaddr += (refs << PAGE_SHIFT);
-               remainder -= refs;
-               i += refs;
-
-               spin_unlock(ptl);
-               hugetlb_vma_unlock_read(vma);
-       }
-       *nr_pages = remainder;
-       /*
-        * setting position is actually required only if remainder is
-        * not zero but it's faster not to add a "if (remainder)"
-        * branch.
-        */
-       *position = vaddr;
-
-       return i ? i : err;
-}
-
 long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end,
                pgprot_t newprot, unsigned long cp_flags)