mm/hugetlb: make walk_hugetlb_range() safe to pmd unshare
author     Peter Xu <peterx@redhat.com>
           Fri, 16 Dec 2022 15:52:26 +0000 (10:52 -0500)
committer  Andrew Morton <akpm@linux-foundation.org>
           Thu, 19 Jan 2023 01:12:39 +0000 (17:12 -0800)
Since walk_hugetlb_range() walks the pgtable, it needs the vma lock to
make sure the pgtable page will not be freed concurrently.
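Concretely, the mm/pagewalk.c hunk below brackets the whole walk with the
hugetlb vma lock held in read mode. A minimal sketch of the resulting
structure (details elided; see the full hunk further down):

	hugetlb_vma_lock_read(vma);
	do {
		next = hugetlb_entry_end(h, addr, end);
		pte = huge_pte_offset(walk->mm, addr & hmask, sz);
		if (pte)
			err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
		/* pte_hole handling and error checks elided */
	} while (addr = next, addr != end);
	hugetlb_vma_unlock_read(vma);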

Link: https://lkml.kernel.org/r/20221216155226.2043738-1-peterx@redhat.com
Signed-off-by: Peter Xu <peterx@redhat.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: John Hubbard <jhubbard@nvidia.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: James Houghton <jthoughton@google.com>
Cc: Jann Horn <jannh@google.com>
Cc: Miaohe Lin <linmiaohe@huawei.com>
Cc: Muchun Song <songmuchun@bytedance.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
include/linux/pagewalk.h
mm/hmm.c
mm/pagewalk.c

index 959f52e..27a6df4 100644
--- a/include/linux/pagewalk.h
+++ b/include/linux/pagewalk.h
@@ -21,7 +21,16 @@ struct mm_walk;
  *                     depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD.
  *                     Any folded depths (where PTRS_PER_P?D is equal to 1)
  *                     are skipped.
- * @hugetlb_entry:     if set, called for each hugetlb entry
+ * @hugetlb_entry:     if set, called for each hugetlb entry. This hook
+ *                     function is called with the vma lock held, in order to
+ *                     protect against a concurrent freeing of the pte_t* or
+ *                     the ptl. In some cases, the hook function needs to drop
+ *                     and retake the vma lock in order to avoid deadlocks
+ *                     while calling other functions. In such cases the hook
+ *                     function must either refrain from accessing the pte or
+ *                     ptl after dropping the vma lock, or else revalidate
+ *                     those items after re-acquiring the vma lock and before
+ *                     accessing them.
  * @test_walk:         caller specific callback function to determine whether
  *                     we walk over the current vma or not. Returning 0 means
  *                     "do page table walk over the current vma", returning
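The comment above states the rule for hugetlb_entry implementations. As an
illustration only (not part of this patch, and using hypothetical helpers),
a hook that must call something which may itself take or sleep on the vma
lock would look roughly like this:

	static int my_hugetlb_entry(pte_t *pte, unsigned long hmask,
				    unsigned long addr, unsigned long next,
				    struct mm_walk *walk)
	{
		struct vm_area_struct *vma = walk->vma;
		pte_t entry = huge_ptep_get(pte);	/* safe: vma lock still held */
		int ret;

		if (!my_needs_slow_path(entry))		/* hypothetical predicate */
			return 0;

		hugetlb_vma_unlock_read(vma);
		/*
		 * From here on @pte and its ptl may be freed by a concurrent
		 * pmd unshare; do not touch them again without revalidating.
		 */
		ret = my_slow_path(vma, addr);		/* hypothetical, may sleep */
		hugetlb_vma_lock_read(vma);
		return ret;
	}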
index 601a99c..6a151c0 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -492,8 +492,21 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
        required_fault =
                hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
        if (required_fault) {
+               int ret;
+
                spin_unlock(ptl);
-               return hmm_vma_fault(addr, end, required_fault, walk);
+               hugetlb_vma_unlock_read(vma);
+               /*
+                * Avoid deadlock: drop the vma lock before calling
+                * hmm_vma_fault(), which will itself potentially take and
+                * drop the vma lock. This is also correct from a
+                * protection point of view, because there is no further
+                * use here of either pte or ptl after dropping the vma
+                * lock.
+                */
+               ret = hmm_vma_fault(addr, end, required_fault, walk);
+               hugetlb_vma_lock_read(vma);
+               return ret;
        }
 
        pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
index 7f1c9b2..d98564a 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -302,6 +302,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
        const struct mm_walk_ops *ops = walk->ops;
        int err = 0;
 
+       hugetlb_vma_lock_read(vma);
        do {
                next = hugetlb_entry_end(h, addr, end);
                pte = huge_pte_offset(walk->mm, addr & hmask, sz);
@@ -314,6 +315,7 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
                if (err)
                        break;
        } while (addr = next, addr != end);
+       hugetlb_vma_unlock_read(vma);
 
        return err;
 }
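Callers should be unaffected by this change: the vma lock is taken and
dropped entirely inside walk_hugetlb_range(), so registering a
hugetlb_entry hook still only requires the usual mmap_lock around the
walk. A rough usage sketch, with my_hugetlb_entry being the hypothetical
hook shown earlier:

	static const struct mm_walk_ops my_walk_ops = {
		.hugetlb_entry	= my_hugetlb_entry,
	};

	mmap_read_lock(mm);
	err = walk_page_range(mm, start, end, &my_walk_ops, NULL);
	mmap_read_unlock(mm);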