diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 57b7b0b..582ec75 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1014,15 +1014,23 @@ void hugetlb_dup_vma_private(struct vm_area_struct *vma)
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
        /*
         * Clear vm_private_data
+        * - For shared mappings this is a per-vma semaphore that may be
+        *   allocated in a subsequent call to hugetlb_vm_op_open.
+        *   Before clearing, make sure the pointer is not associated with
+        *   this vma, as clearing an associated lock would leak the
+        *   structure.  This is the case when called via
+        *   clear_vma_resv_huge_pages() and hugetlb_vm_op_open has already
+        *   been called to allocate a new structure.
         * - For MAP_PRIVATE mappings, this is the reserve map which does
         *   not apply to children.  Faults generated by the children are
         *   not guaranteed to succeed, even if read-only.
-        * - For shared mappings this is a per-vma semaphore that may be
-        *   allocated in a subsequent call to hugetlb_vm_op_open.
         */
-       vma->vm_private_data = (void *)0;
-       if (!(vma->vm_flags & VM_MAYSHARE))
-               return;
+       if (vma->vm_flags & VM_MAYSHARE) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               if (vma_lock && vma_lock->vma != vma)
+                       vma->vm_private_data = NULL;
+       } else
+               vma->vm_private_data = NULL;
 }
 
 /*
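Aside, not part of the patch: a minimal userspace sketch of the ownership test this hunk adds to hugetlb_dup_vma_private(). The names (per_owner_lock, dup_private, struct owner) are illustrative, not kernel APIs; the point is only that a pointer copied by structure assignment may be dropped, while one whose back-pointer identifies us as the owner must not be, or it leaks.

#include <stdio.h>
#include <stdlib.h>

struct owner;

/* Stands in for struct hugetlb_vma_lock: it records which object owns it. */
struct per_owner_lock {
	struct owner *owner;
};

/* Stands in for a vma; .lock plays the role of vm_private_data. */
struct owner {
	struct per_owner_lock *lock;
};

/*
 * Same shape as the hugetlb_dup_vma_private() hunk above: clear only a
 * stale pointer copied from another object; clearing a lock we own would
 * leak it.
 */
static void dup_private(struct owner *o)
{
	struct per_owner_lock *lock = o->lock;

	if (lock && lock->owner != o)
		o->lock = NULL;
}

int main(void)
{
	struct owner src = { 0 }, dst;

	src.lock = malloc(sizeof(*src.lock));
	if (!src.lock)
		return 1;
	src.lock->owner = &src;

	dst = src;		/* structure copy, like vm_area_dup() */
	dup_private(&dst);	/* copied pointer is dropped, nothing leaks */

	printf("dst.lock=%p (expected (nil))\n", (void *)dst.lock);
	free(src.lock);
	return 0;
}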
@@ -2924,11 +2932,11 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                if (!page)
                        goto out_uncharge_cgroup;
+               spin_lock_irq(&hugetlb_lock);
                if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
                        SetHPageRestoreReserve(page);
                        h->resv_huge_pages--;
                }
-               spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
                set_page_refcounted(page);
                /* Fall through */
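Aside, not part of the patch: the hunk above moves the hugetlb_lock acquisition so that the vma_has_reserves() check and the h->resv_huge_pages decrement happen with the lock held. A minimal pthread sketch of the same rule, with illustrative names (resv_pages, consume_reserve) and nothing assumed beyond POSIX threads:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long resv_pages = 8;	/* stands in for h->resv_huge_pages */

/* Take the lock before the check-and-decrement, not after it. */
static bool consume_reserve(bool has_reserve)
{
	bool consumed = false;

	pthread_mutex_lock(&lock);
	if (has_reserve && resv_pages > 0) {
		resv_pages--;	/* read-modify-write is now protected */
		consumed = true;
	}
	pthread_mutex_unlock(&lock);

	return consumed;
}

int main(void)
{
	bool ok = consume_reserve(true);

	printf("consumed=%d, resv_pages=%ld\n", ok, resv_pages);
	return 0;
}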
@@ -4601,6 +4609,7 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
        struct resv_map *resv = vma_resv_map(vma);
 
        /*
+        * HPAGE_RESV_OWNER indicates a private mapping.
         * This new VMA should share its siblings reservation map if present.
         * The VMA will only ever have a valid reservation map pointer where
         * it is being copied for another still existing VMA.  As that VMA
@@ -4615,11 +4624,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
 
        /*
         * vma_lock structure for sharable mappings is vma specific.
-        * Clear old pointer (if copied via vm_area_dup) and create new.
+        * Clear old pointer (if copied via vm_area_dup) and allocate
+        * new structure.  Before clearing, make sure vma_lock is not
+        * for this vma.
         */
        if (vma->vm_flags & VM_MAYSHARE) {
-               vma->vm_private_data = NULL;
-               hugetlb_vma_lock_alloc(vma);
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               if (vma_lock) {
+                       if (vma_lock->vma != vma) {
+                               vma->vm_private_data = NULL;
+                               hugetlb_vma_lock_alloc(vma);
+                       } else
+                               pr_warn("HugeTLB: vma_lock already exists in %s.\n", __func__);
+               } else
+                       hugetlb_vma_lock_alloc(vma);
        }
 }
 
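Aside, not part of the patch: the hugetlb_vm_op_open() hunk above is the allocation-side counterpart of the earlier ownership check. A userspace sketch under the same illustrative names (per_owner_lock, op_open); only the ownership test and the warn-on-unexpected-state idea carry over from the kernel code:

#include <stdio.h>
#include <stdlib.h>

struct owner;

struct per_owner_lock {
	struct owner *owner;	/* back-pointer identifying the owner */
};

struct owner {
	struct per_owner_lock *lock;	/* plays the role of vm_private_data */
};

static void lock_alloc(struct owner *o)
{
	o->lock = calloc(1, sizeof(*o->lock));
	if (o->lock)
		o->lock->owner = o;
}

/* Same shape as the hugetlb_vm_op_open() hunk above. */
static void op_open(struct owner *o)
{
	struct per_owner_lock *lock = o->lock;

	if (lock) {
		if (lock->owner != o) {
			o->lock = NULL;		/* stale copy: replace it */
			lock_alloc(o);
		} else {
			/* Owning a lock already at open time is unexpected. */
			fprintf(stderr, "lock already exists in %s\n", __func__);
		}
	} else {
		lock_alloc(o);
	}
}

int main(void)
{
	struct owner parent = { 0 }, child;

	lock_alloc(&parent);
	child = parent;		/* copied pointer, like vm_area_dup() */
	op_open(&child);	/* child gets its own lock */

	printf("distinct locks: %d\n", parent.lock != child.lock);
	free(parent.lock);
	free(child.lock);
	return 0;
}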
@@ -5096,6 +5115,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                 * unmapped and its refcount is dropped, so just clear pte here.
                 */
                if (unlikely(!pte_present(pte))) {
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
                        /*
                         * If the pte was wr-protected by uffd-wp in any of the
                         * swap forms, meanwhile the caller does not want to
@@ -5107,6 +5127,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                                set_huge_pte_at(mm, address, ptep,
                                                make_pte_marker(PTE_MARKER_UFFD_WP));
                        else
+#endif
                                huge_pte_clear(mm, address, ptep, sz);
                        spin_unlock(ptl);
                        continue;
@@ -5135,11 +5156,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct
                tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
+#ifdef CONFIG_PTE_MARKER_UFFD_WP
                /* Leave a uffd-wp pte marker if needed */
                if (huge_pte_uffd_wp(pte) &&
                    !(zap_flags & ZAP_FLAG_DROP_MARKER))
                        set_huge_pte_at(mm, address, ptep,
                                        make_pte_marker(PTE_MARKER_UFFD_WP));
+#endif
                hugetlb_count_sub(pages_per_huge_page(h), mm);
                page_remove_rmap(page, vma, true);
 
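Aside, not part of the patch: both CONFIG_PTE_MARKER_UFFD_WP hunks above rely on the same preprocessor trick, keeping the guarded if together with its else inside the #ifdef so that only the unconditional statement after #endif survives when the option is off. A small illustrative sketch (CONFIG_FEATURE_MARKER and the helpers are made up):

#include <stdbool.h>
#include <stdio.h>

#define CONFIG_FEATURE_MARKER	/* comment out to compile the marker path away */

static void install_marker(void)	{ puts("marker installed"); }
static void clear_entry(void)		{ puts("entry cleared"); }

static void drop_entry(bool marker_wanted)
{
#ifdef CONFIG_FEATURE_MARKER
	/* Feature on: either leave a marker behind or clear the entry. */
	if (marker_wanted)
		install_marker();
	else
#endif
		/* The only statement left when the feature is compiled out. */
		clear_entry();
}

int main(void)
{
	drop_entry(true);
	drop_entry(false);
	return 0;
}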
@@ -5531,6 +5554,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        return handle_userfault(&vmf, reason);
 }
 
+/*
+ * Recheck pte with pgtable lock.  Returns true if pte didn't change, or
+ * false if pte changed or is changing.
+ */
+static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm,
+                              pte_t *ptep, pte_t old_pte)
+{
+       spinlock_t *ptl;
+       bool same;
+
+       ptl = huge_pte_lock(h, mm, ptep);
+       same = pte_same(huge_ptep_get(ptep), old_pte);
+       spin_unlock(ptl);
+
+       return same;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
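Aside, not part of the patch: hugetlb_pte_stable() above is a "revalidate under the lock" helper, re-reading a value that was first sampled without the lock and reporting whether it still matches. A self-contained pthread sketch of the same shape; guarded_slot and slot_stable are illustrative names:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct guarded_slot {
	pthread_mutex_t lock;
	long value;		/* stands in for the pte */
};

/* Returns true if the slot still holds old_value, false if it changed. */
static bool slot_stable(struct guarded_slot *slot, long old_value)
{
	bool same;

	pthread_mutex_lock(&slot->lock);
	same = (slot->value == old_value);
	pthread_mutex_unlock(&slot->lock);

	return same;
}

int main(void)
{
	struct guarded_slot slot = { PTHREAD_MUTEX_INITIALIZER, 0 };
	long snapshot = slot.value;	/* lockless read */

	/* ... another thread may update slot.value here ... */

	if (slot_stable(&slot, snapshot))
		puts("snapshot still valid, safe to act on it");
	else
		puts("value changed, caller should retry or bail out");
	return 0;
}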
@@ -5571,10 +5611,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                if (idx >= size)
                        goto out;
                /* Check for page in userfault range */
-               if (userfaultfd_missing(vma))
-                       return hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr, address,
-                                                      VM_UFFD_MISSING);
+               if (userfaultfd_missing(vma)) {
+                       /*
+                        * Since hugetlb_no_page() examined the pte without
+                        * holding the pgtable lock, we need to re-test under
+                        * the lock because the pte may not be stable and
+                        * could have changed from under us.  Try to detect
+                        * either changed or still-changing ptes and retry
+                        * properly when needed.
+                        *
+                        * Note that userfaultfd is actually fine with
+                        * false positives (e.g. caused by a changed pte),
+                        * but not with wrong logical events (e.g. caused by
+                        * reading a pte while it is being changed).  The
+                        * latter can confuse userspace, so the strictness
+                        * is very much preferred.  E.g., a MISSING event
+                        * should never happen on a page after UFFDIO_COPY
+                        * has correctly installed the page and returned.
+                        */
+                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                               ret = 0;
+                               goto out;
+                       }
+
+                       return hugetlb_handle_userfault(vma, mapping, idx, flags,
+                                                       haddr, address,
+                                                       VM_UFFD_MISSING);
+               }
 
                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
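Aside, not part of the patch: the userfaultfd_missing() branch above only delivers a MISSING event once the pte snapshot has been confirmed under the lock; if the snapshot turned out stale the fault returns 0 so it can be retried. A userspace sketch of that decision, with made-up names (state_stable, fault_path, handle_missing_event):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static long current_state;	/* stands in for the pte */

static bool state_stable(long old_state)
{
	bool same;

	pthread_mutex_lock(&lock);
	same = (current_state == old_state);
	pthread_mutex_unlock(&lock);

	return same;
}

static int handle_missing_event(void)
{
	puts("MISSING event delivered to userspace handler");
	return 0;
}

/*
 * A stale snapshot must not produce an event, because the entry may have
 * been installed by a concurrent operation in the meantime.
 */
static int fault_path(long snapshot)
{
	if (!state_stable(snapshot))
		return 0;	/* changed under us: succeed and let the caller retry */

	return handle_missing_event();
}

int main(void)
{
	long snapshot = current_state;	/* lockless read */

	current_state = 42;		/* simulate a concurrent install */
	printf("ret=%d (no event expected)\n", fault_path(snapshot));
	return 0;
}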
@@ -5590,11 +5653,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         * here.  Before returning error, get ptl and make
                         * sure there really is no pte entry.
                         */
-                       ptl = huge_pte_lock(h, mm, ptep);
-                       ret = 0;
-                       if (huge_pte_none(huge_ptep_get(ptep)))
+                       if (hugetlb_pte_stable(h, mm, ptep, old_pte))
                                ret = vmf_error(PTR_ERR(page));
-                       spin_unlock(ptl);
+                       else
+                               ret = 0;
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
@@ -5640,9 +5702,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                if (userfaultfd_minor(vma)) {
                        unlock_page(page);
                        put_page(page);
-                       return hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr, address,
-                                                      VM_UFFD_MINOR);
+                       /* See comment in userfaultfd_missing() block above */
+                       if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) {
+                               ret = 0;
+                               goto out;
+                       }
+                       return hugetlb_handle_userfault(vma, mapping, idx, flags,
+                                                       haddr, address,
+                                                       VM_UFFD_MINOR);
                }
        }
 
@@ -6294,8 +6361,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         * tables. If the huge page is present, then the tail
                         * pages must also be present. The ptl prevents the
                         * head page and tail pages from being rearranged in
-                        * any way. So this page must be available at this
-                        * point, unless the page refcount overflowed:
+                        * any way. As this is hugetlb, the pages will never
+                        * be p2pdma or non-longterm-pinnable. So this page
+                        * must be available at this point, unless the page
+                        * refcount overflowed:
                         */
                        if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
                                                         flags))) {
@@ -6804,7 +6873,7 @@ void hugetlb_vma_lock_release(struct kref *kref)
        kfree(vma_lock);
 }
 
-void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
 {
        struct vm_area_struct *vma = vma_lock->vma;
 
@@ -7176,14 +7245,15 @@ retry:
                page = pte_page(pte) +
                        ((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
                /*
-                * try_grab_page() should always succeed here, because: a) we
-                * hold the pmd (ptl) lock, and b) we've just checked that the
-                * huge pmd (head) page is present in the page tables. The ptl
-                * prevents the head page and tail pages from being rearranged
-                * in any way. So this page must be available at this point,
-                * unless the page refcount overflowed:
+                * try_grab_page() should always be able to get the page here,
+                * because: a) we hold the pmd (ptl) lock, and b) we've just
+                * checked that the huge pmd (head) page is present in the
+                * page tables. The ptl prevents the head page and tail pages
+                * from being rearranged in any way. So this page must be
+                * available at this point, unless the page refcount
+                * overflowed:
                 */
-               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+               if (try_grab_page(page, flags)) {
                        page = NULL;
                        goto out;
                }
@@ -7221,7 +7291,7 @@ retry:
        pte = huge_ptep_get((pte_t *)pud);
        if (pte_present(pte)) {
                page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
-               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+               if (try_grab_page(page, flags)) {
                        page = NULL;
                        goto out;
                }
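Aside, not part of the patch: the last two hunks reflect try_grab_page() switching from a boolean "did it work" result to a 0-or-negative-errno result, so call sites now test "if (try_grab_page(...))" instead of "if (!try_grab_page(...))" and no longer warn. A small sketch of that calling-convention change; grab_ref and its error choices are illustrative, not the kernel function:

#include <errno.h>
#include <stdio.h>

/*
 * New-style helper: 0 on success, a negative errno value on failure, which
 * lets it report more than one distinct failure reason.
 */
static int grab_ref(int refcount_ok, int pinnable)
{
	if (!refcount_ok)
		return -ENOMEM;
	if (!pinnable)
		return -EREMOTEIO;
	return 0;
}

int main(void)
{
	int ret = grab_ref(1, 0);

	if (ret) {	/* non-zero means failure, mirror of the old !ok test */
		printf("grab failed: %d\n", ret);
		return 1;
	}
	puts("grabbed");
	return 0;
}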