hugetlb: create remove_hugetlb_page() to separate functionality
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58e..9a263a1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -553,7 +553,6 @@ retry:
        resv->adds_in_progress -= in_regions_needed;
 
        spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
        return add;
 }
 
@@ -743,13 +742,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
+       bool reserved = false;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (rsv_adjust) {
+       if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);
 
-               hugetlb_acct_memory(h, 1);
+               if (!hugetlb_acct_memory(h, 1))
+                       reserved = true;
+       } else if (!rsv_adjust) {
+               reserved = true;
        }
+
+       if (!reserved)
+               pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
 }
 
 /*
@@ -1273,7 +1279,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
 {
-       unsigned long nr_pages = 1UL << huge_page_order(h);
+       unsigned long nr_pages = pages_per_huge_page(h);
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 
@@ -1327,6 +1333,41 @@ static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
 #endif
 
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page.  A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+
+       list_del(&page->lru);
+
+       if (HPageFreed(page)) {
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+       }
+       if (adjust_surplus) {
+               h->surplus_huge_pages--;
+               h->surplus_huge_pages_node[nid]--;
+       }
+
+       set_page_refcounted(page);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+       h->nr_huge_pages--;
+       h->nr_huge_pages_node[nid]--;
+}
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
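
For orientation, this is the calling pattern the new helper is built for and
that the later hunks switch over to (an illustrative sketch only, not part of
the diff; 'h' and 'page' stand for any hstate/huge page pair):

	/*
	 * With hugetlb_lock held: take the page off the hstate lists and
	 * adjust the counters, then hand the now-ordinary compound page
	 * back to the allocator.
	 */
	spin_lock(&hugetlb_lock);
	remove_hugetlb_page(h, page, false);	/* false: not a surplus page */
	update_and_free_page(h, page);
	spin_unlock(&hugetlb_lock);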
@@ -1335,8 +1376,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       h->nr_huge_pages--;
-       h->nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,19 +1383,9 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
-       set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
-               /*
-                * Temporarily drop the hugetlb_lock, because
-                * we might block in free_gigantic_page().
-                */
-               spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
-               spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
@@ -1421,15 +1450,12 @@ static void __free_huge_page(struct page *page)
                h->resv_huge_pages++;
 
        if (HPageTemporary(page)) {
-               list_del(&page->lru);
-               ClearHPageTemporary(page);
+               remove_hugetlb_page(h, page, false);
                update_and_free_page(h, page);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
-               list_del(&page->lru);
+               remove_hugetlb_page(h, page, true);
                update_and_free_page(h, page);
-               h->surplus_huge_pages--;
-               h->surplus_huge_pages_node[nid]--;
        } else {
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
@@ -1616,7 +1642,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
                gfp_mask |= __GFP_RETRY_MAYFAIL;
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
-       page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+       page = __alloc_pages(gfp_mask, order, nid, nmask);
        if (page)
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        else
@@ -1714,13 +1740,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                        struct page *page =
                                list_entry(h->hugepage_freelists[node].next,
                                          struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[node]--;
-                       if (acct_surplus) {
-                               h->surplus_huge_pages--;
-                               h->surplus_huge_pages_node[node]--;
-                       }
+                       remove_hugetlb_page(h, page, acct_surplus);
                        update_and_free_page(h, page);
                        ret = 1;
                        break;
@@ -1758,7 +1778,6 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               int nid = page_to_nid(head);
                if (h->free_huge_pages - h->resv_huge_pages == 0)
                        goto out;
 
@@ -1789,9 +1808,7 @@ retry:
                        SetPageHWPoison(page);
                        ClearPageHWPoison(head);
                }
-               list_del(&head->lru);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
+               remove_hugetlb_page(h, head, false);
                h->max_huge_pages--;
                update_and_free_page(h, head);
                rc = 0;
@@ -2175,27 +2192,26 @@ static long __vma_reservation_common(struct hstate *h,
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
-       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-               /*
-                * In most cases, reserves always exist for private mappings.
-                * However, a file associated with mapping could have been
-                * hole punched or truncated after reserves were consumed.
-                * As subsequent fault on such a range will not use reserves.
-                * Subtle - The reserve map for private mappings has the
-                * opposite meaning than that of shared mappings.  If NO
-                * entry is in the reserve map, it means a reservation exists.
-                * If an entry exists in the reserve map, it means the
-                * reservation has already been consumed.  As a result, the
-                * return value of this routine is the opposite of the
-                * value returned from reserve map manipulation routines above.
-                */
-               if (ret)
-                       return 0;
-               else
-                       return 1;
-       }
-       else
-               return ret < 0 ? ret : 0;
+       /*
+        * We know private mapping must have HPAGE_RESV_OWNER set.
+        *
+        * In most cases, reserves always exist for private mappings.
+        * However, a file associated with mapping could have been
+        * hole punched or truncated after reserves were consumed.
+        * As subsequent fault on such a range will not use reserves.
+        * Subtle - The reserve map for private mappings has the
+        * opposite meaning than that of shared mappings.  If NO
+        * entry is in the reserve map, it means a reservation exists.
+        * If an entry exists in the reserve map, it means the
+        * reservation has already been consumed.  As a result, the
+        * return value of this routine is the opposite of the
+        * value returned from reserve map manipulation routines above.
+        */
+       if (ret > 0)
+               return 0;
+       if (ret == 0)
+               return 1;
+       return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
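
To make the inverted return convention concrete, here is the mapping the new
if-chain implements for a private (HPAGE_RESV_OWNER) mapping, with 'ret' being
whatever the region_*() helper above returned (illustrative summary only):

	/*
	 *   ret > 0  - entry found in the reserve map: the reservation was
	 *              already consumed, so report 0.
	 *   ret == 0 - no entry in the reserve map: a reservation still
	 *              exists, so report 1.
	 *   ret < 0  - error, passed straight through.
	 * Shared (VM_MAYSHARE) mappings return 'ret' unchanged, as before.
	 */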
@@ -2316,7 +2332,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        /* If this allocation is not consuming a reservation, charge it now.
         */
-       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       deferred_reserve = map_chg || avoid_reserve;
        if (deferred_reserve) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
@@ -2559,10 +2575,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
                                return;
                        if (PageHighMem(page))
                                continue;
-                       list_del(&page->lru);
+                       remove_hugetlb_page(h, page, false);
                        update_and_free_page(h, page);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[page_to_nid(page)]--;
                }
        }
 }
@@ -2622,6 +2636,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        else
                return -ENOMEM;
 
+       /*
+        * resize_lock mutex prevents concurrent adjustments to number of
+        * pages in hstate via the proc/sysfs interfaces.
+        */
+       mutex_lock(&h->resize_lock);
        spin_lock(&hugetlb_lock);
 
        /*
@@ -2654,6 +2673,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
                        spin_unlock(&hugetlb_lock);
+                       mutex_unlock(&h->resize_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
@@ -2728,6 +2748,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 out:
        h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
 
        NODEMASK_FREE(node_alloc_noretry);
 
@@ -3215,6 +3236,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);
        h = &hstates[hugetlb_max_hstate++];
+       mutex_init(&h->resize_lock);
        h->order = order;
        h->mask = ~(huge_page_size(h) - 1);
        for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3289,10 @@ static int __init hugepages_setup(char *s)
 
        /*
         * Global state is always initialized later in hugetlb_init.
-        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * But we need to allocate gigantic hstates here early to still
         * use the bootmem allocator.
         */
-       if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
                hugetlb_hstate_alloc_pages(parsed_hstate);
 
        last_mhp = mhp;
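
For reference, this early path is what handles boot-time requests for gigantic
pages, e.g. the documented kernel command line below (values are illustrative);
pages requested this way are carved out of the bootmem allocator before the
buddy allocator is up:

	hugepagesz=1G hugepages=16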
@@ -3795,7 +3817,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, addr, sz);
+               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4395,13 +4417,10 @@ retry:
                         * sure there really is no pte entry.
                         */
                        ptl = huge_pte_lock(h, mm, ptep);
-                       if (!huge_pte_none(huge_ptep_get(ptep))) {
-                               ret = 0;
-                               spin_unlock(ptl);
-                               goto out;
-                       }
+                       ret = 0;
+                       if (huge_pte_none(huge_ptep_get(ptep)))
+                               ret = vmf_error(PTR_ERR(page));
                        spin_unlock(ptl);
-                       ret = vmf_error(PTR_ERR(page));
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4563,7 +4582,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        mapping = vma->vm_file->f_mapping;
        i_mmap_lock_read(mapping);
-       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
                i_mmap_unlock_read(mapping);
                return VM_FAULT_OOM;
@@ -4996,14 +5015,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        return i ? i : err;
 }
 
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
-#endif
-
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
 {
@@ -5280,6 +5291,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        /*
         * If the subpool has a minimum size, the number of global
         * reservations to be released may be adjusted.
+        *
+        * Note that !resv_map implies freed == 0. So (chg - freed)
+        * won't go negative.
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5340,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
        return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
+       return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5370,9 +5393,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
  * only required for subsequent processing.
  */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
-       struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
@@ -5382,9 +5405,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
-
        i_mmap_assert_locked(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
@@ -5448,9 +5468,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
 }
-#define want_pmd_share()       (1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
        return NULL;
 }
@@ -5465,11 +5486,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()       (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+       return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
@@ -5487,8 +5512,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
-                       if (want_pmd_share() && pud_none(*pud))
-                               pte = huge_pmd_share(mm, addr, pud);
+                       if (want_pmd_share(vma, addr) && pud_none(*pud))
+                               pte = huge_pmd_share(mm, vma, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
@@ -5679,6 +5704,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                SetHPageTemporary(oldpage);
                ClearHPageTemporary(newpage);
 
+               /*
+                * There is no need to transfer the per-node surplus state
+                * when we do not cross the node.
+                */
+               if (new_nid == old_nid)
+                       return;
                spin_lock(&hugetlb_lock);
                if (h->surplus_huge_pages_node[old_nid]) {
                        h->surplus_huge_pages_node[old_nid]--;
@@ -5688,6 +5719,57 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
        }
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_notifier_range range;
+       unsigned long address, start, end;
+       spinlock_t *ptl;
+       pte_t *ptep;
+
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       start = ALIGN(vma->vm_start, PUD_SIZE);
+       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return;
+
+       /*
+        * No need to call adjust_range_if_pmd_sharing_possible(), because
+        * we have already done the PUD_SIZE alignment.
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       for (address = start; address < end; address += PUD_SIZE) {
+               unsigned long tmp = address;
+
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+                       continue;
+               ptl = huge_pte_lock(h, mm, ptep);
+               /* We don't want 'address' to be changed */
+               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               spin_unlock(ptl);
+       }
+       flush_hugetlb_tlb_range(vma, start, end);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/vm/mmu_notifier.rst.
+        */
+       mmu_notifier_invalidate_range_end(&range);
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;
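
To close the loop on hugetlb_unshare_all_pmds() above, a hedged usage sketch: a
hypothetical caller that needs the range's page tables to be private to this mm
(in this series, presumably the companion userfaultfd write-protect change)
would do roughly the following; the surrounding condition is illustrative only:

	/* Shared pmds would defeat per-mm tracking of this range. */
	if (is_vm_hugetlb_page(vma))
		hugetlb_unshare_all_pmds(vma);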