hugetlb: create remove_hugetlb_page() to separate functionality
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index a86a58e..9a263a1 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -553,7 +553,6 @@ retry:
        resv->adds_in_progress -= in_regions_needed;
 
        spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
        return add;
 }
 
@@ -743,13 +742,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
+       bool reserved = false;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (rsv_adjust) {
+       if (rsv_adjust > 0) {
                struct hstate *h = hstate_inode(inode);
 
-               hugetlb_acct_memory(h, 1);
+               if (!hugetlb_acct_memory(h, 1))
+                       reserved = true;
+       } else if (!rsv_adjust) {
+               reserved = true;
        }
+
+       if (!reserved)
+               pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
 }
 
 /*
@@ -1273,7 +1279,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                int nid, nodemask_t *nodemask)
 {
-       unsigned long nr_pages = 1UL << huge_page_order(h);
+       unsigned long nr_pages = pages_per_huge_page(h);
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
 
@@ -1327,6 +1333,41 @@ static inline void destroy_compound_gigantic_page(struct page *page,
                                                unsigned int order) { }
 #endif
 
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page.  A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+
+       list_del(&page->lru);
+
+       if (HPageFreed(page)) {
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+       }
+       if (adjust_surplus) {
+               h->surplus_huge_pages--;
+               h->surplus_huge_pages_node[nid]--;
+       }
+
+       set_page_refcounted(page);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+       h->nr_huge_pages--;
+       h->nr_huge_pages_node[nid]--;
+}
+
 static void update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
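
For orientation, this is the calling pattern the new helper is built for and
that the later hunks switch over to (an illustrative sketch only, not part of
the diff; 'h' and 'page' stand for any hstate/huge page pair):

	/*
	 * With hugetlb_lock held: take the page off the hstate lists and
	 * adjust the counters, then hand the now-ordinary compound page
	 * back to the allocator.
	 */
	spin_lock(&hugetlb_lock);
	remove_hugetlb_page(h, page, false);	/* false: not a surplus page */
	update_and_free_page(h, page);
	spin_unlock(&hugetlb_lock);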
@@ -1335,8 +1376,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
-       h->nr_huge_pages--;
-       h->nr_huge_pages_node[page_to_nid(page)]--;
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,19 +1383,9 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_active | 1 << PG_private |
                                1 << PG_writeback);
        }
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
-       set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
-               /*
-                * Temporarily drop the hugetlb_lock, because
-                * we might block in free_gigantic_page().
-                */
-               spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
-               spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
@@ -1421,15 +1450,12 @@ static void __free_huge_page(struct page *page)
                h->resv_huge_pages++;
 
        if (HPageTemporary(page)) {
-               list_del(&page->lru);
-               ClearHPageTemporary(page);
+               remove_hugetlb_page(h, page, false);
                update_and_free_page(h, page);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
-               list_del(&page->lru);
+               remove_hugetlb_page(h, page, true);
                update_and_free_page(h, page);
-               h->surplus_huge_pages--;
-               h->surplus_huge_pages_node[nid]--;
        } else {
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
@@ -1616,7 +1642,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
                gfp_mask |= __GFP_RETRY_MAYFAIL;
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
-       page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
+       page = __alloc_pages(gfp_mask, order, nid, nmask);
        if (page)
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        else
@@ -1714,13 +1740,7 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                        struct page *page =
                                list_entry(h->hugepage_freelists[node].next,
                                          struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[node]--;
-                       if (acct_surplus) {
-                               h->surplus_huge_pages--;
-                               h->surplus_huge_pages_node[node]--;
-                       }
+                       remove_hugetlb_page(h, page, acct_surplus);
                        update_and_free_page(h, page);
                        ret = 1;
                        break;
@@ -1758,7 +1778,6 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               int nid = page_to_nid(head);
                if (h->free_huge_pages - h->resv_huge_pages == 0)
                        goto out;
 
@@ -1789,9 +1808,7 @@ retry:
                        SetPageHWPoison(page);
                        ClearPageHWPoison(head);
                }
-               list_del(&head->lru);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
+               remove_hugetlb_page(h, head, false);
                h->max_huge_pages--;
                update_and_free_page(h, head);
                rc = 0;
@@ -2175,27 +2192,26 @@ static long __vma_reservation_common(struct hstate *h,
 
        if (vma->vm_flags & VM_MAYSHARE)
                return ret;
-       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-               /*
-                * In most cases, reserves always exist for private mappings.
-                * However, a file associated with mapping could have been
-                * hole punched or truncated after reserves were consumed.
-                * As subsequent fault on such a range will not use reserves.
-                * Subtle - The reserve map for private mappings has the
-                * opposite meaning than that of shared mappings.  If NO
-                * entry is in the reserve map, it means a reservation exists.
-                * If an entry exists in the reserve map, it means the
-                * reservation has already been consumed.  As a result, the
-                * return value of this routine is the opposite of the
-                * value returned from reserve map manipulation routines above.
-                */
-               if (ret)
-                       return 0;
-               else
-                       return 1;
-       }
-       else
-               return ret < 0 ? ret : 0;
+       /*
+        * We know private mapping must have HPAGE_RESV_OWNER set.
+        *
+        * In most cases, reserves always exist for private mappings.
+        * However, a file associated with mapping could have been
+        * hole punched or truncated after reserves were consumed.
+        * As subsequent fault on such a range will not use reserves.
+        * Subtle - The reserve map for private mappings has the
+        * opposite meaning than that of shared mappings.  If NO
+        * entry is in the reserve map, it means a reservation exists.
+        * If an entry exists in the reserve map, it means the
+        * reservation has already been consumed.  As a result, the
+        * return value of this routine is the opposite of the
+        * value returned from reserve map manipulation routines above.
+        */
+       if (ret > 0)
+               return 0;
+       if (ret == 0)
+               return 1;
+       return ret;
 }
 
 static long vma_needs_reservation(struct hstate *h,
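
To make the inverted return convention concrete, here is the mapping the new
if-chain implements for a private (HPAGE_RESV_OWNER) mapping, with 'ret' being
whatever the region_*() helper above returned (illustrative summary only):

	/*
	 *   ret > 0  - entry found in the reserve map: the reservation was
	 *              already consumed, so report 0.
	 *   ret == 0 - no entry in the reserve map: a reservation still
	 *              exists, so report 1.
	 *   ret < 0  - error, passed straight through.
	 * Shared (VM_MAYSHARE) mappings return 'ret' unchanged, as before.
	 */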
@@ -2316,7 +2332,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
        /* If this allocation is not consuming a reservation, charge it now.
         */
-       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       deferred_reserve = map_chg || avoid_reserve;
        if (deferred_reserve) {
                ret = hugetlb_cgroup_charge_cgroup_rsvd(
                        idx, pages_per_huge_page(h), &h_cg);
@@ -2559,10 +2575,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
                                return;
                        if (PageHighMem(page))
                                continue;
-                       list_del(&page->lru);
+                       remove_hugetlb_page(h, page, false);
                        update_and_free_page(h, page);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[page_to_nid(page)]--;
                }
        }
 }
@@ -2622,6 +2636,11 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        else
                return -ENOMEM;
 
+       /*
+        * resize_lock mutex prevents concurrent adjustments to number of
+        * pages in hstate via the proc/sysfs interfaces.
+        */
+       mutex_lock(&h->resize_lock);
        spin_lock(&hugetlb_lock);
 
        /*
@@ -2654,6 +2673,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                if (count > persistent_huge_pages(h)) {
                        spin_unlock(&hugetlb_lock);
+                       mutex_unlock(&h->resize_lock);
                        NODEMASK_FREE(node_alloc_noretry);
                        return -EINVAL;
                }
@@ -2728,6 +2748,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
 out:
        h->max_huge_pages = persistent_huge_pages(h);
        spin_unlock(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
 
        NODEMASK_FREE(node_alloc_noretry);
 
@@ -3215,6 +3236,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
        BUG_ON(order == 0);
        h = &hstates[hugetlb_max_hstate++];
+       mutex_init(&h->resize_lock);
        h->order = order;
        h->mask = ~(huge_page_size(h) - 1);
        for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3289,10 @@ static int __init hugepages_setup(char *s)
 
        /*
         * Global state is always initialized later in hugetlb_init.
-        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * But we need to allocate gigantic hstates here early to still
         * use the bootmem allocator.
         */
-       if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
                hugetlb_hstate_alloc_pages(parsed_hstate);
 
        last_mhp = mhp;
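
For reference, this early path is what handles boot-time requests for gigantic
pages, e.g. the documented kernel command line below (values are illustrative);
pages requested this way are carved out of the bootmem allocator before the
buddy allocator is up:

	hugepagesz=1G hugepages=16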
@@ -3795,7 +3817,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_pte = huge_pte_offset(src, addr, sz);
                if (!src_pte)
                        continue;
-               dst_pte = huge_pte_alloc(dst, addr, sz);
+               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
                if (!dst_pte) {
                        ret = -ENOMEM;
                        break;
@@ -4395,13 +4417,10 @@ retry:
                         * sure there really is no pte entry.
                         */
                        ptl = huge_pte_lock(h, mm, ptep);
-                       if (!huge_pte_none(huge_ptep_get(ptep))) {
-                               ret = 0;
-                               spin_unlock(ptl);
-                               goto out;
-                       }
+                       ret = 0;
+                       if (huge_pte_none(huge_ptep_get(ptep)))
+                               ret = vmf_error(PTR_ERR(page));
                        spin_unlock(ptl);
-                       ret = vmf_error(PTR_ERR(page));
                        goto out;
                }
                clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4563,7 +4582,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         */
        mapping = vma->vm_file->f_mapping;
        i_mmap_lock_read(mapping);
-       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
                i_mmap_unlock_read(mapping);
                return VM_FAULT_OOM;
@@ -4996,14 +5015,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
        return i ? i : err;
 }
 
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
-#endif
-
 unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                unsigned long address, unsigned long end, pgprot_t newprot)
 {
@@ -5280,6 +5291,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
        /*
         * If the subpool has a minimum size, the number of global
         * reservations to be released may be adjusted.
+        *
+        * Note that !resv_map implies freed == 0. So (chg - freed)
+        * won't go negative.
         */
        gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
        hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5340,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
        return false;
 }
 
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
+       return vma_shareable(vma, addr);
+}
+
 /*
  * Determine if start,end range within vma could be mapped by shared pmd.
  * If yes, adjust start and end to cover range associated with possible
@@ -5370,9 +5393,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
  * only required for subsequent processing.
  */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
-       struct vm_area_struct *vma = find_vma(mm, addr);
        struct address_space *mapping = vma->vm_file->f_mapping;
        pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                        vma->vm_pgoff;
@@ -5382,9 +5405,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        pte_t *pte;
        spinlock_t *ptl;
 
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
-
        i_mmap_assert_locked(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
@@ -5448,9 +5468,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
        return 1;
 }
-#define want_pmd_share()       (1)
+
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
 {
        return NULL;
 }
@@ -5465,11 +5486,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
 }
-#define want_pmd_share()       (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+       return false;
+}
 #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
 
 #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long addr, unsigned long sz)
 {
        pgd_t *pgd;
@@ -5487,8 +5512,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                        pte = (pte_t *)pud;
                } else {
                        BUG_ON(sz != PMD_SIZE);
-                       if (want_pmd_share() && pud_none(*pud))
-                               pte = huge_pmd_share(mm, addr, pud);
+                       if (want_pmd_share(vma, addr) && pud_none(*pud))
+                               pte = huge_pmd_share(mm, vma, addr, pud);
                        else
                                pte = (pte_t *)pmd_alloc(mm, pud, addr);
                }
@@ -5679,6 +5704,12 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                SetHPageTemporary(oldpage);
                ClearHPageTemporary(newpage);
 
+               /*
+                * There is no need to transfer the per-node surplus state
+                * when we do not cross the node.
+                */
+               if (new_nid == old_nid)
+                       return;
                spin_lock(&hugetlb_lock);
                if (h->surplus_huge_pages_node[old_nid]) {
                        h->surplus_huge_pages_node[old_nid]--;
@@ -5688,6 +5719,57 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
        }
 }
 
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_notifier_range range;
+       unsigned long address, start, end;
+       spinlock_t *ptl;
+       pte_t *ptep;
+
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       start = ALIGN(vma->vm_start, PUD_SIZE);
+       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return;
+
+       /*
+        * No need to call adjust_range_if_pmd_sharing_possible(), because
+        * we have already done the PUD_SIZE alignment.
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       for (address = start; address < end; address += PUD_SIZE) {
+               unsigned long tmp = address;
+
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+                       continue;
+               ptl = huge_pte_lock(h, mm, ptep);
+               /* We don't want 'address' to be changed */
+               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               spin_unlock(ptl);
+       }
+       flush_hugetlb_tlb_range(vma, start, end);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/vm/mmu_notifier.rst.
+        */
+       mmu_notifier_invalidate_range_end(&range);
+}
+
 #ifdef CONFIG_CMA
 static bool cma_reserve_called __initdata;
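
To close the loop on hugetlb_unshare_all_pmds() above, a hedged usage sketch: a
hypothetical caller that needs the range's page tables to be private to this mm
(in this series, presumably the companion userfaultfd write-protect change)
would do roughly the following; the surrounding condition is illustrative only:

	/* Shared pmds would defeat per-mm tracking of this range. */
	if (is_vm_hugetlb_page(vma))
		hugetlb_unshare_all_pmds(vma);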