Merge tag 'nfs-for-5.13-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
[linux-2.6-microblaze.git] / mm / hugetlb.c
index 47f56e2..3db405d 100644 (file)
@@ -39,7 +39,6 @@
 #include <linux/hugetlb.h>
 #include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 
@@ -467,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
                              resv->region_cache_count;
 
                /* At this point, we should have enough entries in the cache
-                * for all the existings adds_in_progress. We should only be
+                * for all the existing adds_in_progress. We should only be
                 * needing to allocate for regions_needed.
                 */
                VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -1069,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
 static void enqueue_huge_page(struct hstate *h, struct page *page)
 {
        int nid = page_to_nid(page);
+
+       lockdep_assert_held(&hugetlb_lock);
        list_move(&page->lru, &h->hugepage_freelists[nid]);
        h->free_huge_pages++;
        h->free_huge_pages_node[nid]++;
@@ -1078,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
 static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
 {
        struct page *page;
-       bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+       bool pin = !!(current->flags & PF_MEMALLOC_PIN);
 
+       lockdep_assert_held(&hugetlb_lock);
        list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (nocma && is_migrate_cma_page(page))
+               if (pin && !is_pinnable_page(page))
                        continue;
 
                if (PageHWPoison(page))
@@ -1351,6 +1353,7 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
 
+       lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
@@ -1480,17 +1483,30 @@ void free_huge_page(struct page *page)
        }
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+/*
+ * Must be called with the hugetlb lock held
+ */
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
+{
+       lockdep_assert_held(&hugetlb_lock);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
+}
+
+static void __prep_new_huge_page(struct page *page)
 {
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
        set_hugetlb_cgroup(page, NULL);
        set_hugetlb_cgroup_rsvd(page, NULL);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+       __prep_new_huge_page(page);
        spin_lock_irq(&hugetlb_lock);
-       h->nr_huge_pages++;
-       h->nr_huge_pages_node[nid]++;
-       ClearHPageFreed(page);
+       __prep_account_new_huge_page(h, nid);
        spin_unlock_irq(&hugetlb_lock);
 }
 
@@ -1701,6 +1717,7 @@ static struct page *remove_pool_huge_page(struct hstate *h,
        int nr_nodes, node;
        struct page *page = NULL;
 
+       lockdep_assert_held(&hugetlb_lock);
        for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                /*
                 * If we're returning unused surplus pages, only examine
@@ -1950,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        long needed, allocated;
        bool alloc_ok = true;
 
+       lockdep_assert_held(&hugetlb_lock);
        needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
        if (needed <= 0) {
                h->resv_huge_pages += delta;
@@ -2043,6 +2061,7 @@ static void return_unused_surplus_pages(struct hstate *h,
        struct page *page;
        LIST_HEAD(page_list);
 
+       lockdep_assert_held(&hugetlb_lock);
        /* Uncommit the reservation */
        h->resv_huge_pages -= unused_resv_pages;
 
@@ -2247,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h,
        }
 }
 
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+                                       struct list_head *list)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+       int nid = page_to_nid(old_page);
+       struct page *new_page;
+       int ret = 0;
+
+       /*
+        * Before dissolving the page, we need to allocate a new one for the
+        * pool to remain stable. Using alloc_buddy_huge_page() lets us avoid
+        * both prep_new_huge_page() and any counter updates at allocation
+        * time. This simplifies things and lets us do the whole replacement
+        * under the lock.
+        */
+       new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+       if (!new_page)
+               return -ENOMEM;
+
+retry:
+       spin_lock_irq(&hugetlb_lock);
+       if (!PageHuge(old_page)) {
+               /*
+                * Freed from under us. Drop new_page too.
+                */
+               goto free_new;
+       } else if (page_count(old_page)) {
+               /*
+                * Someone has grabbed the page, try to isolate it here.
+                * Fail with -EBUSY if not possible.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               if (!isolate_huge_page(old_page, list))
+                       ret = -EBUSY;
+               spin_lock_irq(&hugetlb_lock);
+               goto free_new;
+       } else if (!HPageFreed(old_page)) {
+               /*
+                * Page's refcount is 0 but it has not been enqueued in the
+                * freelist yet. Race window is small, so we can succeed here if
+                * we retry.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               cond_resched();
+               goto retry;
+       } else {
+               /*
+                * Ok, old_page is still a genuine free hugepage. Remove it from
+                * the freelist and decrease the counters. These will be
+                * incremented again when calling __prep_account_new_huge_page()
+                * and enqueue_huge_page() for new_page. The counters will remain
+                * stable since this happens under the lock.
+                */
+               remove_hugetlb_page(h, old_page, false);
+
+               /*
+                * new_page needs to be initialized with the standard hugetlb
+                * state. This is normally done by prep_new_huge_page() but
+                * that takes hugetlb_lock which is already held so we need to
+                * open code it here.
+                * The reference count trick is needed because the allocator
+                * gives us a referenced page but the pool requires 0 refcount.
+                */
+               __prep_new_huge_page(new_page);
+               __prep_account_new_huge_page(h, nid);
+               page_ref_dec(new_page);
+               enqueue_huge_page(h, new_page);
+
+               /*
+                * Pages have been replaced, we can safely free the old one.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               update_and_free_page(h, old_page);
+       }
+
+       return ret;
+
+free_new:
+       spin_unlock_irq(&hugetlb_lock);
+       __free_pages(new_page, huge_page_order(h));
+
+       return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+       struct hstate *h;
+       struct page *head;
+       int ret = -EBUSY;
+
+       /*
+        * The page might have been dissolved from under our feet, so make sure
+        * to carefully check the state under the lock.
+        * Return success when racing as if we dissolved the page ourselves.
+        */
+       spin_lock_irq(&hugetlb_lock);
+       if (PageHuge(page)) {
+               head = compound_head(page);
+               h = page_hstate(head);
+       } else {
+               spin_unlock_irq(&hugetlb_lock);
+               return 0;
+       }
+       spin_unlock_irq(&hugetlb_lock);
+
+       /*
+        * Fence off gigantic pages as there is a cyclic dependency between
+        * alloc_contig_range and them. Return -ENOMEM as this has the effect
+        * of bailing out right away without further retrying.
+        */
+       if (hstate_is_gigantic(h))
+               return -ENOMEM;
+
+       if (page_count(head) && isolate_huge_page(head, list))
+               ret = 0;
+       else if (!page_count(head))
+               ret = alloc_and_dissolve_huge_page(h, head, list);
+
+       return ret;
+}
+
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
@@ -2530,6 +2677,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
        int i;
        LIST_HEAD(page_list);
 
+       lockdep_assert_held(&hugetlb_lock);
        if (hstate_is_gigantic(h))
                return;
 
@@ -2571,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
 {
        int nr_nodes, node;
 
+       lockdep_assert_held(&hugetlb_lock);
        VM_BUG_ON(delta != -1 && delta != 1);
 
        if (delta < 0) {
@@ -4319,6 +4468,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
        return 0;
 }
 
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+                                                 struct address_space *mapping,
+                                                 pgoff_t idx,
+                                                 unsigned int flags,
+                                                 unsigned long haddr,
+                                                 unsigned long reason)
+{
+       vm_fault_t ret;
+       u32 hash;
+       struct vm_fault vmf = {
+               .vma = vma,
+               .address = haddr,
+               .flags = flags,
+
+               /*
+                * Hard to debug if it ends up being
+                * used by a callee that assumes
+                * something about the other
+                * uninitialized fields... same as in
+                * memory.c
+                */
+       };
+
+       /*
+        * hugetlb_fault_mutex and i_mmap_rwsem must be
+        * dropped before handling userfault.  Reacquire
+        * after handling fault to make calling code simpler.
+        */
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
+       ret = handle_userfault(&vmf, reason);
+       i_mmap_lock_read(mapping);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       return ret;
+}
+
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                        struct vm_area_struct *vma,
                        struct address_space *mapping, pgoff_t idx,
@@ -4357,35 +4544,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
 retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
-               /*
-                * Check for page in userfault range
-                */
+               /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
-                       u32 hash;
-                       struct vm_fault vmf = {
-                               .vma = vma,
-                               .address = haddr,
-                               .flags = flags,
-                               /*
-                                * Hard to debug if it ends up being
-                                * used by a callee that assumes
-                                * something about the other
-                                * uninitialized fields... same as in
-                                * memory.c
-                                */
-                       };
-
-                       /*
-                        * hugetlb_fault_mutex and i_mmap_rwsem must be
-                        * dropped before handling userfault.  Reacquire
-                        * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(mapping, idx);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-                       i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MISSING);
                        goto out;
                }
 
@@ -4441,6 +4604,16 @@ retry:
                                VM_FAULT_SET_HINDEX(hstate_index(h));
                        goto backout_unlocked;
                }
+
+               /* Check for page in userfault range. */
+               if (userfaultfd_minor(vma)) {
+                       unlock_page(page);
+                       put_page(page);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MINOR);
+                       goto out;
+               }
        }
 
        /*
@@ -4681,6 +4854,7 @@ out_mutex:
        return ret;
 }
 
+#ifdef CONFIG_USERFAULTFD
 /*
  * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
  * modifications for huge pages.
@@ -4690,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
+                           enum mcopy_atomic_mode mode,
                            struct page **pagep)
 {
+       bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
        struct address_space *mapping;
        pgoff_t idx;
        unsigned long size;
@@ -4701,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        spinlock_t *ptl;
        int ret;
        struct page *page;
+       int writable;
 
-       if (!*pagep) {
+       mapping = dst_vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+       if (is_continue) {
+               ret = -EFAULT;
+               page = find_lock_page(mapping, idx);
+               if (!page)
+                       goto out;
+       } else if (!*pagep) {
                ret = -ENOMEM;
                page = alloc_huge_page(dst_vma, dst_addr, 0);
                if (IS_ERR(page))
@@ -4731,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         */
        __SetPageUptodate(page);
 
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-       /*
-        * If shared, add to page cache
-        */
-       if (vm_shared) {
+       /* Add shared, newly allocated pages to the page cache. */
+       if (vm_shared && !is_continue) {
                size = i_size_read(mapping->host) >> huge_page_shift(h);
                ret = -EFAULT;
                if (idx >= size)
@@ -4782,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
        }
 
-       _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-       if (dst_vma->vm_flags & VM_WRITE)
+       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+       if (is_continue && !vm_shared)
+               writable = 0;
+       else
+               writable = dst_vma->vm_flags & VM_WRITE;
+
+       _dst_pte = make_huge_pte(dst_vma, page, writable);
+       if (writable)
                _dst_pte = huge_pte_mkdirty(_dst_pte);
        _dst_pte = pte_mkyoung(_dst_pte);
 
@@ -4797,20 +4983,22 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
        spin_unlock(ptl);
-       SetHPageMigratable(page);
-       if (vm_shared)
+       if (!is_continue)
+               SetHPageMigratable(page);
+       if (vm_shared || is_continue)
                unlock_page(page);
        ret = 0;
 out:
        return ret;
 out_release_unlock:
        spin_unlock(ptl);
-       if (vm_shared)
+       if (vm_shared || is_continue)
                unlock_page(page);
 out_release_nounlock:
        put_page(page);
        goto out;
 }
+#endif /* CONFIG_USERFAULTFD */
 
 static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
                                 int refs, struct page **pages,
@@ -5348,8 +5536,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
 
        /*
-        * vma need span at least one aligned PUD size and the start,end range
-        * must at least partialy within it.
+        * vma needs to span at least one aligned PUD size, and the range
+        * must be at least partially within it.
         */
        if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
                (*end <= v_start) || (*start >= v_end))