diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0bdfc7e..0ad53ad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
+#include <linux/memory.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -257,7 +261,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
 static struct file_region *
 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
 {
-       struct file_region *nrg = NULL;
+       struct file_region *nrg;
 
        VM_BUG_ON(resv->region_cache_count <= 0);
 
@@ -339,7 +343,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
 
 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 {
-       struct file_region *nrg = NULL, *prg = NULL;
+       struct file_region *nrg, *prg;
 
        prg = list_prev_entry(rg, link);
        if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -456,14 +460,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
 {
-       struct list_head allocated_regions;
+       LIST_HEAD(allocated_regions);
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;
 
        VM_BUG_ON(regions_needed < 0);
 
-       INIT_LIST_HEAD(&allocated_regions);
-
        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
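
LIST_HEAD(x) declares and initializes the list head in one step, which is why the separate INIT_LIST_HEAD() call is dropped above (the same conversion appears in the gather_surplus_pages() hunk further down). A standalone userspace illustration with simplified copies of the kernel macros (for reference only, not part of this patch):

#include <stdio.h>

/* Simplified userspace copies of the kernel's list macros, for illustration. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
#define LIST_HEAD(name)		struct list_head name = LIST_HEAD_INIT(name)

int main(void)
{
	LIST_HEAD(allocated_regions);	/* declared and initialized in one step */

	/* An empty list points back at itself. */
	printf("empty: %d\n", allocated_regions.next == &allocated_regions);
	return 0;
}
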
@@ -860,7 +862,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
  * is guaranteed to have their future faults succeed.
  *
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
  * the reserve counters are updated with the hugetlb_lock held. It is safe
  * to reset the VMA at fork() time as it is not in use yet and there is no
  * chance of the global counters getting corrupted as a result of the values.
@@ -1007,12 +1009,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
        return (get_vma_private_data(vma) & flag) != 0;
 }
 
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+       /*
+        * Clear vm_private_data
+        * - For MAP_PRIVATE mappings, this is the reserve map which does
+        *   not apply to children.  Faults generated by the children are
+        *   not guaranteed to succeed, even if read-only.
+        * - For shared mappings this is a per-vma semaphore that may be
+        *   allocated in a subsequent call to hugetlb_vm_op_open.
+        */
+       vma->vm_private_data = (void *)0;
        if (!(vma->vm_flags & VM_MAYSHARE))
-               vma->vm_private_data = (void *)0;
+               return;
 }
 
 /*
@@ -1043,7 +1053,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
                kref_put(&reservations->refs, resv_map_release);
        }
 
-       reset_vma_resv_huge_pages(vma);
+       hugetlb_dup_vma_private(vma);
 }
 
 /* Returns true if the VMA has associated reserve pages */
@@ -1182,6 +1192,11 @@ retry_cpuset:
        return NULL;
 }
 
+static unsigned long available_huge_pages(struct hstate *h)
+{
+       return h->free_huge_pages - h->resv_huge_pages;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve,
@@ -1198,12 +1213,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         * have no page reserves. This check ensures that reservations are
         * not "stolen". The child may still get SIGKILLed
         */
-       if (!vma_has_reserves(vma, chg) &&
-                       h->free_huge_pages - h->resv_huge_pages == 0)
+       if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
                goto err;
 
        /* If reserves cannot be used, ensure enough pages are in the pool */
-       if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+       if (avoid_reserve && !available_huge_pages(h))
                goto err;
 
        gfp_mask = htlb_alloc_mask(h);
@@ -1308,12 +1322,13 @@ static void __destroy_compound_gigantic_page(struct page *page,
 {
        int i;
        int nr_pages = 1 << order;
-       struct page *p = page + 1;
+       struct page *p;
 
        atomic_set(compound_mapcount_ptr(page), 0);
        atomic_set(compound_pincount_ptr(page), 0);
 
-       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+       for (i = 1; i < nr_pages; i++) {
+               p = nth_page(page, i);
                p->mapping = NULL;
                clear_compound_head(p);
                if (!demote)
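
This hunk, along with the later __update_and_free_page(), __prep_compound_gigantic_page() and follow_hugetlb_page() hunks, replaces mem_map_next()/raw pointer stepping with nth_page() indexed from the head page. For reference, nth_page() is defined in include/linux/mm.h roughly as follows (paraphrased, not part of this diff); it stays correct even when struct pages are not virtually contiguous:

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
#else
#define nth_page(page, n)	((page) + (n))	/* mem_map is virtually contiguous */
#endif
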
@@ -1506,6 +1521,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
 
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        set_page_private(page, 0);
+       /*
+        * We have to set HPageVmemmapOptimized again because the
+        * set_page_private(page, 0) call above cleared it.
+        */
        SetHPageVmemmapOptimized(page);
 
        /*
@@ -1530,7 +1549,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
 static void __update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
-       struct page *subpage = page;
+       struct page *subpage;
 
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
@@ -1561,8 +1580,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (unlikely(PageHWPoison(page)))
                hugetlb_clear_page_hwpoison(page);
 
-       for (i = 0; i < pages_per_huge_page(h);
-            i++, subpage = mem_map_next(subpage, page, i)) {
+       for (i = 0; i < pages_per_huge_page(h); i++) {
+               subpage = nth_page(page, i);
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_private |
@@ -1769,13 +1788,14 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
 {
        int i, j;
        int nr_pages = 1 << order;
-       struct page *p = page + 1;
+       struct page *p;
 
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
-       __ClearPageReserved(page);
        __SetPageHead(page);
-       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+       for (i = 0; i < nr_pages; i++) {
+               p = nth_page(page, i);
+
                /*
                 * For gigantic hugepages allocated through bootmem at
                 * boot, it's safer to be consistent with the not-gigantic
@@ -1814,22 +1834,26 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
                } else {
                        VM_BUG_ON_PAGE(page_count(p), p);
                }
-               set_compound_head(p, page);
+               if (i != 0)
+                       set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
        atomic_set(compound_pincount_ptr(page), 0);
        return true;
 
 out_error:
-       /* undo tail page modifications made above */
-       p = page + 1;
-       for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
-               clear_compound_head(p);
+       /* undo page modifications made above */
+       for (j = 0; j < i; j++) {
+               p = nth_page(page, j);
+               if (j != 0)
+                       clear_compound_head(p);
                set_page_refcounted(p);
        }
        /* need to clear PG_reserved on remaining tail pages  */
-       for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+       for (; j < nr_pages; j++) {
+               p = nth_page(page, j);
                __ClearPageReserved(p);
+       }
        set_compound_order(page, 0);
 #ifdef CONFIG_64BIT
        page[1].compound_nr = 0;
@@ -1918,6 +1942,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        int order = huge_page_order(h);
        struct page *page;
        bool alloc_try_hard = true;
+       bool retry = true;
 
        /*
         * By default we always try hard to allocate the page with
@@ -1933,7 +1958,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
                gfp_mask |= __GFP_RETRY_MAYFAIL;
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
+retry:
        page = __alloc_pages(gfp_mask, order, nid, nmask);
+
+       /* Freeze head page */
+       if (page && !page_ref_freeze(page, 1)) {
+               __free_pages(page, order);
+               if (retry) {    /* retry once */
+                       retry = false;
+                       goto retry;
+               }
+               /* WOW!  twice in a row. */
+               pr_warn("HugeTLB head page unexpected inflated ref count\n");
+               page = NULL;
+       }
+
        if (page)
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        else
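
The retry above relies on page_ref_freeze(page, 1), which succeeds only if the refcount is exactly 1 and atomically drops it to zero. A minimal userspace sketch of the same freeze-then-retry-once pattern (fake_page, ref_freeze() and alloc_page_model() are made-up stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_page { atomic_int refcount; };

static bool ref_freeze(struct fake_page *p, int expected)
{
	/* models page_ref_freeze(): atomically replace 'expected' with 0, else fail */
	return atomic_compare_exchange_strong(&p->refcount, &expected, 0);
}

static struct fake_page *alloc_page_model(void)
{
	struct fake_page *p = malloc(sizeof(*p));

	if (p)
		atomic_init(&p->refcount, 1);	/* freshly allocated: one reference */
	return p;
}

static struct fake_page *alloc_frozen_page(void)
{
	bool retry = true;
	struct fake_page *p;

again:
	p = alloc_page_model();
	if (p && !ref_freeze(p, 1)) {
		free(p);			/* a transient extra ref was found */
		if (retry) {
			retry = false;
			goto again;		/* retry exactly once */
		}
		p = NULL;
	}
	return p;
}

int main(void)
{
	struct fake_page *p = alloc_frozen_page();

	printf("frozen page %s\n", p ? "allocated" : "unavailable");
	free(p);
	return 0;
}
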
@@ -1961,6 +2000,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 /*
  * Common helper to allocate a fresh hugetlb page. All specific allocators
  * should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen':  ref count of head page and all tail
+ * pages is zero.
  */
 static struct page *alloc_fresh_huge_page(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
@@ -2018,7 +2060,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
        if (!page)
                return 0;
 
-       put_page(page); /* free it into the hugepage allocator */
+       free_huge_page(page); /* free it into the hugepage allocator */
 
        return 1;
 }
@@ -2087,7 +2129,7 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               if (h->free_huge_pages - h->resv_huge_pages == 0)
+               if (!available_huge_pages(h))
                        goto out;
 
                /*
@@ -2175,10 +2217,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
  * Allocates a fresh surplus page from the page allocator.
  */
 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
-               int nid, nodemask_t *nmask, bool zero_ref)
+                                               int nid, nodemask_t *nmask)
 {
        struct page *page = NULL;
-       bool retry = false;
 
        if (hstate_is_gigantic(h))
                return NULL;
@@ -2188,7 +2229,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                goto out_unlock;
        spin_unlock_irq(&hugetlb_lock);
 
-retry:
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;
@@ -2204,34 +2244,10 @@ retry:
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetHPageTemporary(page);
                spin_unlock_irq(&hugetlb_lock);
-               put_page(page);
+               free_huge_page(page);
                return NULL;
        }
 
-       if (zero_ref) {
-               /*
-                * Caller requires a page with zero ref count.
-                * We will drop ref count here.  If someone else is holding
-                * a ref, the page will be freed when they drop it.  Abuse
-                * temporary page flag to accomplish this.
-                */
-               SetHPageTemporary(page);
-               if (!put_page_testzero(page)) {
-                       /*
-                        * Unexpected inflated ref count on freshly allocated
-                        * huge.  Retry once.
-                        */
-                       pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
-                       spin_unlock_irq(&hugetlb_lock);
-                       if (retry)
-                               return NULL;
-
-                       retry = true;
-                       goto retry;
-               }
-               ClearHPageTemporary(page);
-       }
-
        h->surplus_huge_pages++;
        h->surplus_huge_pages_node[page_to_nid(page)]++;
 
@@ -2253,6 +2269,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
        if (!page)
                return NULL;
 
+       /* fresh huge pages are frozen */
+       set_page_refcounted(page);
+
        /*
         * We do not account these pages as surplus because they are only
         * temporary and will be released properly on the last reference
@@ -2280,14 +2299,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                gfp_t gfp = gfp_mask | __GFP_NOWARN;
 
                gfp &=  ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
-               page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+               page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
 
                /* Fallback to all nodes if page==NULL */
                nodemask = NULL;
        }
 
        if (!page)
-               page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+               page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
        mpol_cond_put(mpol);
        return page;
 }
@@ -2297,7 +2316,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
 {
        spin_lock_irq(&hugetlb_lock);
-       if (h->free_huge_pages - h->resv_huge_pages > 0) {
+       if (available_huge_pages(h)) {
                struct page *page;
 
                page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
@@ -2336,7 +2355,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 static int gather_surplus_pages(struct hstate *h, long delta)
        __must_hold(&hugetlb_lock)
 {
-       struct list_head surplus_list;
+       LIST_HEAD(surplus_list);
        struct page *page, *tmp;
        int ret;
        long i;
@@ -2351,14 +2370,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        }
 
        allocated = 0;
-       INIT_LIST_HEAD(&surplus_list);
 
        ret = -ENOMEM;
 retry:
        spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
-                               NUMA_NO_NODE, NULL, true);
+                               NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
                        break;
@@ -2720,7 +2738,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 {
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        int nid = page_to_nid(old_page);
-       bool alloc_retry = false;
        struct page *new_page;
        int ret = 0;
 
@@ -2731,30 +2748,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
         * the pool.  This simplifies and let us do most of the processing
         * under the lock.
         */
-alloc_retry:
        new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
        if (!new_page)
                return -ENOMEM;
-       /*
-        * If all goes well, this page will be directly added to the free
-        * list in the pool.  For this the ref count needs to be zero.
-        * Attempt to drop now, and retry once if needed.  It is VERY
-        * unlikely there is another ref on the page.
-        *
-        * If someone else has a reference to the page, it will be freed
-        * when they drop their ref.  Abuse temporary page flag to accomplish
-        * this.  Retry once if there is an inflated ref count.
-        */
-       SetHPageTemporary(new_page);
-       if (!put_page_testzero(new_page)) {
-               if (alloc_retry)
-                       return -EBUSY;
-
-               alloc_retry = true;
-               goto alloc_retry;
-       }
-       ClearHPageTemporary(new_page);
-
        __prep_new_huge_page(h, new_page);
 
 retry:
@@ -2934,6 +2930,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                }
                spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
+               set_page_refcounted(page);
                /* Fall through */
        }
        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3038,7 +3035,7 @@ static void __init gather_bootmem_prealloc(void)
                if (prep_compound_gigantic_page(page, huge_page_order(h))) {
                        WARN_ON(PageReserved(page));
                        prep_new_huge_page(h, page, page_to_nid(page));
-                       put_page(page); /* add to the hugepage allocator */
+                       free_huge_page(page); /* add to the hugepage allocator */
                } else {
                        /* VERY unlikely inflated ref count on a tail page */
                        free_gigantic_page(page, huge_page_order(h));
@@ -3070,7 +3067,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
                                        &node_states[N_MEMORY], NULL);
                        if (!page)
                                break;
-                       put_page(page); /* free it into the hugepage allocator */
+                       free_huge_page(page); /* free it into the hugepage allocator */
                }
                cond_resched();
        }
@@ -3461,9 +3458,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
                else
                        prep_compound_page(subpage, target_hstate->order);
                set_page_private(subpage, 0);
-               set_page_refcounted(subpage);
                prep_new_huge_page(target_hstate, subpage, nid);
-               put_page(subpage);
+               free_huge_page(subpage);
        }
        mutex_unlock(&target_hstate->resize_lock);
 
@@ -3474,7 +3470,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
         * based on pool changes for the demoted page.
         */
        h->max_huge_pages--;
-       target_hstate->max_huge_pages += pages_per_huge_page(h);
+       target_hstate->max_huge_pages +=
+               pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
 
        return rc;
 }
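
A worked example of the corrected accounting, assuming x86-64 with 4 KB base pages: demoting one 1 GB page (pages_per_huge_page(h) == 262144) into 2 MB pages (pages_per_huge_page(target_hstate) == 512) yields 262144 / 512 = 512 new 2 MB pages, so the target hstate's max_huge_pages grows by 512 rather than by 262144 as the old '+= pages_per_huge_page(h)' did.
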
@@ -3716,7 +3713,7 @@ static ssize_t demote_store(struct kobject *kobj,
        unsigned long nr_available;
        nodemask_t nodes_allowed, *n_mask;
        struct hstate *h;
-       int err = 0;
+       int err;
        int nid;
 
        err = kstrtoul(buf, 10, &nr_demote);
@@ -3767,8 +3764,7 @@ HSTATE_ATTR_WO(demote);
 static ssize_t demote_size_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-       int nid;
-       struct hstate *h = kobj_to_hstate(kobj, &nid);
+       struct hstate *h = kobj_to_hstate(kobj, NULL);
        unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
 
        return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3781,7 +3777,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
        struct hstate *h, *demote_hstate;
        unsigned long demote_size;
        unsigned int demote_order;
-       int nid;
 
        demote_size = (unsigned long)memparse(buf, NULL);
 
@@ -3793,7 +3788,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
                return -EINVAL;
 
        /* demote order must be smaller than hstate order */
-       h = kobj_to_hstate(kobj, &nid);
+       h = kobj_to_hstate(kobj, NULL);
        if (demote_order >= h->order)
                return -EINVAL;
 
@@ -3847,35 +3842,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
        if (retval) {
                kobject_put(hstate_kobjs[hi]);
                hstate_kobjs[hi] = NULL;
+               return retval;
        }
 
        if (h->demote_order) {
-               if (sysfs_create_group(hstate_kobjs[hi],
-                                       &hstate_demote_attr_group))
+               retval = sysfs_create_group(hstate_kobjs[hi],
+                                           &hstate_demote_attr_group);
+               if (retval) {
                        pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+                       sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+                       kobject_put(hstate_kobjs[hi]);
+                       hstate_kobjs[hi] = NULL;
+                       return retval;
+               }
        }
 
-       return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
-       struct hstate *h;
-       int err;
-
-       hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
-       if (!hugepages_kobj)
-               return;
-
-       for_each_hstate(h) {
-               err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
-                                        hstate_kobjs, &hstate_attr_group);
-               if (err)
-                       pr_err("HugeTLB: Unable to add hstate %s", h->name);
-       }
+       return 0;
 }
 
 #ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
 
 /*
  * node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3931,7 +3917,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
  * Unregister hstate attributes from a single node device.
  * No-op if no hstate attributes attached.
  */
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
 {
        struct hstate *h;
        struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3941,10 +3927,15 @@ static void hugetlb_unregister_node(struct node *node)
 
        for_each_hstate(h) {
                int idx = hstate_index(h);
-               if (nhs->hstate_kobjs[idx]) {
-                       kobject_put(nhs->hstate_kobjs[idx]);
-                       nhs->hstate_kobjs[idx] = NULL;
-               }
+               struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+               if (!hstate_kobj)
+                       continue;
+               if (h->demote_order)
+                       sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+               sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+               kobject_put(hstate_kobj);
+               nhs->hstate_kobjs[idx] = NULL;
        }
 
        kobject_put(nhs->hugepages_kobj);
@@ -3956,12 +3947,15 @@ static void hugetlb_unregister_node(struct node *node)
  * Register hstate attributes for a single node device.
  * No-op if attributes already registered.
  */
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
 {
        struct hstate *h;
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        int err;
 
+       if (!hugetlb_sysfs_initialized)
+               return;
+
        if (nhs->hugepages_kobj)
                return;         /* already allocated */
 
@@ -3992,18 +3986,8 @@ static void __init hugetlb_register_all_nodes(void)
 {
        int nid;
 
-       for_each_node_state(nid, N_MEMORY) {
-               struct node *node = node_devices[nid];
-               if (node->dev.id == nid)
-                       hugetlb_register_node(node);
-       }
-
-       /*
-        * Let the node device driver know we're here so it can
-        * [un]register hstate attributes on node hotplug.
-        */
-       register_hugetlbfs_with_node(hugetlb_register_node,
-                                    hugetlb_unregister_node);
+       for_each_online_node(nid)
+               hugetlb_register_node(node_devices[nid]);
 }
 #else  /* !CONFIG_NUMA */
 
@@ -4019,6 +4003,36 @@ static void hugetlb_register_all_nodes(void) { }
 
 #endif
 
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+       struct hstate *h;
+       int err;
+
+       hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+       if (!hugepages_kobj)
+               return;
+
+       for_each_hstate(h) {
+               err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+                                        hstate_kobjs, &hstate_attr_group);
+               if (err)
+                       pr_err("HugeTLB: Unable to add hstate %s", h->name);
+       }
+
+#ifdef CONFIG_NUMA
+       hugetlb_sysfs_initialized = true;
+#endif
+       hugetlb_register_all_nodes();
+}
+
 static int __init hugetlb_init(void)
 {
        int i;
@@ -4073,7 +4087,6 @@ static int __init hugetlb_init(void)
        report_hugepages();
 
        hugetlb_sysfs_init();
-       hugetlb_register_all_nodes();
        hugetlb_cgroup_file_init();
 
 #ifdef CONFIG_SMP
@@ -4118,7 +4131,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_alloc = first_memory_node;
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
-                                       huge_page_size(h)/1024);
+                                       huge_page_size(h)/SZ_1K);
 
        parsed_hstate = h;
 }
@@ -4133,11 +4146,11 @@ static void __init hugepages_clear_pages_in_node(void)
        if (!hugetlb_max_hstate) {
                default_hstate_max_huge_pages = 0;
                memset(default_hugepages_in_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(default_hugepages_in_node));
        } else {
                parsed_hstate->max_huge_pages = 0;
                memset(parsed_hstate->max_huge_pages_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(parsed_hstate->max_huge_pages_node));
        }
 }
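
The memset() changes above rely on default_hugepages_in_node and max_huge_pages_node being true arrays, so sizeof() yields the full byte size and stays correct if the element type or node count ever changes. A standalone userspace illustration (the MAX_NUMNODES value here is arbitrary, not the kernel's):

#include <stdio.h>
#include <string.h>

#define MAX_NUMNODES 64			/* arbitrary illustrative value */

int main(void)
{
	unsigned int per_node[MAX_NUMNODES];

	/* For a true array, sizeof(per_node) == MAX_NUMNODES * sizeof(unsigned int). */
	memset(per_node, 0, sizeof(per_node));
	printf("cleared %zu bytes\n", sizeof(per_node));
	return 0;
}
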
 
@@ -4332,18 +4345,34 @@ static int __init default_hugepagesz_setup(char *s)
 }
 __setup("default_hugepagesz=", default_hugepagesz_setup);
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+       struct mempolicy *mpol = get_task_policy(current);
+
+       /*
+        * Only enforce the MPOL_BIND policy which overlaps with cpuset policy
+        * (from policy_nodemask) specifically for the hugetlb case
+        */
+       if (mpol->mode == MPOL_BIND &&
+               (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+                cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+               return &mpol->nodes;
+#endif
+       return NULL;
+}
+
 static unsigned int allowed_mems_nr(struct hstate *h)
 {
        int node;
        unsigned int nr = 0;
-       nodemask_t *mpol_allowed;
+       nodemask_t *mbind_nodemask;
        unsigned int *array = h->free_huge_pages_node;
        gfp_t gfp_mask = htlb_alloc_mask(h);
 
-       mpol_allowed = policy_nodemask_current(gfp_mask);
-
+       mbind_nodemask = policy_mbind_nodemask(gfp_mask);
        for_each_node_mask(node, cpuset_current_mems_allowed) {
-               if (!mpol_allowed || node_isset(node, *mpol_allowed))
+               if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
                        nr += array[node];
        }
 
@@ -4583,16 +4612,28 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
                resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
                kref_get(&resv->refs);
        }
+
+       /*
+        * The vma_lock structure for sharable mappings is vma specific.
+        * Clear the old pointer (from vm_area_dup) and create a new one.
+        */
+       if (vma->vm_flags & VM_MAYSHARE) {
+               vma->vm_private_data = NULL;
+               hugetlb_vma_lock_alloc(vma);
+       }
 }
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
        struct hstate *h = hstate_vma(vma);
-       struct resv_map *resv = vma_resv_map(vma);
+       struct resv_map *resv;
        struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long reserve, start, end;
        long gbl_reserve;
 
+       hugetlb_vma_lock_free(vma);
+
+       resv = vma_resv_map(vma);
        if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;
 
@@ -4723,14 +4764,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma)
 {
-       pte_t *src_pte, *dst_pte, entry, dst_entry;
+       pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        bool cow = is_cow_mapping(src_vma->vm_flags);
        struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
-       struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        unsigned long last_addr_mask;
        int ret = 0;
@@ -4744,12 +4784,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
-                * For shared mappings i_mmap_rwsem must be held to call
-                * huge_pte_alloc, otherwise the returned ptep could go
-                * away if part of a shared pmd and another thread calls
-                * huge_pmd_unshare.
+                * For shared mappings the vma lock must be held before
+                * calling huge_pte_offset in the src vma. Otherwise, the
+                * returned ptep could go away if part of a shared pmd and
+                * another thread calls huge_pmd_unshare.
                 */
-               i_mmap_lock_read(mapping);
+               hugetlb_vma_lock_read(src_vma);
        }
 
        last_addr_mask = hugetlb_mask_last_page(h);
@@ -4768,15 +4808,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
                /*
                 * If the pagetables are shared don't copy or take references.
-                * dst_pte == src_pte is the common case of src/dest sharing.
                 *
+                * dst_pte == src_pte is the common case of src/dest sharing.
                 * However, src could have 'unshared' and dst shares with
-                * another vma.  If dst_pte !none, this implies sharing.
-                * Check here before taking page table lock, and once again
-                * after taking the lock below.
+                * another vma.  So the page_count of the ptep page is checked
+                * instead to reliably determine whether the pte is shared.
                 */
-               dst_entry = huge_ptep_get(dst_pte);
-               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+               if (page_count(virt_to_page(dst_pte)) > 1) {
                        addr |= last_addr_mask;
                        continue;
                }
@@ -4785,13 +4823,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
-               dst_entry = huge_ptep_get(dst_pte);
 again:
-               if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+               if (huge_pte_none(entry)) {
                        /*
-                        * Skip if src entry none.  Also, skip in the
-                        * unlikely case dst entry !none as this implies
-                        * sharing with another vma.
+                        * Skip if src entry none.
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4870,7 +4905,7 @@ again:
                                        restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
-                                       /* dst_entry won't change as in child */
+                                       /* huge_ptep of dst_pte won't change as in child */
                                        goto again;
                                }
                                hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -4902,7 +4937,7 @@ again:
                raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        } else {
-               i_mmap_unlock_read(mapping);
+               hugetlb_vma_unlock_read(src_vma);
        }
 
        return ret;
@@ -4961,6 +4996,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        mmu_notifier_invalidate_range_start(&range);
        last_addr_mask = hugetlb_mask_last_page(h);
        /* Prevent race with file truncation */
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(mapping);
        for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -4992,6 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
+       hugetlb_vma_unlock_write(vma);
 
        return len + old_addr - old_end;
 }
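
The locking order used here is repeated at the other write-side call sites in this patch (hugetlb_change_protection(), hugetlb_unshare_all_pmds()): take the new per-vma lock first, then i_mmap_rwsem, and release in reverse; __unmap_hugepage_range_final() is the deliberate exception, freeing the vma lock while still holding i_mmap_rwsem. Shown schematically with hypothetical wrappers that are not part of the patch:

/* Hypothetical helpers (not in this patch) restating the ordering above. */
static void hugetlb_write_lock_both(struct vm_area_struct *vma)
{
	hugetlb_vma_lock_write(vma);			/* per-vma lock first */
	i_mmap_lock_write(vma->vm_file->f_mapping);	/* then i_mmap_rwsem */
}

static void hugetlb_write_unlock_both(struct vm_area_struct *vma)
{
	i_mmap_unlock_write(vma->vm_file->f_mapping);	/* release in reverse order */
	hugetlb_vma_unlock_write(vma);
}
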
@@ -5139,19 +5176,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          unsigned long end, struct page *ref_page,
                          zap_flags_t zap_flags)
 {
+       hugetlb_vma_lock_write(vma);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
        __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
        /*
-        * Clear this flag so that x86's huge_pmd_share page_table_shareable
-        * test will fail on a vma being torn down, and not grab a page table
-        * on its way out.  We're lucky that the flag has such an appropriate
-        * name, and can in fact be safely cleared here. We could clear it
-        * before the __unmap_hugepage_range above, but all that's necessary
-        * is to clear it before releasing the i_mmap_rwsem. This works
-        * because in the context this is called, the VMA is about to be
-        * destroyed and the i_mmap_rwsem is held.
+        * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
+        * the vma_lock is freed, this makes the vma ineligible for pmd
+        * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
+        * This is important as page tables for this unmapped range will
+        * be asynchronously deleted.  If the page tables are shared, there
+        * will be issues when they are accessed by someone else.
         */
-       vma->vm_flags &= ~VM_MAYSHARE;
+       __hugetlb_vma_unlock_write_free(vma);
+
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
@@ -5316,11 +5356,10 @@ retry_avoidcopy:
                        u32 hash;
 
                        put_page(old_page);
-                       BUG_ON(huge_pte_none(pte));
                        /*
-                        * Drop hugetlb_fault_mutex and i_mmap_rwsem before
-                        * unmapping.  unmapping needs to hold i_mmap_rwsem
-                        * in write mode.  Dropping i_mmap_rwsem in read mode
+                        * Drop hugetlb_fault_mutex and vma_lock before
+                        * unmapping.  unmapping needs to hold vma_lock
+                        * in write mode.  Dropping vma_lock in read mode
                         * here is OK as COW mappings do not interact with
                         * PMD sharing.
                         *
@@ -5328,13 +5367,13 @@ retry_avoidcopy:
                         */
                        idx = vma_hugecache_offset(h, vma, haddr);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
+                       hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
 
                        unmap_ref_private(mm, vma, old_page, haddr);
 
-                       i_mmap_lock_read(mapping);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_vma_lock_read(vma);
                        spin_lock(ptl);
                        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
                        if (likely(ptep &&
@@ -5408,19 +5447,6 @@ out_release_old:
        return ret;
 }
 
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
-{
-       struct address_space *mapping;
-       pgoff_t idx;
-
-       mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, address);
-
-       return find_lock_page(mapping, idx);
-}
-
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5441,7 +5467,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
        return page != NULL;
 }
 
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
                           pgoff_t idx)
 {
        struct folio *folio = page_folio(page);
@@ -5478,7 +5504,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
                                                  unsigned long addr,
                                                  unsigned long reason)
 {
-       vm_fault_t ret;
        u32 hash;
        struct vm_fault vmf = {
                .vma = vma,
@@ -5496,18 +5521,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        };
 
        /*
-        * hugetlb_fault_mutex and i_mmap_rwsem must be
-        * dropped before handling userfault.  Reacquire
-        * after handling fault to make calling code simpler.
+        * vma_lock and hugetlb_fault_mutex must be dropped before handling
+        * userfault.  Also, mmap_lock could be dropped while handling the
+        * userfault, so be careful with any vma operation from here on.
         */
+       hugetlb_vma_unlock_read(vma);
        hash = hugetlb_fault_mutex_hash(mapping, idx);
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-       i_mmap_unlock_read(mapping);
-       ret = handle_userfault(&vmf, reason);
-       i_mmap_lock_read(mapping);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-       return ret;
+       return handle_userfault(&vmf, reason);
 }
 
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -5525,6 +5546,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        spinlock_t *ptl;
        unsigned long haddr = address & huge_page_mask(h);
        bool new_page, new_pagecache_page = false;
+       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -5535,29 +5557,24 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
-               return ret;
+               goto out;
        }
 
        /*
-        * We can not race with truncation due to holding i_mmap_rwsem.
-        * i_size is modified when holding i_mmap_rwsem, so check here
-        * once for faults beyond end of file.
+        * Use page lock to guard against racing truncation
+        * before we get page_table_lock.
         */
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto out;
-
-retry:
        new_page = false;
        page = find_lock_page(mapping, idx);
        if (!page) {
+               size = i_size_read(mapping->host) >> huge_page_shift(h);
+               if (idx >= size)
+                       goto out;
                /* Check for page in userfault range */
-               if (userfaultfd_missing(vma)) {
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+               if (userfaultfd_missing(vma))
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr, address,
                                                       VM_UFFD_MISSING);
-                       goto out;
-               }
 
                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
@@ -5585,11 +5602,17 @@ retry:
                new_page = true;
 
                if (vma->vm_flags & VM_MAYSHARE) {
-                       int err = huge_add_to_page_cache(page, mapping, idx);
+                       int err = hugetlb_add_to_page_cache(page, mapping, idx);
                        if (err) {
+                               /*
+                                * err can't be -EEXIST (which would imply that
+                                * someone else consumed the reservation) because
+                                * the hugetlb fault mutex is held while adding a
+                                * hugetlb page to the page cache.  So it is safe
+                                * to call restore_reserve_on_error() here.
+                                */
+                               restore_reserve_on_error(h, vma, haddr, page);
                                put_page(page);
-                               if (err == -EEXIST)
-                                       goto retry;
                                goto out;
                        }
                        new_pagecache_page = true;
@@ -5617,10 +5640,9 @@ retry:
                if (userfaultfd_minor(vma)) {
                        unlock_page(page);
                        put_page(page);
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr, address,
                                                       VM_UFFD_MINOR);
-                       goto out;
                }
        }
 
@@ -5678,15 +5700,17 @@ retry:
 
        unlock_page(page);
 out:
+       hugetlb_vma_unlock_read(vma);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        return ret;
 
 backout:
        spin_unlock(ptl);
 backout_unlocked:
-       unlock_page(page);
-       /* restore reserve for newly allocated pages not in page cache */
        if (new_page && !new_pagecache_page)
                restore_reserve_on_error(h, vma, haddr, page);
+
+       unlock_page(page);
        put_page(page);
        goto out;
 }
@@ -5747,40 +5771,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /*
-        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-        * until finished with ptep.  This serves two purposes:
-        * 1) It prevents huge_pmd_unshare from being called elsewhere
-        *    and making the ptep no longer valid.
-        * 2) It synchronizes us with i_size modifications during truncation.
+        * Serialize hugepage allocation and instantiation, so that we don't
+        * get spurious allocation failures if two CPUs race to instantiate
+        * the same page in the page cache.
+        */
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, vma, haddr);
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       /*
+        * Acquire vma lock before calling huge_pte_alloc and hold
+        * until finished with ptep.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the ptep no longer valid.
         *
         * ptep could have already be assigned via huge_pte_offset.  That
         * is OK, as huge_pte_alloc will return the same value unless
         * something has changed.
         */
-       mapping = vma->vm_file->f_mapping;
-       i_mmap_lock_read(mapping);
+       hugetlb_vma_lock_read(vma);
        ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
-               i_mmap_unlock_read(mapping);
+               hugetlb_vma_unlock_read(vma);
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }
 
-       /*
-        * Serialize hugepage allocation and instantiation, so that we don't
-        * get spurious allocation failures if two CPUs race to instantiate
-        * the same page in the page cache.
-        */
-       idx = vma_hugecache_offset(h, vma, haddr);
-       hash = hugetlb_fault_mutex_hash(mapping, idx);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
        entry = huge_ptep_get(ptep);
        /* PTE markers should be handled the same way as none pte */
-       if (huge_pte_none_mostly(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+       if (huge_pte_none_mostly(entry))
+               /*
+                * hugetlb_no_page() drops the vma lock and the hugetlb fault
+                * mutex internally, so return its result directly.
+                */
+               return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
                                      entry, flags);
-               goto out_mutex;
-       }
 
        ret = 0;
 
@@ -5810,7 +5835,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
 
-               pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+               pagecache_page = find_lock_page(mapping, idx);
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
@@ -5834,8 +5859,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unlock_page(pagecache_page);
                        put_page(pagecache_page);
                }
+               hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-               i_mmap_unlock_read(mapping);
                return handle_userfault(&vmf, VM_UFFD_WP);
        }
 
@@ -5878,8 +5903,8 @@ out_ptl:
                put_page(pagecache_page);
        }
 out_mutex:
+       hugetlb_vma_unlock_read(vma);
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-       i_mmap_unlock_read(mapping);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
@@ -6007,39 +6032,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
                /*
                 * Serialization between remove_inode_hugepages() and
-                * huge_add_to_page_cache() below happens through the
+                * hugetlb_add_to_page_cache() below happens through the
                 * hugetlb_fault_mutex_table that here must be hold by
                 * the caller.
                 */
-               ret = huge_add_to_page_cache(page, mapping, idx);
+               ret = hugetlb_add_to_page_cache(page, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
                page_in_pagecache = true;
        }
 
-       ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(h, dst_mm, dst_pte);
 
-       /*
-        * Recheck the i_size after holding PT lock to make sure not
-        * to leave any page mapped (as page_mapped()) beyond the end
-        * of the i_size (remove_inode_hugepages() is strict about
-        * enforcing that). If we bail out here, we'll also leave a
-        * page in the radix tree in the vm_shared case beyond the end
-        * of the i_size, but remove_inode_hugepages() will take care
-        * of it as soon as we drop the hugetlb_fault_mutex_table.
-        */
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       ret = -EFAULT;
-       if (idx >= size)
-               goto out_release_unlock;
-
-       ret = -EEXIST;
        /*
         * We allow to overwrite a pte marker: consider when both MISSING|WP
         * registered, we firstly wr-protect a none pte which has no page cache
         * page backing it, then access the page.
         */
+       ret = -EEXIST;
        if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
@@ -6107,7 +6117,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 
        for (nr = 0; nr < refs; nr++) {
                if (likely(pages))
-                       pages[nr] = mem_map_offset(page, nr);
+                       pages[nr] = nth_page(page, nr);
                if (vmas)
                        vmas[nr] = vma;
        }
@@ -6271,7 +6281,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
                if (pages || vmas)
-                       record_subpages_vmas(mem_map_offset(page, pfn_offset),
+                       record_subpages_vmas(nth_page(page, pfn_offset),
                                             vma, refs,
                                             likely(pages) ? pages + i : NULL,
                                             vmas ? vmas + i : NULL);
@@ -6342,8 +6352,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
-       last_addr_mask = hugetlb_mask_last_page(h);
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
+       last_addr_mask = hugetlb_mask_last_page(h);
        for (; address < end; address += psize) {
                spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address, psize);
@@ -6442,6 +6453,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * See Documentation/mm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
        mmu_notifier_invalidate_range_end(&range);
 
        return pages << h->order;
@@ -6466,6 +6478,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
                return false;
        }
 
+       /*
+        * vma specific semaphore used for pmd sharing synchronization
+        */
+       hugetlb_vma_lock_alloc(vma);
+
        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
@@ -6489,12 +6506,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
                resv_map = inode_resv_map(inode);
 
                chg = region_chg(resv_map, from, to, &regions_needed);
-
        } else {
                /* Private mapping. */
                resv_map = resv_map_alloc();
                if (!resv_map)
-                       return false;
+                       goto out_err;
 
                chg = to - from;
 
@@ -6589,6 +6605,7 @@ out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
 out_err:
+       hugetlb_vma_lock_free(vma);
        if (!vma || vma->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
@@ -6658,35 +6675,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        /*
         * match the virtual addresses, permission and the alignment of the
         * page table page.
+        *
+        * Also, vma_lock (vm_private_data) is required for sharing.
         */
        if (pmd_index(addr) != pmd_index(saddr) ||
            vm_flags != svm_flags ||
-           !range_in_vma(svma, sbase, s_end))
+           !range_in_vma(svma, sbase, s_end) ||
+           !svma->vm_private_data)
                return 0;
 
        return saddr;
 }
 
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
-       unsigned long base = addr & PUD_MASK;
-       unsigned long end = base + PUD_SIZE;
-
-       /*
-        * check on proper vm_flags and page table alignment
-        */
-       if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
-               return true;
-       return false;
-}
-
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
+       unsigned long start = addr & PUD_MASK;
+       unsigned long end = start + PUD_SIZE;
+
 #ifdef CONFIG_USERFAULTFD
        if (uffd_disable_huge_pmd_share(vma))
                return false;
 #endif
-       return vma_shareable(vma, addr);
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return false;
+       if (!vma->vm_private_data)      /* vma lock required for sharing */
+               return false;
+       if (!range_in_vma(vma, start, end))
+               return false;
+       return true;
 }
 
 /*
@@ -6716,16 +6735,157 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                *end = ALIGN(*end, PUD_SIZE);
 }
 
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+               vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_write(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_write(&vma_lock->rw_sema);
+       }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+       if (!__vma_shareable_flags_pmd(vma))
+               return 1;
+
+       return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               lockdep_assert_held(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+       struct hugetlb_vma_lock *vma_lock = container_of(kref,
+                       struct hugetlb_vma_lock, refs);
+
+       kfree(vma_lock);
+}
+
+void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+       struct vm_area_struct *vma = vma_lock->vma;
+
+       /*
+        * The vma_lock structure may or may not be released as a result of
+        * the put; it certainly will no longer be attached to the vma, so
+        * clear the pointer.  The semaphore synchronizes access to the
+        * vma_lock->vma field.
+        */
+       vma_lock->vma = NULL;
+       vma->vm_private_data = NULL;
+       up_write(&vma_lock->rw_sema);
+       kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               __hugetlb_vma_unlock_write_put(vma_lock);
+       }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+       /*
+        * Only present in sharable vmas.
+        */
+       if (!vma || !__vma_shareable_flags_pmd(vma))
+               return;
+
+       if (vma->vm_private_data) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_write(&vma_lock->rw_sema);
+               __hugetlb_vma_unlock_write_put(vma_lock);
+       }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock;
+
+       /* Only establish the lock in sharable (VM_MAYSHARE) vmas */
+       if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       /* Should never get here with non-NULL vm_private_data */
+       if (vma->vm_private_data)
+               return;
+
+       vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+       if (!vma_lock) {
+               /*
+                * If we cannot allocate the structure, then the vma cannot
+                * participate in pmd sharing.  Losing that is only a possible
+                * performance enhancement and memory saving.
+                * However, the lock is also used to synchronize page
+                * faults with truncation.  If the lock is not present,
+                * unlikely races could leave pages in a file past i_size
+                * until the file is removed.  Warn in the unlikely case of
+                * allocation failure.
+                */
+               pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+               return;
+       }
+
+       kref_init(&vma_lock->refs);
+       init_rwsem(&vma_lock->rw_sema);
+       vma_lock->vma = vma;
+       vma->vm_private_data = vma_lock;
+}
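
The kref on hugetlb_vma_lock lets the structure outlive the vma's own reference: __hugetlb_vma_unlock_write_put() detaches it under the write lock, drops the lock, and only then drops the vma's reference, so any other holder can still release its reference safely. A minimal userspace model of that lifetime rule (illustrative only; the types and helpers below are made up, not kernel APIs):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct vma_lock_model {
	atomic_int refs;
	pthread_rwlock_t rw_sema;
	void *vma;			/* back-pointer, cleared on detach */
};

static struct vma_lock_model *vma_lock_model_alloc(void *vma)
{
	struct vma_lock_model *vl = malloc(sizeof(*vl));

	if (!vl)
		return NULL;		/* the vma simply won't participate in sharing */
	atomic_init(&vl->refs, 1);
	pthread_rwlock_init(&vl->rw_sema, NULL);
	vl->vma = vma;
	return vl;
}

static void vma_lock_model_put(struct vma_lock_model *vl)
{
	if (atomic_fetch_sub(&vl->refs, 1) == 1) {
		pthread_rwlock_destroy(&vl->rw_sema);
		free(vl);
	}
}

/* Mirrors __hugetlb_vma_unlock_write_put(): detach while holding the write
 * lock, release the lock, then drop the reference.  (The kernel also clears
 * vma->vm_private_data here, which this model has no vma struct to show.) */
static void vma_lock_model_unlock_write_and_detach(struct vma_lock_model *vl)
{
	vl->vma = NULL;
	pthread_rwlock_unlock(&vl->rw_sema);
	vma_lock_model_put(vl);
}

int main(void)
{
	int dummy_vma;
	struct vma_lock_model *vl = vma_lock_model_alloc(&dummy_vma);

	if (vl) {
		pthread_rwlock_wrlock(&vl->rw_sema);
		vma_lock_model_unlock_write_and_detach(vl);
	}
	return 0;
}
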
+
 /*
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible.  For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space.  This is important as we
- * are setting up sharing based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
@@ -6739,7 +6899,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t *pte;
        spinlock_t *ptl;
 
-       i_mmap_assert_locked(mapping);
+       i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;
@@ -6769,6 +6929,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_unlock(ptl);
 out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       i_mmap_unlock_read(mapping);
        return pte;
 }
 
@@ -6779,7 +6940,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *         0 the underlying pte page is not shared, or it is the last user
@@ -6792,6 +6953,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_t *pud = pud_offset(p4d, addr);
 
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+       hugetlb_vma_assert_locked(vma);
        BUG_ON(page_count(virt_to_page(ptep)) == 0);
        if (page_count(virt_to_page(ptep)) == 1)
                return 0;
@@ -6803,6 +6965,48 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
 {
@@ -7173,6 +7377,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                start, end);
        mmu_notifier_invalidate_range_start(&range);
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (address = start; address < end; address += PUD_SIZE) {
                ptep = huge_pte_offset(mm, address, sz);
@@ -7184,6 +7389,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        }
        flush_hugetlb_tlb_range(vma, start, end);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
        /*
         * No need to call mmu_notifier_invalidate_range(), see
         * Documentation/mm/mmu_notifier.rst.
@@ -7334,7 +7540,7 @@ void __init hugetlb_cma_reserve(int order)
                hugetlb_cma_size = 0;
 }
 
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
 {
        if (!hugetlb_cma_size || cma_reserve_called)
                return;