diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0bdfc7e..0ad53ad 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/nospec.h>
 #include <linux/delayacct.h>
+#include <linux/memory.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -90,6 +91,9 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
 
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 
 static inline bool subpool_is_free(struct hugepage_subpool *spool)
 {
@@ -257,7 +261,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
 static struct file_region *
 get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
 {
-       struct file_region *nrg = NULL;
+       struct file_region *nrg;
 
        VM_BUG_ON(resv->region_cache_count <= 0);
 
@@ -339,7 +343,7 @@ static bool has_same_uncharge_info(struct file_region *rg,
 
 static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
 {
-       struct file_region *nrg = NULL, *prg = NULL;
+       struct file_region *nrg, *prg;
 
        prg = list_prev_entry(rg, link);
        if (&prg->link != &resv->regions && prg->to == rg->from &&
@@ -456,14 +460,12 @@ static int allocate_file_region_entries(struct resv_map *resv,
                                        int regions_needed)
        __must_hold(&resv->lock)
 {
-       struct list_head allocated_regions;
+       LIST_HEAD(allocated_regions);
        int to_allocate = 0, i = 0;
        struct file_region *trg = NULL, *rg = NULL;
 
        VM_BUG_ON(regions_needed < 0);
 
-       INIT_LIST_HEAD(&allocated_regions);
-
        /*
         * Check for sufficient descriptors in the cache to accommodate
         * the number of in progress add operations plus regions_needed.
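
LIST_HEAD(x) declares and initializes the list head in one step, which is why the separate INIT_LIST_HEAD() call is dropped above (the same conversion appears in the gather_surplus_pages() hunk further down). A standalone userspace illustration with simplified copies of the kernel macros (for reference only, not part of this patch):

#include <stdio.h>

/* Simplified userspace copies of the kernel's list macros, for illustration. */
struct list_head { struct list_head *next, *prev; };

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
#define LIST_HEAD(name)		struct list_head name = LIST_HEAD_INIT(name)

int main(void)
{
	LIST_HEAD(allocated_regions);	/* declared and initialized in one step */

	/* An empty list points back at itself. */
	printf("empty: %d\n", allocated_regions.next == &allocated_regions);
	return 0;
}
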
@@ -860,7 +862,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
  * faults in a MAP_PRIVATE mapping. Only the process that called mmap()
  * is guaranteed to have their future faults succeed.
  *
- * With the exception of reset_vma_resv_huge_pages() which is called at fork(),
+ * With the exception of hugetlb_dup_vma_private() which is called at fork(),
  * the reserve counters are updated with the hugetlb_lock held. It is safe
  * to reset the VMA at fork() time as it is not in use yet and there is no
  * chance of the global counters getting corrupted as a result of the values.
@@ -1007,12 +1009,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
        return (get_vma_private_data(vma) & flag) != 0;
 }
 
-/* Reset counters to 0 and clear all HPAGE_RESV_* flags */
-void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
+void hugetlb_dup_vma_private(struct vm_area_struct *vma)
 {
        VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
+       /*
+        * Clear vm_private_data
+        * - For MAP_PRIVATE mappings, this is the reserve map which does
+        *   not apply to children.  Faults generated by the children are
+        *   not guaranteed to succeed, even if read-only.
+        * - For shared mappings this is a per-vma semaphore that may be
+        *   allocated in a subsequent call to hugetlb_vm_op_open.
+        */
+       vma->vm_private_data = (void *)0;
        if (!(vma->vm_flags & VM_MAYSHARE))
-               vma->vm_private_data = (void *)0;
+               return;
 }
 
 /*
@@ -1043,7 +1053,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
                kref_put(&reservations->refs, resv_map_release);
        }
 
-       reset_vma_resv_huge_pages(vma);
+       hugetlb_dup_vma_private(vma);
 }
 
 /* Returns true if the VMA has associated reserve pages */
@@ -1182,6 +1192,11 @@ retry_cpuset:
        return NULL;
 }
 
+static unsigned long available_huge_pages(struct hstate *h)
+{
+       return h->free_huge_pages - h->resv_huge_pages;
+}
+
 static struct page *dequeue_huge_page_vma(struct hstate *h,
                                struct vm_area_struct *vma,
                                unsigned long address, int avoid_reserve,
@@ -1198,12 +1213,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h,
         * have no page reserves. This check ensures that reservations are
         * not "stolen". The child may still get SIGKILLed
         */
-       if (!vma_has_reserves(vma, chg) &&
-                       h->free_huge_pages - h->resv_huge_pages == 0)
+       if (!vma_has_reserves(vma, chg) && !available_huge_pages(h))
                goto err;
 
        /* If reserves cannot be used, ensure enough pages are in the pool */
-       if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0)
+       if (avoid_reserve && !available_huge_pages(h))
                goto err;
 
        gfp_mask = htlb_alloc_mask(h);
@@ -1308,12 +1322,13 @@ static void __destroy_compound_gigantic_page(struct page *page,
 {
        int i;
        int nr_pages = 1 << order;
-       struct page *p = page + 1;
+       struct page *p;
 
        atomic_set(compound_mapcount_ptr(page), 0);
        atomic_set(compound_pincount_ptr(page), 0);
 
-       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+       for (i = 1; i < nr_pages; i++) {
+               p = nth_page(page, i);
                p->mapping = NULL;
                clear_compound_head(p);
                if (!demote)
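
This hunk, along with the later __update_and_free_page(), __prep_compound_gigantic_page() and follow_hugetlb_page() hunks, replaces mem_map_next()/raw pointer stepping with nth_page() indexed from the head page. For reference, nth_page() is defined in include/linux/mm.h roughly as follows (paraphrased, not part of this diff); it stays correct even when struct pages are not virtually contiguous:

#if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP)
#define nth_page(page, n)	pfn_to_page(page_to_pfn((page)) + (n))
#else
#define nth_page(page, n)	((page) + (n))	/* mem_map is virtually contiguous */
#endif
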
@@ -1506,6 +1521,10 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
 
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        set_page_private(page, 0);
+       /*
+        * We have to set HPageVmemmapOptimized again because the
+        * set_page_private(page, 0) call above cleared it.
+        */
        SetHPageVmemmapOptimized(page);
 
        /*
@@ -1530,7 +1549,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page,
 static void __update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
-       struct page *subpage = page;
+       struct page *subpage;
 
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
@@ -1561,8 +1580,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page)
        if (unlikely(PageHWPoison(page)))
                hugetlb_clear_page_hwpoison(page);
 
-       for (i = 0; i < pages_per_huge_page(h);
-            i++, subpage = mem_map_next(subpage, page, i)) {
+       for (i = 0; i < pages_per_huge_page(h); i++) {
+               subpage = nth_page(page, i);
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
                                1 << PG_referenced | 1 << PG_dirty |
                                1 << PG_active | 1 << PG_private |
@@ -1769,13 +1788,14 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
 {
        int i, j;
        int nr_pages = 1 << order;
-       struct page *p = page + 1;
+       struct page *p;
 
        /* we rely on prep_new_huge_page to set the destructor */
        set_compound_order(page, order);
-       __ClearPageReserved(page);
        __SetPageHead(page);
-       for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
+       for (i = 0; i < nr_pages; i++) {
+               p = nth_page(page, i);
+
                /*
                 * For gigantic hugepages allocated through bootmem at
                 * boot, it's safer to be consistent with the not-gigantic
@@ -1814,22 +1834,26 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order,
                } else {
                        VM_BUG_ON_PAGE(page_count(p), p);
                }
-               set_compound_head(p, page);
+               if (i != 0)
+                       set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
        atomic_set(compound_pincount_ptr(page), 0);
        return true;
 
 out_error:
-       /* undo tail page modifications made above */
-       p = page + 1;
-       for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
-               clear_compound_head(p);
+       /* undo page modifications made above */
+       for (j = 0; j < i; j++) {
+               p = nth_page(page, j);
+               if (j != 0)
+                       clear_compound_head(p);
                set_page_refcounted(p);
        }
        /* need to clear PG_reserved on remaining tail pages  */
-       for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+       for (; j < nr_pages; j++) {
+               p = nth_page(page, j);
                __ClearPageReserved(p);
+       }
        set_compound_order(page, 0);
 #ifdef CONFIG_64BIT
        page[1].compound_nr = 0;
@@ -1918,6 +1942,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
        int order = huge_page_order(h);
        struct page *page;
        bool alloc_try_hard = true;
+       bool retry = true;
 
        /*
         * By default we always try hard to allocate the page with
@@ -1933,7 +1958,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
                gfp_mask |= __GFP_RETRY_MAYFAIL;
        if (nid == NUMA_NO_NODE)
                nid = numa_mem_id();
+retry:
        page = __alloc_pages(gfp_mask, order, nid, nmask);
+
+       /* Freeze head page */
+       if (page && !page_ref_freeze(page, 1)) {
+               __free_pages(page, order);
+               if (retry) {    /* retry once */
+                       retry = false;
+                       goto retry;
+               }
+               /* WOW!  twice in a row. */
+               pr_warn("HugeTLB head page unexpected inflated ref count\n");
+               page = NULL;
+       }
+
        if (page)
                __count_vm_event(HTLB_BUDDY_PGALLOC);
        else
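
The retry above relies on page_ref_freeze(page, 1), which succeeds only if the refcount is exactly 1 and atomically drops it to zero. A minimal userspace sketch of the same freeze-then-retry-once pattern (fake_page, ref_freeze() and alloc_page_model() are made-up stand-ins, not kernel APIs):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_page { atomic_int refcount; };

static bool ref_freeze(struct fake_page *p, int expected)
{
	/* models page_ref_freeze(): atomically replace 'expected' with 0, else fail */
	return atomic_compare_exchange_strong(&p->refcount, &expected, 0);
}

static struct fake_page *alloc_page_model(void)
{
	struct fake_page *p = malloc(sizeof(*p));

	if (p)
		atomic_init(&p->refcount, 1);	/* freshly allocated: one reference */
	return p;
}

static struct fake_page *alloc_frozen_page(void)
{
	bool retry = true;
	struct fake_page *p;

again:
	p = alloc_page_model();
	if (p && !ref_freeze(p, 1)) {
		free(p);			/* a transient extra ref was found */
		if (retry) {
			retry = false;
			goto again;		/* retry exactly once */
		}
		p = NULL;
	}
	return p;
}

int main(void)
{
	struct fake_page *p = alloc_frozen_page();

	printf("frozen page %s\n", p ? "allocated" : "unavailable");
	free(p);
	return 0;
}
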
@@ -1961,6 +2000,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h,
 /*
  * Common helper to allocate a fresh hugetlb page. All specific allocators
  * should use this function to get new hugetlb pages
+ *
+ * Note that returned page is 'frozen':  ref count of head page and all tail
+ * pages is zero.
  */
 static struct page *alloc_fresh_huge_page(struct hstate *h,
                gfp_t gfp_mask, int nid, nodemask_t *nmask,
@@ -2018,7 +2060,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
        if (!page)
                return 0;
 
-       put_page(page); /* free it into the hugepage allocator */
+       free_huge_page(page); /* free it into the hugepage allocator */
 
        return 1;
 }
@@ -2087,7 +2129,7 @@ retry:
        if (!page_count(page)) {
                struct page *head = compound_head(page);
                struct hstate *h = page_hstate(head);
-               if (h->free_huge_pages - h->resv_huge_pages == 0)
+               if (!available_huge_pages(h))
                        goto out;
 
                /*
@@ -2175,10 +2217,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
  * Allocates a fresh surplus page from the page allocator.
  */
 static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
-               int nid, nodemask_t *nmask, bool zero_ref)
+                                               int nid, nodemask_t *nmask)
 {
        struct page *page = NULL;
-       bool retry = false;
 
        if (hstate_is_gigantic(h))
                return NULL;
@@ -2188,7 +2229,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
                goto out_unlock;
        spin_unlock_irq(&hugetlb_lock);
 
-retry:
        page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
        if (!page)
                return NULL;
@@ -2204,34 +2244,10 @@ retry:
        if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                SetHPageTemporary(page);
                spin_unlock_irq(&hugetlb_lock);
-               put_page(page);
+               free_huge_page(page);
                return NULL;
        }
 
-       if (zero_ref) {
-               /*
-                * Caller requires a page with zero ref count.
-                * We will drop ref count here.  If someone else is holding
-                * a ref, the page will be freed when they drop it.  Abuse
-                * temporary page flag to accomplish this.
-                */
-               SetHPageTemporary(page);
-               if (!put_page_testzero(page)) {
-                       /*
-                        * Unexpected inflated ref count on freshly allocated
-                        * huge.  Retry once.
-                        */
-                       pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
-                       spin_unlock_irq(&hugetlb_lock);
-                       if (retry)
-                               return NULL;
-
-                       retry = true;
-                       goto retry;
-               }
-               ClearHPageTemporary(page);
-       }
-
        h->surplus_huge_pages++;
        h->surplus_huge_pages_node[page_to_nid(page)]++;
 
@@ -2253,6 +2269,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
        if (!page)
                return NULL;
 
+       /* fresh huge pages are frozen */
+       set_page_refcounted(page);
+
        /*
         * We do not account these pages as surplus because they are only
         * temporary and will be released properly on the last reference
@@ -2280,14 +2299,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
                gfp_t gfp = gfp_mask | __GFP_NOWARN;
 
                gfp &=  ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
-               page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false);
+               page = alloc_surplus_huge_page(h, gfp, nid, nodemask);
 
                /* Fallback to all nodes if page==NULL */
                nodemask = NULL;
        }
 
        if (!page)
-               page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false);
+               page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask);
        mpol_cond_put(mpol);
        return page;
 }
@@ -2297,7 +2316,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                nodemask_t *nmask, gfp_t gfp_mask)
 {
        spin_lock_irq(&hugetlb_lock);
-       if (h->free_huge_pages - h->resv_huge_pages > 0) {
+       if (available_huge_pages(h)) {
                struct page *page;
 
                page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
@@ -2336,7 +2355,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
 static int gather_surplus_pages(struct hstate *h, long delta)
        __must_hold(&hugetlb_lock)
 {
-       struct list_head surplus_list;
+       LIST_HEAD(surplus_list);
        struct page *page, *tmp;
        int ret;
        long i;
@@ -2351,14 +2370,13 @@ static int gather_surplus_pages(struct hstate *h, long delta)
        }
 
        allocated = 0;
-       INIT_LIST_HEAD(&surplus_list);
 
        ret = -ENOMEM;
 retry:
        spin_unlock_irq(&hugetlb_lock);
        for (i = 0; i < needed; i++) {
                page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
-                               NUMA_NO_NODE, NULL, true);
+                               NUMA_NO_NODE, NULL);
                if (!page) {
                        alloc_ok = false;
                        break;
@@ -2720,7 +2738,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 {
        gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
        int nid = page_to_nid(old_page);
-       bool alloc_retry = false;
        struct page *new_page;
        int ret = 0;
 
@@ -2731,30 +2748,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
         * the pool.  This simplifies and let us do most of the processing
         * under the lock.
         */
-alloc_retry:
        new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
        if (!new_page)
                return -ENOMEM;
-       /*
-        * If all goes well, this page will be directly added to the free
-        * list in the pool.  For this the ref count needs to be zero.
-        * Attempt to drop now, and retry once if needed.  It is VERY
-        * unlikely there is another ref on the page.
-        *
-        * If someone else has a reference to the page, it will be freed
-        * when they drop their ref.  Abuse temporary page flag to accomplish
-        * this.  Retry once if there is an inflated ref count.
-        */
-       SetHPageTemporary(new_page);
-       if (!put_page_testzero(new_page)) {
-               if (alloc_retry)
-                       return -EBUSY;
-
-               alloc_retry = true;
-               goto alloc_retry;
-       }
-       ClearHPageTemporary(new_page);
-
        __prep_new_huge_page(h, new_page);
 
 retry:
@@ -2934,6 +2930,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                }
                spin_lock_irq(&hugetlb_lock);
                list_add(&page->lru, &h->hugepage_activelist);
+               set_page_refcounted(page);
                /* Fall through */
        }
        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
@@ -3038,7 +3035,7 @@ static void __init gather_bootmem_prealloc(void)
                if (prep_compound_gigantic_page(page, huge_page_order(h))) {
                        WARN_ON(PageReserved(page));
                        prep_new_huge_page(h, page, page_to_nid(page));
-                       put_page(page); /* add to the hugepage allocator */
+                       free_huge_page(page); /* add to the hugepage allocator */
                } else {
                        /* VERY unlikely inflated ref count on a tail page */
                        free_gigantic_page(page, huge_page_order(h));
@@ -3070,7 +3067,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid)
                                        &node_states[N_MEMORY], NULL);
                        if (!page)
                                break;
-                       put_page(page); /* free it into the hugepage allocator */
+                       free_huge_page(page); /* free it into the hugepage allocator */
                }
                cond_resched();
        }
@@ -3461,9 +3458,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
                else
                        prep_compound_page(subpage, target_hstate->order);
                set_page_private(subpage, 0);
-               set_page_refcounted(subpage);
                prep_new_huge_page(target_hstate, subpage, nid);
-               put_page(subpage);
+               free_huge_page(subpage);
        }
        mutex_unlock(&target_hstate->resize_lock);
 
@@ -3474,7 +3470,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
         * based on pool changes for the demoted page.
         */
        h->max_huge_pages--;
-       target_hstate->max_huge_pages += pages_per_huge_page(h);
+       target_hstate->max_huge_pages +=
+               pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
 
        return rc;
 }
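
A worked example of the corrected accounting, assuming x86-64 with 4 KB base pages: demoting one 1 GB page (pages_per_huge_page(h) == 262144) into 2 MB pages (pages_per_huge_page(target_hstate) == 512) yields 262144 / 512 = 512 new 2 MB pages, so the target hstate's max_huge_pages grows by 512 rather than by 262144 as the old '+= pages_per_huge_page(h)' did.
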
@@ -3716,7 +3713,7 @@ static ssize_t demote_store(struct kobject *kobj,
        unsigned long nr_available;
        nodemask_t nodes_allowed, *n_mask;
        struct hstate *h;
-       int err = 0;
+       int err;
        int nid;
 
        err = kstrtoul(buf, 10, &nr_demote);
@@ -3767,8 +3764,7 @@ HSTATE_ATTR_WO(demote);
 static ssize_t demote_size_show(struct kobject *kobj,
                                        struct kobj_attribute *attr, char *buf)
 {
-       int nid;
-       struct hstate *h = kobj_to_hstate(kobj, &nid);
+       struct hstate *h = kobj_to_hstate(kobj, NULL);
        unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
 
        return sysfs_emit(buf, "%lukB\n", demote_size);
@@ -3781,7 +3777,6 @@ static ssize_t demote_size_store(struct kobject *kobj,
        struct hstate *h, *demote_hstate;
        unsigned long demote_size;
        unsigned int demote_order;
-       int nid;
 
        demote_size = (unsigned long)memparse(buf, NULL);
 
@@ -3793,7 +3788,7 @@ static ssize_t demote_size_store(struct kobject *kobj,
                return -EINVAL;
 
        /* demote order must be smaller than hstate order */
-       h = kobj_to_hstate(kobj, &nid);
+       h = kobj_to_hstate(kobj, NULL);
        if (demote_order >= h->order)
                return -EINVAL;
 
@@ -3847,35 +3842,26 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
        if (retval) {
                kobject_put(hstate_kobjs[hi]);
                hstate_kobjs[hi] = NULL;
+               return retval;
        }
 
        if (h->demote_order) {
-               if (sysfs_create_group(hstate_kobjs[hi],
-                                       &hstate_demote_attr_group))
+               retval = sysfs_create_group(hstate_kobjs[hi],
+                                           &hstate_demote_attr_group);
+               if (retval) {
                        pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
+                       sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
+                       kobject_put(hstate_kobjs[hi]);
+                       hstate_kobjs[hi] = NULL;
+                       return retval;
+               }
        }
 
-       return retval;
-}
-
-static void __init hugetlb_sysfs_init(void)
-{
-       struct hstate *h;
-       int err;
-
-       hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
-       if (!hugepages_kobj)
-               return;
-
-       for_each_hstate(h) {
-               err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
-                                        hstate_kobjs, &hstate_attr_group);
-               if (err)
-                       pr_err("HugeTLB: Unable to add hstate %s", h->name);
-       }
+       return 0;
 }
 
 #ifdef CONFIG_NUMA
+static bool hugetlb_sysfs_initialized __ro_after_init;
 
 /*
  * node_hstate/s - associate per node hstate attributes, via their kobjects,
@@ -3931,7 +3917,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
  * Unregister hstate attributes from a single node device.
  * No-op if no hstate attributes attached.
  */
-static void hugetlb_unregister_node(struct node *node)
+void hugetlb_unregister_node(struct node *node)
 {
        struct hstate *h;
        struct node_hstate *nhs = &node_hstates[node->dev.id];
@@ -3941,10 +3927,15 @@ static void hugetlb_unregister_node(struct node *node)
 
        for_each_hstate(h) {
                int idx = hstate_index(h);
-               if (nhs->hstate_kobjs[idx]) {
-                       kobject_put(nhs->hstate_kobjs[idx]);
-                       nhs->hstate_kobjs[idx] = NULL;
-               }
+               struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
+
+               if (!hstate_kobj)
+                       continue;
+               if (h->demote_order)
+                       sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
+               sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
+               kobject_put(hstate_kobj);
+               nhs->hstate_kobjs[idx] = NULL;
        }
 
        kobject_put(nhs->hugepages_kobj);
@@ -3956,12 +3947,15 @@ static void hugetlb_unregister_node(struct node *node)
  * Register hstate attributes for a single node device.
  * No-op if attributes already registered.
  */
-static void hugetlb_register_node(struct node *node)
+void hugetlb_register_node(struct node *node)
 {
        struct hstate *h;
        struct node_hstate *nhs = &node_hstates[node->dev.id];
        int err;
 
+       if (!hugetlb_sysfs_initialized)
+               return;
+
        if (nhs->hugepages_kobj)
                return;         /* already allocated */
 
@@ -3992,18 +3986,8 @@ static void __init hugetlb_register_all_nodes(void)
 {
        int nid;
 
-       for_each_node_state(nid, N_MEMORY) {
-               struct node *node = node_devices[nid];
-               if (node->dev.id == nid)
-                       hugetlb_register_node(node);
-       }
-
-       /*
-        * Let the node device driver know we're here so it can
-        * [un]register hstate attributes on node hotplug.
-        */
-       register_hugetlbfs_with_node(hugetlb_register_node,
-                                    hugetlb_unregister_node);
+       for_each_online_node(nid)
+               hugetlb_register_node(node_devices[nid]);
 }
 #else  /* !CONFIG_NUMA */
 
@@ -4019,6 +4003,36 @@ static void hugetlb_register_all_nodes(void) { }
 
 #endif
 
+#ifdef CONFIG_CMA
+static void __init hugetlb_cma_check(void);
+#else
+static inline __init void hugetlb_cma_check(void)
+{
+}
+#endif
+
+static void __init hugetlb_sysfs_init(void)
+{
+       struct hstate *h;
+       int err;
+
+       hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
+       if (!hugepages_kobj)
+               return;
+
+       for_each_hstate(h) {
+               err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
+                                        hstate_kobjs, &hstate_attr_group);
+               if (err)
+                       pr_err("HugeTLB: Unable to add hstate %s", h->name);
+       }
+
+#ifdef CONFIG_NUMA
+       hugetlb_sysfs_initialized = true;
+#endif
+       hugetlb_register_all_nodes();
+}
+
 static int __init hugetlb_init(void)
 {
        int i;
@@ -4073,7 +4087,6 @@ static int __init hugetlb_init(void)
        report_hugepages();
 
        hugetlb_sysfs_init();
-       hugetlb_register_all_nodes();
        hugetlb_cgroup_file_init();
 
 #ifdef CONFIG_SMP
@@ -4118,7 +4131,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_alloc = first_memory_node;
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
-                                       huge_page_size(h)/1024);
+                                       huge_page_size(h)/SZ_1K);
 
        parsed_hstate = h;
 }
@@ -4133,11 +4146,11 @@ static void __init hugepages_clear_pages_in_node(void)
        if (!hugetlb_max_hstate) {
                default_hstate_max_huge_pages = 0;
                memset(default_hugepages_in_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(default_hugepages_in_node));
        } else {
                parsed_hstate->max_huge_pages = 0;
                memset(parsed_hstate->max_huge_pages_node, 0,
-                       MAX_NUMNODES * sizeof(unsigned int));
+                       sizeof(parsed_hstate->max_huge_pages_node));
        }
 }
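
The memset() changes above rely on default_hugepages_in_node and max_huge_pages_node being true arrays, so sizeof() yields the full byte size and stays correct if the element type or node count ever changes. A standalone userspace illustration (the MAX_NUMNODES value here is arbitrary, not the kernel's):

#include <stdio.h>
#include <string.h>

#define MAX_NUMNODES 64			/* arbitrary illustrative value */

int main(void)
{
	unsigned int per_node[MAX_NUMNODES];

	/* For a true array, sizeof(per_node) == MAX_NUMNODES * sizeof(unsigned int). */
	memset(per_node, 0, sizeof(per_node));
	printf("cleared %zu bytes\n", sizeof(per_node));
	return 0;
}
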
 
@@ -4332,18 +4345,34 @@ static int __init default_hugepagesz_setup(char *s)
 }
 __setup("default_hugepagesz=", default_hugepagesz_setup);
 
+static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
+{
+#ifdef CONFIG_NUMA
+       struct mempolicy *mpol = get_task_policy(current);
+
+       /*
+        * Only enforce the MPOL_BIND policy which overlaps with cpuset policy
+        * (from policy_nodemask) specifically for the hugetlb case
+        */
+       if (mpol->mode == MPOL_BIND &&
+               (apply_policy_zone(mpol, gfp_zone(gfp)) &&
+                cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
+               return &mpol->nodes;
+#endif
+       return NULL;
+}
+
 static unsigned int allowed_mems_nr(struct hstate *h)
 {
        int node;
        unsigned int nr = 0;
-       nodemask_t *mpol_allowed;
+       nodemask_t *mbind_nodemask;
        unsigned int *array = h->free_huge_pages_node;
        gfp_t gfp_mask = htlb_alloc_mask(h);
 
-       mpol_allowed = policy_nodemask_current(gfp_mask);
-
+       mbind_nodemask = policy_mbind_nodemask(gfp_mask);
        for_each_node_mask(node, cpuset_current_mems_allowed) {
-               if (!mpol_allowed || node_isset(node, *mpol_allowed))
+               if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
                        nr += array[node];
        }
 
@@ -4583,16 +4612,28 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma)
                resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
                kref_get(&resv->refs);
        }
+
+       /*
+        * The vma_lock structure for sharable mappings is vma specific.
+        * Clear the old pointer (from vm_area_dup) and create a new one.
+        */
+       if (vma->vm_flags & VM_MAYSHARE) {
+               vma->vm_private_data = NULL;
+               hugetlb_vma_lock_alloc(vma);
+       }
 }
 
 static void hugetlb_vm_op_close(struct vm_area_struct *vma)
 {
        struct hstate *h = hstate_vma(vma);
-       struct resv_map *resv = vma_resv_map(vma);
+       struct resv_map *resv;
        struct hugepage_subpool *spool = subpool_vma(vma);
        unsigned long reserve, start, end;
        long gbl_reserve;
 
+       hugetlb_vma_lock_free(vma);
+
+       resv = vma_resv_map(vma);
        if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                return;
 
@@ -4723,14 +4764,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *dst_vma,
                            struct vm_area_struct *src_vma)
 {
-       pte_t *src_pte, *dst_pte, entry, dst_entry;
+       pte_t *src_pte, *dst_pte, entry;
        struct page *ptepage;
        unsigned long addr;
        bool cow = is_cow_mapping(src_vma->vm_flags);
        struct hstate *h = hstate_vma(src_vma);
        unsigned long sz = huge_page_size(h);
        unsigned long npages = pages_per_huge_page(h);
-       struct address_space *mapping = src_vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        unsigned long last_addr_mask;
        int ret = 0;
@@ -4744,12 +4784,12 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                raw_write_seqcount_begin(&src->write_protect_seq);
        } else {
                /*
-                * For shared mappings i_mmap_rwsem must be held to call
-                * huge_pte_alloc, otherwise the returned ptep could go
-                * away if part of a shared pmd and another thread calls
-                * huge_pmd_unshare.
+                * For shared mappings the vma lock must be held before
+                * calling huge_pte_offset in the src vma. Otherwise, the
+                * returned ptep could go away if part of a shared pmd and
+                * another thread calls huge_pmd_unshare.
                 */
-               i_mmap_lock_read(mapping);
+               hugetlb_vma_lock_read(src_vma);
        }
 
        last_addr_mask = hugetlb_mask_last_page(h);
@@ -4768,15 +4808,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
                /*
                 * If the pagetables are shared don't copy or take references.
-                * dst_pte == src_pte is the common case of src/dest sharing.
                 *
+                * dst_pte == src_pte is the common case of src/dest sharing.
                 * However, src could have 'unshared' and dst shares with
-                * another vma.  If dst_pte !none, this implies sharing.
-                * Check here before taking page table lock, and once again
-                * after taking the lock below.
+                * another vma.  So the page_count of the ptep page is checked
+                * instead to reliably determine whether the pte is shared.
                 */
-               dst_entry = huge_ptep_get(dst_pte);
-               if ((dst_pte == src_pte) || !huge_pte_none(dst_entry)) {
+               if (page_count(virt_to_page(dst_pte)) > 1) {
                        addr |= last_addr_mask;
                        continue;
                }
@@ -4785,13 +4823,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                src_ptl = huge_pte_lockptr(h, src, src_pte);
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
-               dst_entry = huge_ptep_get(dst_pte);
 again:
-               if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
+               if (huge_pte_none(entry)) {
                        /*
-                        * Skip if src entry none.  Also, skip in the
-                        * unlikely case dst entry !none as this implies
-                        * sharing with another vma.
+                        * Skip if src entry none.
                         */
                        ;
                } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
@@ -4870,7 +4905,7 @@ again:
                                        restore_reserve_on_error(h, dst_vma, addr,
                                                                new);
                                        put_page(new);
-                                       /* dst_entry won't change as in child */
+                                       /* huge_ptep of dst_pte won't change as in child */
                                        goto again;
                                }
                                hugetlb_install_page(dst_vma, dst_pte, addr, new);
@@ -4902,7 +4937,7 @@ again:
                raw_write_seqcount_end(&src->write_protect_seq);
                mmu_notifier_invalidate_range_end(&range);
        } else {
-               i_mmap_unlock_read(mapping);
+               hugetlb_vma_unlock_read(src_vma);
        }
 
        return ret;
@@ -4961,6 +4996,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
        mmu_notifier_invalidate_range_start(&range);
        last_addr_mask = hugetlb_mask_last_page(h);
        /* Prevent race with file truncation */
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(mapping);
        for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
                src_pte = huge_pte_offset(mm, old_addr, sz);
@@ -4992,6 +5028,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
                flush_tlb_range(vma, old_end - len, old_end);
        mmu_notifier_invalidate_range_end(&range);
        i_mmap_unlock_write(mapping);
+       hugetlb_vma_unlock_write(vma);
 
        return len + old_addr - old_end;
 }
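
The locking order used here is repeated at the other write-side call sites in this patch (hugetlb_change_protection(), hugetlb_unshare_all_pmds()): take the new per-vma lock first, then i_mmap_rwsem, and release in reverse; __unmap_hugepage_range_final() is the deliberate exception, freeing the vma lock while still holding i_mmap_rwsem. Shown schematically with hypothetical wrappers that are not part of the patch:

/* Hypothetical helpers (not in this patch) restating the ordering above. */
static void hugetlb_write_lock_both(struct vm_area_struct *vma)
{
	hugetlb_vma_lock_write(vma);			/* per-vma lock first */
	i_mmap_lock_write(vma->vm_file->f_mapping);	/* then i_mmap_rwsem */
}

static void hugetlb_write_unlock_both(struct vm_area_struct *vma)
{
	i_mmap_unlock_write(vma->vm_file->f_mapping);	/* release in reverse order */
	hugetlb_vma_unlock_write(vma);
}
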
@@ -5139,19 +5176,22 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb,
                          unsigned long end, struct page *ref_page,
                          zap_flags_t zap_flags)
 {
+       hugetlb_vma_lock_write(vma);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+
        __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags);
 
        /*
-        * Clear this flag so that x86's huge_pmd_share page_table_shareable
-        * test will fail on a vma being torn down, and not grab a page table
-        * on its way out.  We're lucky that the flag has such an appropriate
-        * name, and can in fact be safely cleared here. We could clear it
-        * before the __unmap_hugepage_range above, but all that's necessary
-        * is to clear it before releasing the i_mmap_rwsem. This works
-        * because in the context this is called, the VMA is about to be
-        * destroyed and the i_mmap_rwsem is held.
+        * Unlock and free the vma lock before releasing i_mmap_rwsem.  When
+        * the vma_lock is freed, this makes the vma ineligible for pmd
+        * sharing.  And, i_mmap_rwsem is required to set up pmd sharing.
+        * This is important as page tables for this unmapped range will
+        * be asynchronously deleted.  If the page tables are shared, there
+        * will be issues when they are accessed by someone else.
         */
-       vma->vm_flags &= ~VM_MAYSHARE;
+       __hugetlb_vma_unlock_write_free(vma);
+
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
 }
 
 void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
@@ -5316,11 +5356,10 @@ retry_avoidcopy:
                        u32 hash;
 
                        put_page(old_page);
-                       BUG_ON(huge_pte_none(pte));
                        /*
-                        * Drop hugetlb_fault_mutex and i_mmap_rwsem before
-                        * unmapping.  unmapping needs to hold i_mmap_rwsem
-                        * in write mode.  Dropping i_mmap_rwsem in read mode
+                        * Drop hugetlb_fault_mutex and vma_lock before
+                        * unmapping.  unmapping needs to hold vma_lock
+                        * in write mode.  Dropping vma_lock in read mode
                         * here is OK as COW mappings do not interact with
                         * PMD sharing.
                         *
@@ -5328,13 +5367,13 @@ retry_avoidcopy:
                         */
                        idx = vma_hugecache_offset(h, vma, haddr);
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
+                       hugetlb_vma_unlock_read(vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
 
                        unmap_ref_private(mm, vma, old_page, haddr);
 
-                       i_mmap_lock_read(mapping);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       hugetlb_vma_lock_read(vma);
                        spin_lock(ptl);
                        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
                        if (likely(ptep &&
@@ -5408,19 +5447,6 @@ out_release_old:
        return ret;
 }
 
-/* Return the pagecache page at a given address within a VMA */
-static struct page *hugetlbfs_pagecache_page(struct hstate *h,
-                       struct vm_area_struct *vma, unsigned long address)
-{
-       struct address_space *mapping;
-       pgoff_t idx;
-
-       mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, address);
-
-       return find_lock_page(mapping, idx);
-}
-
 /*
  * Return whether there is a pagecache page to back given address within VMA.
  * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
@@ -5441,7 +5467,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h,
        return page != NULL;
 }
 
-int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
+int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping,
                           pgoff_t idx)
 {
        struct folio *folio = page_folio(page);
@@ -5478,7 +5504,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
                                                  unsigned long addr,
                                                  unsigned long reason)
 {
-       vm_fault_t ret;
        u32 hash;
        struct vm_fault vmf = {
                .vma = vma,
@@ -5496,18 +5521,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        };
 
        /*
-        * hugetlb_fault_mutex and i_mmap_rwsem must be
-        * dropped before handling userfault.  Reacquire
-        * after handling fault to make calling code simpler.
+        * vma_lock and hugetlb_fault_mutex must be dropped before handling
+        * userfault.  Also, mmap_lock could be dropped while handling the
+        * userfault, so be careful with any vma operation from here on.
         */
+       hugetlb_vma_unlock_read(vma);
        hash = hugetlb_fault_mutex_hash(mapping, idx);
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-       i_mmap_unlock_read(mapping);
-       ret = handle_userfault(&vmf, reason);
-       i_mmap_lock_read(mapping);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
-       return ret;
+       return handle_userfault(&vmf, reason);
 }
 
 static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
@@ -5525,6 +5546,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        spinlock_t *ptl;
        unsigned long haddr = address & huge_page_mask(h);
        bool new_page, new_pagecache_page = false;
+       u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -5535,29 +5557,24 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
                pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
                           current->pid);
-               return ret;
+               goto out;
        }
 
        /*
-        * We can not race with truncation due to holding i_mmap_rwsem.
-        * i_size is modified when holding i_mmap_rwsem, so check here
-        * once for faults beyond end of file.
+        * Use page lock to guard against racing truncation
+        * before we get page_table_lock.
         */
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto out;
-
-retry:
        new_page = false;
        page = find_lock_page(mapping, idx);
        if (!page) {
+               size = i_size_read(mapping->host) >> huge_page_shift(h);
+               if (idx >= size)
+                       goto out;
                /* Check for page in userfault range */
-               if (userfaultfd_missing(vma)) {
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+               if (userfaultfd_missing(vma))
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr, address,
                                                       VM_UFFD_MISSING);
-                       goto out;
-               }
 
                page = alloc_huge_page(vma, haddr, 0);
                if (IS_ERR(page)) {
@@ -5585,11 +5602,17 @@ retry:
                new_page = true;
 
                if (vma->vm_flags & VM_MAYSHARE) {
-                       int err = huge_add_to_page_cache(page, mapping, idx);
+                       int err = hugetlb_add_to_page_cache(page, mapping, idx);
                        if (err) {
+                               /*
+                                * err can't be -EEXIST (which would imply that
+                                * someone else consumed the reservation) because
+                                * the hugetlb fault mutex is held while adding a
+                                * hugetlb page to the page cache.  So it is safe
+                                * to call restore_reserve_on_error() here.
+                                */
+                               restore_reserve_on_error(h, vma, haddr, page);
                                put_page(page);
-                               if (err == -EEXIST)
-                                       goto retry;
                                goto out;
                        }
                        new_pagecache_page = true;
@@ -5617,10 +5640,9 @@ retry:
                if (userfaultfd_minor(vma)) {
                        unlock_page(page);
                        put_page(page);
-                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                       return hugetlb_handle_userfault(vma, mapping, idx,
                                                       flags, haddr, address,
                                                       VM_UFFD_MINOR);
-                       goto out;
                }
        }
 
@@ -5678,15 +5700,17 @@ retry:
 
        unlock_page(page);
 out:
+       hugetlb_vma_unlock_read(vma);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
        return ret;
 
 backout:
        spin_unlock(ptl);
 backout_unlocked:
-       unlock_page(page);
-       /* restore reserve for newly allocated pages not in page cache */
        if (new_page && !new_pagecache_page)
                restore_reserve_on_error(h, vma, haddr, page);
+
+       unlock_page(page);
        put_page(page);
        goto out;
 }
@@ -5747,40 +5771,41 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
 
        /*
-        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
-        * until finished with ptep.  This serves two purposes:
-        * 1) It prevents huge_pmd_unshare from being called elsewhere
-        *    and making the ptep no longer valid.
-        * 2) It synchronizes us with i_size modifications during truncation.
+        * Serialize hugepage allocation and instantiation, so that we don't
+        * get spurious allocation failures if two CPUs race to instantiate
+        * the same page in the page cache.
+        */
+       mapping = vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, vma, haddr);
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       /*
+        * Acquire vma lock before calling huge_pte_alloc and hold
+        * until finished with ptep.  This prevents huge_pmd_unshare from
+        * being called elsewhere and making the ptep no longer valid.
         *
         * ptep could have already be assigned via huge_pte_offset.  That
         * is OK, as huge_pte_alloc will return the same value unless
         * something has changed.
         */
-       mapping = vma->vm_file->f_mapping;
-       i_mmap_lock_read(mapping);
+       hugetlb_vma_lock_read(vma);
        ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
        if (!ptep) {
-               i_mmap_unlock_read(mapping);
+               hugetlb_vma_unlock_read(vma);
+               mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                return VM_FAULT_OOM;
        }
 
-       /*
-        * Serialize hugepage allocation and instantiation, so that we don't
-        * get spurious allocation failures if two CPUs race to instantiate
-        * the same page in the page cache.
-        */
-       idx = vma_hugecache_offset(h, vma, haddr);
-       hash = hugetlb_fault_mutex_hash(mapping, idx);
-       mutex_lock(&hugetlb_fault_mutex_table[hash]);
-
        entry = huge_ptep_get(ptep);
        /* PTE markers should be handled the same way as none pte */
-       if (huge_pte_none_mostly(entry)) {
-               ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
+       if (huge_pte_none_mostly(entry))
+               /*
+                * hugetlb_no_page() drops the vma lock and the hugetlb fault
+                * mutex internally, so return its result directly.
+                */
+               return hugetlb_no_page(mm, vma, mapping, idx, address, ptep,
                                      entry, flags);
-               goto out_mutex;
-       }
 
        ret = 0;
 
@@ -5810,7 +5835,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                /* Just decrements count, does not deallocate */
                vma_end_reservation(h, vma, haddr);
 
-               pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr);
+               pagecache_page = find_lock_page(mapping, idx);
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
@@ -5834,8 +5859,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        unlock_page(pagecache_page);
                        put_page(pagecache_page);
                }
+               hugetlb_vma_unlock_read(vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-               i_mmap_unlock_read(mapping);
                return handle_userfault(&vmf, VM_UFFD_WP);
        }
 
@@ -5878,8 +5903,8 @@ out_ptl:
                put_page(pagecache_page);
        }
 out_mutex:
+       hugetlb_vma_unlock_read(vma);
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-       i_mmap_unlock_read(mapping);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
@@ -6007,39 +6032,24 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 
                /*
                 * Serialization between remove_inode_hugepages() and
-                * huge_add_to_page_cache() below happens through the
+                * hugetlb_add_to_page_cache() below happens through the
                 * hugetlb_fault_mutex_table that here must be hold by
                 * the caller.
                 */
-               ret = huge_add_to_page_cache(page, mapping, idx);
+               ret = hugetlb_add_to_page_cache(page, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
                page_in_pagecache = true;
        }
 
-       ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(h, dst_mm, dst_pte);
 
-       /*
-        * Recheck the i_size after holding PT lock to make sure not
-        * to leave any page mapped (as page_mapped()) beyond the end
-        * of the i_size (remove_inode_hugepages() is strict about
-        * enforcing that). If we bail out here, we'll also leave a
-        * page in the radix tree in the vm_shared case beyond the end
-        * of the i_size, but remove_inode_hugepages() will take care
-        * of it as soon as we drop the hugetlb_fault_mutex_table.
-        */
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       ret = -EFAULT;
-       if (idx >= size)
-               goto out_release_unlock;
-
-       ret = -EEXIST;
        /*
         * We allow to overwrite a pte marker: consider when both MISSING|WP
         * registered, we firstly wr-protect a none pte which has no page cache
         * page backing it, then access the page.
         */
+       ret = -EEXIST;
        if (!huge_pte_none_mostly(huge_ptep_get(dst_pte)))
                goto out_release_unlock;
 
@@ -6107,7 +6117,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
 
        for (nr = 0; nr < refs; nr++) {
                if (likely(pages))
-                       pages[nr] = mem_map_offset(page, nr);
+                       pages[nr] = nth_page(page, nr);
                if (vmas)
                        vmas[nr] = vma;
        }
@@ -6271,7 +6281,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                    (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
                if (pages || vmas)
-                       record_subpages_vmas(mem_map_offset(page, pfn_offset),
+                       record_subpages_vmas(nth_page(page, pfn_offset),
                                             vma, refs,
                                             likely(pages) ? pages + i : NULL,
                                             vmas ? vmas + i : NULL);
@@ -6342,8 +6352,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
        flush_cache_range(vma, range.start, range.end);
 
        mmu_notifier_invalidate_range_start(&range);
-       last_addr_mask = hugetlb_mask_last_page(h);
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
+       last_addr_mask = hugetlb_mask_last_page(h);
        for (; address < end; address += psize) {
                spinlock_t *ptl;
                ptep = huge_pte_offset(mm, address, psize);
@@ -6442,6 +6453,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
         * See Documentation/mm/mmu_notifier.rst
         */
        i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
        mmu_notifier_invalidate_range_end(&range);
 
        return pages << h->order;
@@ -6466,6 +6478,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
                return false;
        }
 
+       /*
+        * vma specific semaphore used for pmd sharing synchronization
+        */
+       hugetlb_vma_lock_alloc(vma);
+
        /*
         * Only apply hugepage reservation if asked. At fault time, an
         * attempt will be made for VM_NORESERVE to allocate a page
@@ -6489,12 +6506,11 @@ bool hugetlb_reserve_pages(struct inode *inode,
                resv_map = inode_resv_map(inode);
 
                chg = region_chg(resv_map, from, to, &regions_needed);
-
        } else {
                /* Private mapping. */
                resv_map = resv_map_alloc();
                if (!resv_map)
-                       return false;
+                       goto out_err;
 
                chg = to - from;
 
@@ -6589,6 +6605,7 @@ out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
                                            chg * pages_per_huge_page(h), h_cg);
 out_err:
+       hugetlb_vma_lock_free(vma);
        if (!vma || vma->vm_flags & VM_MAYSHARE)
                /* Only call region_abort if the region_chg succeeded but the
                 * region_add failed or didn't run.
@@ -6658,35 +6675,37 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma,
        /*
         * match the virtual addresses, permission and the alignment of the
         * page table page.
+        *
+        * Also, vma_lock (vm_private_data) is required for sharing.
         */
        if (pmd_index(addr) != pmd_index(saddr) ||
            vm_flags != svm_flags ||
-           !range_in_vma(svma, sbase, s_end))
+           !range_in_vma(svma, sbase, s_end) ||
+           !svma->vm_private_data)
                return 0;
 
        return saddr;
 }
 
-static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
-{
-       unsigned long base = addr & PUD_MASK;
-       unsigned long end = base + PUD_SIZE;
-
-       /*
-        * check on proper vm_flags and page table alignment
-        */
-       if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end))
-               return true;
-       return false;
-}
-
 bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
 {
+       unsigned long start = addr & PUD_MASK;
+       unsigned long end = start + PUD_SIZE;
+
 #ifdef CONFIG_USERFAULTFD
        if (uffd_disable_huge_pmd_share(vma))
                return false;
 #endif
-       return vma_shareable(vma, addr);
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return false;
+       if (!vma->vm_private_data)      /* vma lock required for sharing */
+               return false;
+       if (!range_in_vma(vma, start, end))
+               return false;
+       return true;
 }
 
 /*
@@ -6716,16 +6735,157 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                *end = ALIGN(*end, PUD_SIZE);
 }
 
+static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma)
+{
+       return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) &&
+               vma->vm_private_data;
+}
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_read(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_write(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               up_write(&vma_lock->rw_sema);
+       }
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+       if (!__vma_shareable_flags_pmd(vma))
+               return 1;
+
+       return down_write_trylock(&vma_lock->rw_sema);
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               lockdep_assert_held(&vma_lock->rw_sema);
+       }
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+       struct hugetlb_vma_lock *vma_lock = container_of(kref,
+                       struct hugetlb_vma_lock, refs);
+
+       kfree(vma_lock);
+}
+
+void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock)
+{
+       struct vm_area_struct *vma = vma_lock->vma;
+
+       /*
+        * The vma_lock structure may or may not be released as a result of
+        * the put; it certainly will no longer be attached to the vma, so
+        * clear the pointer.  The semaphore synchronizes access to the
+        * vma_lock->vma field.
+        */
+       vma_lock->vma = NULL;
+       vma->vm_private_data = NULL;
+       up_write(&vma_lock->rw_sema);
+       kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+       if (__vma_shareable_flags_pmd(vma)) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               __hugetlb_vma_unlock_write_put(vma_lock);
+       }
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+       /*
+        * Only present in sharable vmas.
+        */
+       if (!vma || !__vma_shareable_flags_pmd(vma))
+               return;
+
+       if (vma->vm_private_data) {
+               struct hugetlb_vma_lock *vma_lock = vma->vm_private_data;
+
+               down_write(&vma_lock->rw_sema);
+               __hugetlb_vma_unlock_write_put(vma_lock);
+       }
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+       struct hugetlb_vma_lock *vma_lock;
+
+       /* Only establish the lock in sharable (VM_MAYSHARE) vmas */
+       if (!vma || !(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       /* Should never get here with non-NULL vm_private_data */
+       if (vma->vm_private_data)
+               return;
+
+       vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
+       if (!vma_lock) {
+               /*
+                * If we cannot allocate the structure, then the vma cannot
+                * participate in pmd sharing.  Losing that is only a possible
+                * performance enhancement and memory saving.
+                * However, the lock is also used to synchronize page
+                * faults with truncation.  If the lock is not present,
+                * unlikely races could leave pages in a file past i_size
+                * until the file is removed.  Warn in the unlikely case of
+                * allocation failure.
+                */
+               pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
+               return;
+       }
+
+       kref_init(&vma_lock->refs);
+       init_rwsem(&vma_lock->rw_sema);
+       vma_lock->vma = vma;
+       vma->vm_private_data = vma_lock;
+}
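
The kref on hugetlb_vma_lock lets the structure outlive the vma's own reference: __hugetlb_vma_unlock_write_put() detaches it under the write lock, drops the lock, and only then drops the vma's reference, so any other holder can still release its reference safely. A minimal userspace model of that lifetime rule (illustrative only; the types and helpers below are made up, not kernel APIs):

#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct vma_lock_model {
	atomic_int refs;
	pthread_rwlock_t rw_sema;
	void *vma;			/* back-pointer, cleared on detach */
};

static struct vma_lock_model *vma_lock_model_alloc(void *vma)
{
	struct vma_lock_model *vl = malloc(sizeof(*vl));

	if (!vl)
		return NULL;		/* the vma simply won't participate in sharing */
	atomic_init(&vl->refs, 1);
	pthread_rwlock_init(&vl->rw_sema, NULL);
	vl->vma = vma;
	return vl;
}

static void vma_lock_model_put(struct vma_lock_model *vl)
{
	if (atomic_fetch_sub(&vl->refs, 1) == 1) {
		pthread_rwlock_destroy(&vl->rw_sema);
		free(vl);
	}
}

/* Mirrors __hugetlb_vma_unlock_write_put(): detach while holding the write
 * lock, release the lock, then drop the reference.  (The kernel also clears
 * vma->vm_private_data here, which this model has no vma struct to show.) */
static void vma_lock_model_unlock_write_and_detach(struct vma_lock_model *vl)
{
	vl->vma = NULL;
	pthread_rwlock_unlock(&vl->rw_sema);
	vma_lock_model_put(vl);
}

int main(void)
{
	int dummy_vma;
	struct vma_lock_model *vl = vma_lock_model_alloc(&dummy_vma);

	if (vl) {
		pthread_rwlock_wrlock(&vl->rw_sema);
		vma_lock_model_unlock_write_and_detach(vl);
	}
	return 0;
}
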
+
 /*
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner.
- *
- * This routine must be called with i_mmap_rwsem held in at least read mode if
- * sharing is possible.  For hugetlbfs, this prevents removal of any page
- * table entries associated with the address space.  This is important as we
- * are setting up sharing based on existing page table entries (mappings).
+ * code much cleaner. pmd allocation is essential for the shared case because
+ * pud has to be populated inside the same i_mmap_rwsem section - otherwise
+ * racing tasks could either miss the sharing (see huge_pte_offset) or select a
+ * bad pmd for sharing.
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
@@ -6739,7 +6899,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t *pte;
        spinlock_t *ptl;
 
-       i_mmap_assert_locked(mapping);
+       i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;
@@ -6769,6 +6929,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
        spin_unlock(ptl);
 out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
+       i_mmap_unlock_read(mapping);
        return pte;
 }
 
@@ -6779,7 +6940,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * Called with page table lock held and i_mmap_rwsem held in write mode.
+ * Called with page table lock held.
  *
  * returns: 1 successfully unmapped a shared pte page
  *         0 the underlying pte page is not shared, or it is the last user
@@ -6792,6 +6953,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
        pud_t *pud = pud_offset(p4d, addr);
 
        i_mmap_assert_write_locked(vma->vm_file->f_mapping);
+       hugetlb_vma_assert_locked(vma);
        BUG_ON(page_count(virt_to_page(ptep)) == 0);
        if (page_count(virt_to_page(ptep)) == 1)
                return 0;
@@ -6803,6 +6965,48 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
 }
 
 #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+
+void hugetlb_vma_lock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_read(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_write(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_unlock_write(struct vm_area_struct *vma)
+{
+}
+
+int hugetlb_vma_trylock_write(struct vm_area_struct *vma)
+{
+       return 1;
+}
+
+void hugetlb_vma_assert_locked(struct vm_area_struct *vma)
+{
+}
+
+void hugetlb_vma_lock_release(struct kref *kref)
+{
+}
+
+static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
+{
+}
+
+static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+{
+}
+
 pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
                      unsigned long addr, pud_t *pud)
 {
@@ -7173,6 +7377,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
                                start, end);
        mmu_notifier_invalidate_range_start(&range);
+       hugetlb_vma_lock_write(vma);
        i_mmap_lock_write(vma->vm_file->f_mapping);
        for (address = start; address < end; address += PUD_SIZE) {
                ptep = huge_pte_offset(mm, address, sz);
@@ -7184,6 +7389,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
        }
        flush_hugetlb_tlb_range(vma, start, end);
        i_mmap_unlock_write(vma->vm_file->f_mapping);
+       hugetlb_vma_unlock_write(vma);
        /*
         * No need to call mmu_notifier_invalidate_range(), see
         * Documentation/mm/mmu_notifier.rst.
@@ -7334,7 +7540,7 @@ void __init hugetlb_cma_reserve(int order)
                hugetlb_cma_size = 0;
 }
 
-void __init hugetlb_cma_check(void)
+static void __init hugetlb_cma_check(void)
 {
        if (!hugetlb_cma_size || cma_reserve_called)
                return;