Merge branch 'urgent.2021.05.20a' of git://git.kernel.org/pub/scm/linux/kernel/git...

[linux-2.6-microblaze.git] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 6c72433..95918f4 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -39,7 +39,6 @@
  #include <linux/hugetlb.h>
  #include <linux/hugetlb_cgroup.h>
  #include <linux/node.h>
-#include <linux/userfaultfd_k.h>
  #include <linux/page_owner.h>
  #include "internal.h"
  
@@ -94,9 +93,10 @@ static inline bool subpool_is_free(struct hugepage_subpool *spool)
         return true;
  }
  
-static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
+static inline void unlock_or_release_subpool(struct hugepage_subpool *spool,
+                                               unsigned long irq_flags)
  {
-       spin_unlock(&spool->lock);
+       spin_unlock_irqrestore(&spool->lock, irq_flags);
  
         /* If no pages are used, and no other handles to the subpool
          * remain, give up any reservations based on minimum size and
@@ -135,10 +135,12 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
  
  void hugepage_put_subpool(struct hugepage_subpool *spool)
  {
-       spin_lock(&spool->lock);
+       unsigned long flags;
+
+       spin_lock_irqsave(&spool->lock, flags);
         BUG_ON(!spool->count);
         spool->count--;
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
  }
  
  /*
@@ -157,7 +159,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
         if (!spool)
                 return ret;
  
-       spin_lock(&spool->lock);
+       spin_lock_irq(&spool->lock);
  
         if (spool->max_hpages != -1) {          /* maximum size accounting */
                 if ((spool->used_hpages + delta) <= spool->max_hpages)
@@ -184,7 +186,7 @@ static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
         }
  
  unlock_ret:
-       spin_unlock(&spool->lock);
+       spin_unlock_irq(&spool->lock);
         return ret;
  }
  
@@ -198,11 +200,12 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
                                        long delta)
  {
         long ret = delta;
+       unsigned long flags;
  
         if (!spool)
                 return delta;
  
-       spin_lock(&spool->lock);
+       spin_lock_irqsave(&spool->lock, flags);
  
         if (spool->max_hpages != -1)            /* maximum size accounting */
                 spool->used_hpages -= delta;
@@ -223,7 +226,7 @@ static long hugepage_subpool_put_pages(struct hugepage_subpool *spool,
          * If hugetlbfs_put_super couldn't free spool due to an outstanding
          * quota reference, free it now.
          */
-       unlock_or_release_subpool(spool);
+       unlock_or_release_subpool(spool, flags);
  
         return ret;
  }
@@ -463,7 +466,7 @@ static int allocate_file_region_entries(struct resv_map *resv,
                               resv->region_cache_count;
  
                 /* At this point, we should have enough entries in the cache
-                * for all the existings adds_in_progress. We should only be
+                * for all the existing adds_in_progress. We should only be
                  * needing to allocate for regions_needed.
                  */
                 VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
@@ -553,7 +556,6 @@ retry:
         resv->adds_in_progress -= in_regions_needed;
  
         spin_unlock(&resv->lock);
-       VM_BUG_ON(add < 0);
         return add;
  }
  
@@ -743,13 +745,20 @@ void hugetlb_fix_reserve_counts(struct inode *inode)
  {
         struct hugepage_subpool *spool = subpool_inode(inode);
         long rsv_adjust;
+       bool reserved = false;
  
         rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (rsv_adjust) {
+       if (rsv_adjust > 0) {
                 struct hstate *h = hstate_inode(inode);
  
-               hugetlb_acct_memory(h, 1);
+               if (!hugetlb_acct_memory(h, 1))
+                       reserved = true;
+       } else if (!rsv_adjust) {
+               reserved = true;
         }
+
+       if (!reserved)
+               pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
  }
  
  /*
@@ -1059,6 +1068,8 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
  static void enqueue_huge_page(struct hstate *h, struct page *page)
  {
         int nid = page_to_nid(page);
+
+       lockdep_assert_held(&hugetlb_lock);
         list_move(&page->lru, &h->hugepage_freelists[nid]);
         h->free_huge_pages++;
         h->free_huge_pages_node[nid]++;
@@ -1068,10 +1079,11 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
  static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
  {
         struct page *page;
-       bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+       bool pin = !!(current->flags & PF_MEMALLOC_PIN);
  
+       lockdep_assert_held(&hugetlb_lock);
         list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
-               if (nocma && is_migrate_cma_page(page))
+               if (pin && !is_pinnable_page(page))
                         continue;
  
                 if (PageHWPoison(page))
@@ -1205,7 +1217,7 @@ static int hstate_next_node_to_alloc(struct hstate *h,
  }
  
  /*
- * helper for free_pool_huge_page() - return the previously saved
+ * helper for remove_pool_huge_page() - return the previously saved
   * node ["this node"] from which to free a huge page.  Advance the
   * next node id whether or not we find a free huge page to free so
   * that the next attempt to free addresses the next node.
@@ -1273,7 +1285,7 @@ static void free_gigantic_page(struct page *page, unsigned int order)
  static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                 int nid, nodemask_t *nodemask)
  {
-       unsigned long nr_pages = 1UL << huge_page_order(h);
+       unsigned long nr_pages = pages_per_huge_page(h);
         if (nid == NUMA_NO_NODE)
                 nid = numa_mem_id();
  
@@ -1327,6 +1339,42 @@ static inline void destroy_compound_gigantic_page(struct page *page,
                                                 unsigned int order) { }
  #endif
  
+/*
+ * Remove hugetlb page from lists, and update dtor so that page appears
+ * as just a compound page.  A reference is held on the page.
+ *
+ * Must be called with hugetlb lock held.
+ */
+static void remove_hugetlb_page(struct hstate *h, struct page *page,
+                                                       bool adjust_surplus)
+{
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
+
+       lockdep_assert_held(&hugetlb_lock);
+       if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+               return;
+
+       list_del(&page->lru);
+
+       if (HPageFreed(page)) {
+               h->free_huge_pages--;
+               h->free_huge_pages_node[nid]--;
+       }
+       if (adjust_surplus) {
+               h->surplus_huge_pages--;
+               h->surplus_huge_pages_node[nid]--;
+       }
+
+       set_page_refcounted(page);
+       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
+
+       h->nr_huge_pages--;
+       h->nr_huge_pages_node[nid]--;
+}
+
  static void update_and_free_page(struct hstate *h, struct page *page)
  {
         int i;
@@ -1335,8 +1383,6 @@ static void update_and_free_page(struct hstate *h, struct page *page)
         if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                 return;
  
-       h->nr_huge_pages--;
-       h->nr_huge_pages_node[page_to_nid(page)]--;
         for (i = 0; i < pages_per_huge_page(h);
              i++, subpage = mem_map_next(subpage, page, i)) {
                 subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1344,24 +1390,24 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                 1 << PG_active | 1 << PG_private |
                                 1 << PG_writeback);
         }
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
-       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
-       set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
-       set_page_refcounted(page);
         if (hstate_is_gigantic(h)) {
-               /*
-                * Temporarily drop the hugetlb_lock, because
-                * we might block in free_gigantic_page().
-                */
-               spin_unlock(&hugetlb_lock);
                 destroy_compound_gigantic_page(page, huge_page_order(h));
                 free_gigantic_page(page, huge_page_order(h));
-               spin_lock(&hugetlb_lock);
         } else {
                 __free_pages(page, huge_page_order(h));
         }
  }
  
+static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
+{
+       struct page *page, *t_page;
+
+       list_for_each_entry_safe(page, t_page, list, lru) {
+               update_and_free_page(h, page);
+               cond_resched();
+       }
+}
+
  struct hstate *size_to_hstate(unsigned long size)
  {
         struct hstate *h;
@@ -1373,7 +1419,7 @@ struct hstate *size_to_hstate(unsigned long size)
         return NULL;
  }
  
-static void __free_huge_page(struct page *page)
+void free_huge_page(struct page *page)
  {
         /*
          * Can't pass hstate in here because it is called from the
@@ -1383,6 +1429,7 @@ static void __free_huge_page(struct page *page)
         int nid = page_to_nid(page);
         struct hugepage_subpool *spool = hugetlb_page_subpool(page);
         bool restore_reserve;
+       unsigned long flags;
  
         VM_BUG_ON_PAGE(page_count(page), page);
         VM_BUG_ON_PAGE(page_mapcount(page), page);
@@ -1411,7 +1458,7 @@ static void __free_huge_page(struct page *page)
                         restore_reserve = true;
         }
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irqsave(&hugetlb_lock, flags);
         ClearHPageMigratable(page);
         hugetlb_cgroup_uncharge_page(hstate_index(h),
                                      pages_per_huge_page(h), page);
@@ -1421,82 +1468,46 @@ static void __free_huge_page(struct page *page)
                 h->resv_huge_pages++;
  
         if (HPageTemporary(page)) {
-               list_del(&page->lru);
-               ClearHPageTemporary(page);
+               remove_hugetlb_page(h, page, false);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                 update_and_free_page(h, page);
         } else if (h->surplus_huge_pages_node[nid]) {
                 /* remove the page from active list */
-               list_del(&page->lru);
+               remove_hugetlb_page(h, page, true);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
                 update_and_free_page(h, page);
-               h->surplus_huge_pages--;
-               h->surplus_huge_pages_node[nid]--;
         } else {
                 arch_clear_hugepage_flags(page);
                 enqueue_huge_page(h, page);
+               spin_unlock_irqrestore(&hugetlb_lock, flags);
         }
-       spin_unlock(&hugetlb_lock);
  }
  
  /*
- * As free_huge_page() can be called from a non-task context, we have
- * to defer the actual freeing in a workqueue to prevent potential
- * hugetlb_lock deadlock.
- *
- * free_hpage_workfn() locklessly retrieves the linked list of pages to
- * be freed and frees them one-by-one. As the page->mapping pointer is
- * going to be cleared in __free_huge_page() anyway, it is reused as the
- * llist_node structure of a lockless linked list of huge pages to be freed.
+ * Must be called with the hugetlb lock held
   */
-static LLIST_HEAD(hpage_freelist);
-
-static void free_hpage_workfn(struct work_struct *work)
+static void __prep_account_new_huge_page(struct hstate *h, int nid)
  {
-       struct llist_node *node;
-       struct page *page;
-
-       node = llist_del_all(&hpage_freelist);
-
-       while (node) {
-               page = container_of((struct address_space **)node,
-                                    struct page, mapping);
-               node = node->next;
-               __free_huge_page(page);
-       }
-}
-static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
-
-void free_huge_page(struct page *page)
-{
-       /*
-        * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
-        */
-       if (!in_task()) {
-               /*
-                * Only call schedule_work() if hpage_freelist is previously
-                * empty. Otherwise, schedule_work() had been called but the
-                * workfn hasn't retrieved the list yet.
-                */
-               if (llist_add((struct llist_node *)&page->mapping,
-                             &hpage_freelist))
-                       schedule_work(&free_hpage_work);
-               return;
-       }
-
-       __free_huge_page(page);
+       lockdep_assert_held(&hugetlb_lock);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
  }
  
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+static void __prep_new_huge_page(struct page *page)
  {
         INIT_LIST_HEAD(&page->lru);
         set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
         hugetlb_set_page_subpool(page, NULL);
         set_hugetlb_cgroup(page, NULL);
         set_hugetlb_cgroup_rsvd(page, NULL);
-       spin_lock(&hugetlb_lock);
-       h->nr_huge_pages++;
-       h->nr_huge_pages_node[nid]++;
-       ClearHPageFreed(page);
-       spin_unlock(&hugetlb_lock);
+}
+
+static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
+{
+       __prep_new_huge_page(page);
+       spin_lock_irq(&hugetlb_lock);
+       __prep_account_new_huge_page(h, nid);
+       spin_unlock_irq(&hugetlb_lock);
  }
  
  static void prep_compound_gigantic_page(struct page *page, unsigned int order)
@@ -1693,17 +1704,20 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
  }
  
  /*
- * Free huge page from pool from next node to free.
- * Attempt to keep persistent huge pages more or less
- * balanced over allowed nodes.
+ * Remove huge page from pool from next node to free.  Attempt to keep
+ * persistent huge pages more or less balanced over allowed nodes.
+ * This routine only 'removes' the hugetlb page.  The caller must make
+ * an additional call to free the page to low level allocators.
   * Called with hugetlb_lock locked.
   */
-static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
-                                                        bool acct_surplus)
+static struct page *remove_pool_huge_page(struct hstate *h,
+                                               nodemask_t *nodes_allowed,
+                                                bool acct_surplus)
  {
         int nr_nodes, node;
-       int ret = 0;
+       struct page *page = NULL;
  
+       lockdep_assert_held(&hugetlb_lock);
         for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
                 /*
                  * If we're returning unused surplus pages, only examine
@@ -1711,23 +1725,14 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
                  */
                 if ((!acct_surplus || h->surplus_huge_pages_node[node]) &&
                     !list_empty(&h->hugepage_freelists[node])) {
-                       struct page *page =
-                               list_entry(h->hugepage_freelists[node].next,
+                       page = list_entry(h->hugepage_freelists[node].next,
                                           struct page, lru);
-                       list_del(&page->lru);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[node]--;
-                       if (acct_surplus) {
-                               h->surplus_huge_pages--;
-                               h->surplus_huge_pages_node[node]--;
-                       }
-                       update_and_free_page(h, page);
-                       ret = 1;
+                       remove_hugetlb_page(h, page, acct_surplus);
                         break;
                 }
         }
  
-       return ret;
+       return page;
  }
  
  /*
@@ -1749,7 +1754,7 @@ retry:
         if (!PageHuge(page))
                 return 0;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         if (!PageHuge(page)) {
                 rc = 0;
                 goto out;
@@ -1758,7 +1763,6 @@ retry:
         if (!page_count(page)) {
                 struct page *head = compound_head(page);
                 struct hstate *h = page_hstate(head);
-               int nid = page_to_nid(head);
                 if (h->free_huge_pages - h->resv_huge_pages == 0)
                         goto out;
  
@@ -1767,7 +1771,7 @@ retry:
                  * when it is dissolved.
                  */
                 if (unlikely(!HPageFreed(head))) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                         cond_resched();
  
                         /*
@@ -1789,15 +1793,14 @@ retry:
                         SetPageHWPoison(page);
                         ClearPageHWPoison(head);
                 }
-               list_del(&head->lru);
-               h->free_huge_pages--;
-               h->free_huge_pages_node[nid]--;
+               remove_hugetlb_page(h, page, false);
                 h->max_huge_pages--;
+               spin_unlock_irq(&hugetlb_lock);
                 update_and_free_page(h, head);
-               rc = 0;
+               return 0;
         }
  out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
         return rc;
  }
  
@@ -1839,16 +1842,16 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         if (hstate_is_gigantic(h))
                 return NULL;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages)
                 goto out_unlock;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
         if (!page)
                 return NULL;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         /*
          * We could have raced with the pool size change.
          * Double check that and simply deallocate the new page
@@ -1858,7 +1861,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
          */
         if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
                 SetHPageTemporary(page);
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                 put_page(page);
                 return NULL;
         } else {
@@ -1867,7 +1870,7 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask,
         }
  
  out_unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         return page;
  }
@@ -1917,17 +1920,17 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
  struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
                 nodemask_t *nmask, gfp_t gfp_mask)
  {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         if (h->free_huge_pages - h->resv_huge_pages > 0) {
                 struct page *page;
  
                 page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask);
                 if (page) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
                         return page;
                 }
         }
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask);
  }
@@ -1964,6 +1967,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
         long needed, allocated;
         bool alloc_ok = true;
  
+       lockdep_assert_held(&hugetlb_lock);
         needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
         if (needed <= 0) {
                 h->resv_huge_pages += delta;
@@ -1975,7 +1979,7 @@ static int gather_surplus_pages(struct hstate *h, long delta)
  
         ret = -ENOMEM;
  retry:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
         for (i = 0; i < needed; i++) {
                 page = alloc_surplus_huge_page(h, htlb_alloc_mask(h),
                                 NUMA_NO_NODE, NULL);
@@ -1992,7 +1996,7 @@ retry:
          * After retaking hugetlb_lock, we need to recalculate 'needed'
          * because either resv_huge_pages or free_huge_pages may have changed.
          */
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         needed = (h->resv_huge_pages + delta) -
                         (h->free_huge_pages + allocated);
         if (needed > 0) {
@@ -2032,12 +2036,12 @@ retry:
                 enqueue_huge_page(h, page);
         }
  free:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         /* Free unnecessary surplus pages to the buddy allocator */
         list_for_each_entry_safe(page, tmp, &surplus_list, lru)
                 put_page(page);
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
  
         return ret;
  }
@@ -2049,17 +2053,17 @@ free:
   *    to the associated reservation map.
   * 2) Free any unused surplus pages that may have been allocated to satisfy
   *    the reservation.  As many as unused_resv_pages may be freed.
- *
- * Called with hugetlb_lock held.  However, the lock could be dropped (and
- * reacquired) during calls to cond_resched_lock.  Whenever dropping the lock,
- * we must make sure nobody else can claim pages we are in the process of
- * freeing.  Do this by ensuring resv_huge_page always is greater than the
- * number of huge pages we plan to free when dropping the lock.
   */
  static void return_unused_surplus_pages(struct hstate *h,
                                         unsigned long unused_resv_pages)
  {
         unsigned long nr_pages;
+       struct page *page;
+       LIST_HEAD(page_list);
+
+       lockdep_assert_held(&hugetlb_lock);
+       /* Uncommit the reservation */
+       h->resv_huge_pages -= unused_resv_pages;
  
         /* Cannot return gigantic pages currently */
         if (hstate_is_gigantic(h))
@@ -2076,24 +2080,21 @@ static void return_unused_surplus_pages(struct hstate *h,
          * evenly across all nodes with memory. Iterate across these nodes
          * until we can no longer free unreserved surplus pages. This occurs
          * when the nodes with surplus pages have no free pages.
-        * free_pool_huge_page() will balance the freed pages across the
+        * remove_pool_huge_page() will balance the freed pages across the
          * on-line nodes with memory and will handle the hstate accounting.
-        *
-        * Note that we decrement resv_huge_pages as we free the pages.  If
-        * we drop the lock, resv_huge_pages will still be sufficiently large
-        * to cover subsequent pages we may free.
          */
         while (nr_pages--) {
-               h->resv_huge_pages--;
-               unused_resv_pages--;
-               if (!free_pool_huge_page(h, &node_states[N_MEMORY], 1))
+               page = remove_pool_huge_page(h, &node_states[N_MEMORY], 1);
+               if (!page)
                         goto out;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
         }
  
  out:
-       /* Fully uncommit the reservation */
-       h->resv_huge_pages -= unused_resv_pages;
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
  }
  
  
@@ -2175,27 +2176,26 @@ static long __vma_reservation_common(struct hstate *h,
  
         if (vma->vm_flags & VM_MAYSHARE)
                 return ret;
-       else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) {
-               /*
-                * In most cases, reserves always exist for private mappings.
-                * However, a file associated with mapping could have been
-                * hole punched or truncated after reserves were consumed.
-                * As subsequent fault on such a range will not use reserves.
-                * Subtle - The reserve map for private mappings has the
-                * opposite meaning than that of shared mappings.  If NO
-                * entry is in the reserve map, it means a reservation exists.
-                * If an entry exists in the reserve map, it means the
-                * reservation has already been consumed.  As a result, the
-                * return value of this routine is the opposite of the
-                * value returned from reserve map manipulation routines above.
-                */
-               if (ret)
-                       return 0;
-               else
-                       return 1;
-       }
-       else
-               return ret < 0 ? ret : 0;
+       /*
+        * We know private mapping must have HPAGE_RESV_OWNER set.
+        *
+        * In most cases, reserves always exist for private mappings.
+        * However, a file associated with mapping could have been
+        * hole punched or truncated after reserves were consumed.
+        * As subsequent fault on such a range will not use reserves.
+        * Subtle - The reserve map for private mappings has the
+        * opposite meaning than that of shared mappings.  If NO
+        * entry is in the reserve map, it means a reservation exists.
+        * If an entry exists in the reserve map, it means the
+        * reservation has already been consumed.  As a result, the
+        * return value of this routine is the opposite of the
+        * value returned from reserve map manipulation routines above.
+        */
+       if (ret > 0)
+               return 0;
+       if (ret == 0)
+               return 1;
+       return ret;
  }
  
  static long vma_needs_reservation(struct hstate *h,
@@ -2266,6 +2266,134 @@ static void restore_reserve_on_error(struct hstate *h,
         }
  }
  
+/*
+ * alloc_and_dissolve_huge_page - Allocate a new page and dissolve the old one
+ * @h: struct hstate old page belongs to
+ * @old_page: Old page to dissolve
+ * @list: List to isolate the page in case we need to
+ * Returns 0 on success, otherwise negated error.
+ */
+static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
+                                       struct list_head *list)
+{
+       gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+       int nid = page_to_nid(old_page);
+       struct page *new_page;
+       int ret = 0;
+
+       /*
+        * Before dissolving the page, we need to allocate a new one for the
+        * pool to remain stable. Using alloc_buddy_huge_page() allows us to
+        * not having to deal with prep_new_huge_page() and avoids dealing of any
+        * counters. This simplifies and let us do the whole thing under the
+        * lock.
+        */
+       new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
+       if (!new_page)
+               return -ENOMEM;
+
+retry:
+       spin_lock_irq(&hugetlb_lock);
+       if (!PageHuge(old_page)) {
+               /*
+                * Freed from under us. Drop new_page too.
+                */
+               goto free_new;
+       } else if (page_count(old_page)) {
+               /*
+                * Someone has grabbed the page, try to isolate it here.
+                * Fail with -EBUSY if not possible.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               if (!isolate_huge_page(old_page, list))
+                       ret = -EBUSY;
+               spin_lock_irq(&hugetlb_lock);
+               goto free_new;
+       } else if (!HPageFreed(old_page)) {
+               /*
+                * Page's refcount is 0 but it has not been enqueued in the
+                * freelist yet. Race window is small, so we can succeed here if
+                * we retry.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               cond_resched();
+               goto retry;
+       } else {
+               /*
+                * Ok, old_page is still a genuine free hugepage. Remove it from
+                * the freelist and decrease the counters. These will be
+                * incremented again when calling __prep_account_new_huge_page()
+                * and enqueue_huge_page() for new_page. The counters will remain
+                * stable since this happens under the lock.
+                */
+               remove_hugetlb_page(h, old_page, false);
+
+               /*
+                * new_page needs to be initialized with the standard hugetlb
+                * state. This is normally done by prep_new_huge_page() but
+                * that takes hugetlb_lock which is already held so we need to
+                * open code it here.
+                * Reference count trick is needed because allocator gives us
+                * referenced page but the pool requires pages with 0 refcount.
+                */
+               __prep_new_huge_page(new_page);
+               __prep_account_new_huge_page(h, nid);
+               page_ref_dec(new_page);
+               enqueue_huge_page(h, new_page);
+
+               /*
+                * Pages have been replaced, we can safely free the old one.
+                */
+               spin_unlock_irq(&hugetlb_lock);
+               update_and_free_page(h, old_page);
+       }
+
+       return ret;
+
+free_new:
+       spin_unlock_irq(&hugetlb_lock);
+       __free_pages(new_page, huge_page_order(h));
+
+       return ret;
+}
+
+int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list)
+{
+       struct hstate *h;
+       struct page *head;
+       int ret = -EBUSY;
+
+       /*
+        * The page might have been dissolved from under our feet, so make sure
+        * to carefully check the state under the lock.
+        * Return success when racing as if we dissolved the page ourselves.
+        */
+       spin_lock_irq(&hugetlb_lock);
+       if (PageHuge(page)) {
+               head = compound_head(page);
+               h = page_hstate(head);
+       } else {
+               spin_unlock_irq(&hugetlb_lock);
+               return 0;
+       }
+       spin_unlock_irq(&hugetlb_lock);
+
+       /*
+        * Fence off gigantic pages as there is a cyclic dependency between
+        * alloc_contig_range and them. Return -ENOMEM as this has the effect
+        * of bailing out right away without further retrying.
+        */
+       if (hstate_is_gigantic(h))
+               return -ENOMEM;
+
+       if (page_count(head) && isolate_huge_page(head, list))
+               ret = 0;
+       else if (!page_count(head))
+               ret = alloc_and_dissolve_huge_page(h, head, list);
+
+       return ret;
+}
+
  struct page *alloc_huge_page(struct vm_area_struct *vma,
                                     unsigned long addr, int avoid_reserve)
  {
@@ -2316,7 +2444,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
  
         /* If this allocation is not consuming a reservation, charge it now.
          */
-       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       deferred_reserve = map_chg || avoid_reserve;
         if (deferred_reserve) {
                 ret = hugetlb_cgroup_charge_cgroup_rsvd(
                         idx, pages_per_huge_page(h), &h_cg);
@@ -2328,7 +2456,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
         if (ret)
                 goto out_uncharge_cgroup_reservation;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         /*
          * glb_chg is passed to indicate whether or not a page must be taken
          * from the global free pool (global change).  gbl_chg == 0 indicates
@@ -2336,7 +2464,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
          */
         page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg);
         if (!page) {
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
                 page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
                 if (!page)
                         goto out_uncharge_cgroup;
@@ -2344,7 +2472,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                         SetHPageRestoreReserve(page);
                         h->resv_huge_pages--;
                 }
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                 list_add(&page->lru, &h->hugepage_activelist);
                 /* Fall through */
         }
@@ -2357,7 +2485,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                                                   h_cg, page);
         }
  
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         hugetlb_set_page_subpool(page, spool);
  
@@ -2547,24 +2675,32 @@ static void try_to_free_low(struct hstate *h, unsigned long count,
                                                 nodemask_t *nodes_allowed)
  {
         int i;
+       LIST_HEAD(page_list);
  
+       lockdep_assert_held(&hugetlb_lock);
         if (hstate_is_gigantic(h))
                 return;
  
+       /*
+        * Collect pages to be freed on a list, and free after dropping lock
+        */
         for_each_node_mask(i, *nodes_allowed) {
                 struct page *page, *next;
                 struct list_head *freel = &h->hugepage_freelists[i];
                 list_for_each_entry_safe(page, next, freel, lru) {
                         if (count >= h->nr_huge_pages)
-                               return;
+                               goto out;
                         if (PageHighMem(page))
                                 continue;
-                       list_del(&page->lru);
-                       update_and_free_page(h, page);
-                       h->free_huge_pages--;
-                       h->free_huge_pages_node[page_to_nid(page)]--;
+                       remove_hugetlb_page(h, page, false);
+                       list_add(&page->lru, &page_list);
                 }
         }
+
+out:
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
  }
  #else
  static inline void try_to_free_low(struct hstate *h, unsigned long count,
@@ -2583,6 +2719,7 @@ static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed,
  {
         int nr_nodes, node;
  
+       lockdep_assert_held(&hugetlb_lock);
         VM_BUG_ON(delta != -1 && delta != 1);
  
         if (delta < 0) {
@@ -2610,6 +2747,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                               nodemask_t *nodes_allowed)
  {
         unsigned long min_count, ret;
+       struct page *page;
+       LIST_HEAD(page_list);
         NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
  
         /*
@@ -2622,7 +2761,12 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         else
                 return -ENOMEM;
  
-       spin_lock(&hugetlb_lock);
+       /*
+        * resize_lock mutex prevents concurrent adjustments to number of
+        * pages in hstate via the proc/sysfs interfaces.
+        */
+       mutex_lock(&h->resize_lock);
+       spin_lock_irq(&hugetlb_lock);
  
         /*
          * Check for a node specific request.
@@ -2653,7 +2797,8 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
          */
         if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
                 if (count > persistent_huge_pages(h)) {
-                       spin_unlock(&hugetlb_lock);
+                       spin_unlock_irq(&hugetlb_lock);
+                       mutex_unlock(&h->resize_lock);
                         NODEMASK_FREE(node_alloc_noretry);
                         return -EINVAL;
                 }
@@ -2682,14 +2827,14 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
                  * page, free_huge_page will handle it by freeing the page
                  * and reducing the surplus.
                  */
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
  
                 /* yield cpu to avoid soft lockup */
                 cond_resched();
  
                 ret = alloc_pool_huge_page(h, nodes_allowed,
                                                 node_alloc_noretry);
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                 if (!ret)
                         goto out;
  
@@ -2716,18 +2861,30 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages;
         min_count = max(count, min_count);
         try_to_free_low(h, min_count, nodes_allowed);
+
+       /*
+        * Collect pages to be removed on list without dropping lock
+        */
         while (min_count < persistent_huge_pages(h)) {
-               if (!free_pool_huge_page(h, nodes_allowed, 0))
+               page = remove_pool_huge_page(h, nodes_allowed, 0);
+               if (!page)
                         break;
-               cond_resched_lock(&hugetlb_lock);
+
+               list_add(&page->lru, &page_list);
         }
+       /* free the pages after dropping lock */
+       spin_unlock_irq(&hugetlb_lock);
+       update_and_free_pages_bulk(h, &page_list);
+       spin_lock_irq(&hugetlb_lock);
+
         while (count < persistent_huge_pages(h)) {
                 if (!adjust_pool_surplus(h, nodes_allowed, 1))
                         break;
         }
  out:
         h->max_huge_pages = persistent_huge_pages(h);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
+       mutex_unlock(&h->resize_lock);
  
         NODEMASK_FREE(node_alloc_noretry);
  
@@ -2882,9 +3039,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
         if (err)
                 return err;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         h->nr_overcommit_huge_pages = input;
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
  
         return count;
  }
@@ -3215,6 +3372,7 @@ void __init hugetlb_add_hstate(unsigned int order)
         BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
         BUG_ON(order == 0);
         h = &hstates[hugetlb_max_hstate++];
+       mutex_init(&h->resize_lock);
         h->order = order;
         h->mask = ~(huge_page_size(h) - 1);
         for (i = 0; i < MAX_NUMNODES; ++i)
@@ -3267,10 +3425,10 @@ static int __init hugepages_setup(char *s)
  
         /*
          * Global state is always initialized later in hugetlb_init.
-        * But we need to allocate >= MAX_ORDER hstates here early to still
+        * But we need to allocate gigantic hstates here early to still
          * use the bootmem allocator.
          */
-       if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER)
+       if (hugetlb_max_hstate && hstate_is_gigantic(parsed_hstate))
                 hugetlb_hstate_alloc_pages(parsed_hstate);
  
         last_mhp = mhp;
@@ -3470,9 +3628,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
                 goto out;
  
         if (write) {
-               spin_lock(&hugetlb_lock);
+               spin_lock_irq(&hugetlb_lock);
                 h->nr_overcommit_huge_pages = tmp;
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
         }
  out:
         return ret;
@@ -3568,7 +3726,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
         if (!delta)
                 return 0;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         /*
          * When cpuset is configured, it breaks the strict hugetlb page
          * reservation as the accounting is done on a global variable. Such
@@ -3607,7 +3765,7 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
                 return_unused_surplus_pages(h, (unsigned long) -delta);
  
  out:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
         return ret;
  }
  
@@ -3795,7 +3953,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                 src_pte = huge_pte_offset(src, addr, sz);
                 if (!src_pte)
                         continue;
-               dst_pte = huge_pte_alloc(dst, addr, sz);
+               dst_pte = huge_pte_alloc(dst, vma, addr, sz);
                 if (!dst_pte) {
                         ret = -ENOMEM;
                         break;
@@ -3898,6 +4056,7 @@ again:
                                  * See Documentation/vm/mmu_notifier.rst
                                  */
                                 huge_ptep_set_wrprotect(src, addr, src_pte);
+                               entry = huge_pte_wrprotect(entry);
                         }
  
                         page_dup_rmap(ptepage, true);
@@ -4310,6 +4469,44 @@ int huge_add_to_page_cache(struct page *page, struct address_space *mapping,
         return 0;
  }
  
+static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
+                                                 struct address_space *mapping,
+                                                 pgoff_t idx,
+                                                 unsigned int flags,
+                                                 unsigned long haddr,
+                                                 unsigned long reason)
+{
+       vm_fault_t ret;
+       u32 hash;
+       struct vm_fault vmf = {
+               .vma = vma,
+               .address = haddr,
+               .flags = flags,
+
+               /*
+                * Hard to debug if it ends up being
+                * used by a callee that assumes
+                * something about the other
+                * uninitialized fields... same as in
+                * memory.c
+                */
+       };
+
+       /*
+        * hugetlb_fault_mutex and i_mmap_rwsem must be
+        * dropped before handling userfault.  Reacquire
+        * after handling fault to make calling code simpler.
+        */
+       hash = hugetlb_fault_mutex_hash(mapping, idx);
+       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
+       ret = handle_userfault(&vmf, reason);
+       i_mmap_lock_read(mapping);
+       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
+       return ret;
+}
+
  static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                         struct vm_area_struct *vma,
                         struct address_space *mapping, pgoff_t idx,
@@ -4348,35 +4545,11 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
  retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
-               /*
-                * Check for page in userfault range
-                */
+               /* Check for page in userfault range */
                 if (userfaultfd_missing(vma)) {
-                       u32 hash;
-                       struct vm_fault vmf = {
-                               .vma = vma,
-                               .address = haddr,
-                               .flags = flags,
-                               /*
-                                * Hard to debug if it ends up being
-                                * used by a callee that assumes
-                                * something about the other
-                                * uninitialized fields... same as in
-                                * memory.c
-                                */
-                       };
-
-                       /*
-                        * hugetlb_fault_mutex and i_mmap_rwsem must be
-                        * dropped before handling userfault.  Reacquire
-                        * after handling fault to make calling code simpler.
-                        */
-                       hash = hugetlb_fault_mutex_hash(mapping, idx);
-                       mutex_unlock(&hugetlb_fault_mutex_table[hash]);
-                       i_mmap_unlock_read(mapping);
-                       ret = handle_userfault(&vmf, VM_UFFD_MISSING);
-                       i_mmap_lock_read(mapping);
-                       mutex_lock(&hugetlb_fault_mutex_table[hash]);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MISSING);
                         goto out;
                 }
  
@@ -4395,13 +4568,10 @@ retry:
                          * sure there really is no pte entry.
                          */
                         ptl = huge_pte_lock(h, mm, ptep);
-                       if (!huge_pte_none(huge_ptep_get(ptep))) {
-                               ret = 0;
-                               spin_unlock(ptl);
-                               goto out;
-                       }
+                       ret = 0;
+                       if (huge_pte_none(huge_ptep_get(ptep)))
+                               ret = vmf_error(PTR_ERR(page));
                         spin_unlock(ptl);
-                       ret = vmf_error(PTR_ERR(page));
                         goto out;
                 }
                 clear_huge_page(page, address, pages_per_huge_page(h));
@@ -4435,6 +4605,16 @@ retry:
                                 VM_FAULT_SET_HINDEX(hstate_index(h));
                         goto backout_unlocked;
                 }
+
+               /* Check for page in userfault range. */
+               if (userfaultfd_minor(vma)) {
+                       unlock_page(page);
+                       put_page(page);
+                       ret = hugetlb_handle_userfault(vma, mapping, idx,
+                                                      flags, haddr,
+                                                      VM_UFFD_MINOR);
+                       goto out;
+               }
         }
  
         /*
@@ -4563,7 +4743,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
          */
         mapping = vma->vm_file->f_mapping;
         i_mmap_lock_read(mapping);
-       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h));
         if (!ptep) {
                 i_mmap_unlock_read(mapping);
                 return VM_FAULT_OOM;
@@ -4675,6 +4855,7 @@ out_mutex:
         return ret;
  }
  
+#ifdef CONFIG_USERFAULTFD
  /*
   * Used by userfaultfd UFFDIO_COPY.  Based on mcopy_atomic_pte with
   * modifications for huge pages.
@@ -4684,8 +4865,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr,
                             unsigned long src_addr,
+                           enum mcopy_atomic_mode mode,
                             struct page **pagep)
  {
+       bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
         struct address_space *mapping;
         pgoff_t idx;
         unsigned long size;
@@ -4695,8 +4878,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         spinlock_t *ptl;
         int ret;
         struct page *page;
+       int writable;
+
+       mapping = dst_vma->vm_file->f_mapping;
+       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
  
-       if (!*pagep) {
+       if (is_continue) {
+               ret = -EFAULT;
+               page = find_lock_page(mapping, idx);
+               if (!page)
+                       goto out;
+       } else if (!*pagep) {
                 ret = -ENOMEM;
                 page = alloc_huge_page(dst_vma, dst_addr, 0);
                 if (IS_ERR(page))
@@ -4725,13 +4917,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
          */
         __SetPageUptodate(page);
  
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
-
-       /*
-        * If shared, add to page cache
-        */
-       if (vm_shared) {
+       /* Add shared, newly allocated pages to the page cache. */
+       if (vm_shared && !is_continue) {
                 size = i_size_read(mapping->host) >> huge_page_shift(h);
                 ret = -EFAULT;
                 if (idx >= size)
@@ -4776,8 +4963,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                 hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
         }
  
-       _dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
-       if (dst_vma->vm_flags & VM_WRITE)
+       /* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
+       if (is_continue && !vm_shared)
+               writable = 0;
+       else
+               writable = dst_vma->vm_flags & VM_WRITE;
+
+       _dst_pte = make_huge_pte(dst_vma, page, writable);
+       if (writable)
                 _dst_pte = huge_pte_mkdirty(_dst_pte);
         _dst_pte = pte_mkyoung(_dst_pte);
  
@@ -4791,20 +4984,22 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
         update_mmu_cache(dst_vma, dst_addr, dst_pte);
  
         spin_unlock(ptl);
-       SetHPageMigratable(page);
-       if (vm_shared)
+       if (!is_continue)
+               SetHPageMigratable(page);
+       if (vm_shared || is_continue)
                 unlock_page(page);
         ret = 0;
  out:
         return ret;
  out_release_unlock:
         spin_unlock(ptl);
-       if (vm_shared)
+       if (vm_shared || is_continue)
                 unlock_page(page);
  out_release_nounlock:
         put_page(page);
         goto out;
  }
+#endif /* CONFIG_USERFAULTFD */
  
  static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma,
                                  int refs, struct page **pages,
@@ -4996,14 +5191,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
         return i ? i : err;
  }
  
-#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
-/*
- * ARCHes with special requirements for evicting HUGETLB backing TLB entries can
- * implement this.
- */
-#define flush_hugetlb_tlb_range(vma, addr, end)        flush_tlb_range(vma, addr, end)
-#endif
-
  unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                 unsigned long address, unsigned long end, pgprot_t newprot)
  {
@@ -5280,6 +5467,9 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
         /*
          * If the subpool has a minimum size, the number of global
          * reservations to be released may be adjusted.
+        *
+        * Note that !resv_map implies freed == 0. So (chg - freed)
+        * won't go negative.
          */
         gbl_reserve = hugepage_subpool_put_pages(spool, (chg - freed));
         hugetlb_acct_memory(h, -gbl_reserve);
@@ -5326,6 +5516,15 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
         return false;
  }
  
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+#ifdef CONFIG_USERFAULTFD
+       if (uffd_disable_huge_pmd_share(vma))
+               return false;
+#endif
+       return vma_shareable(vma, addr);
+}
+
  /*
   * Determine if start,end range within vma could be mapped by shared pmd.
   * If yes, adjust start and end to cover range associated with possible
@@ -5338,8 +5537,8 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                 v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
  
         /*
-        * vma need span at least one aligned PUD size and the start,end range
-        * must at least partialy within it.
+        * vma needs to span at least one aligned PUD size, and the range
+        * must be at least partially within in.
          */
         if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
                 (*end <= v_start) || (*start >= v_end))
@@ -5370,9 +5569,9 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
   * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
   * only required for subsequent processing.
   */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
  {
-       struct vm_area_struct *vma = find_vma(mm, addr);
         struct address_space *mapping = vma->vm_file->f_mapping;
         pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
                         vma->vm_pgoff;
@@ -5382,9 +5581,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
         pte_t *pte;
         spinlock_t *ptl;
  
-       if (!vma_shareable(vma, addr))
-               return (pte_t *)pmd_alloc(mm, pud, addr);
-
         i_mmap_assert_locked(mapping);
         vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                 if (svma == vma)
@@ -5448,9 +5644,10 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
         *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
         return 1;
  }
-#define want_pmd_share()       (1)
+
  #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
-pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
+                     unsigned long addr, pud_t *pud)
  {
         return NULL;
  }
@@ -5465,11 +5662,15 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                 unsigned long *start, unsigned long *end)
  {
  }
-#define want_pmd_share()       (0)
+
+bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
+{
+       return false;
+}
  #endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
  
  #ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
-pte_t *huge_pte_alloc(struct mm_struct *mm,
+pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
                         unsigned long addr, unsigned long sz)
  {
         pgd_t *pgd;
@@ -5487,8 +5688,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
                         pte = (pte_t *)pud;
                 } else {
                         BUG_ON(sz != PMD_SIZE);
-                       if (want_pmd_share() && pud_none(*pud))
-                               pte = huge_pmd_share(mm, addr, pud);
+                       if (want_pmd_share(vma, addr) && pud_none(*pud))
+                               pte = huge_pmd_share(mm, vma, addr, pud);
                         else
                                 pte = (pte_t *)pmd_alloc(mm, pud, addr);
                 }
@@ -5632,7 +5833,7 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
  {
         bool ret = true;
  
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         if (!PageHeadHuge(page) ||
             !HPageMigratable(page) ||
             !get_page_unless_zero(page)) {
@@ -5642,16 +5843,16 @@ bool isolate_huge_page(struct page *page, struct list_head *list)
         ClearHPageMigratable(page);
         list_move_tail(&page->lru, list);
  unlock:
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
         return ret;
  }
  
  void putback_active_hugepage(struct page *page)
  {
-       spin_lock(&hugetlb_lock);
+       spin_lock_irq(&hugetlb_lock);
         SetHPageMigratable(page);
         list_move_tail(&page->lru, &(page_hstate(page))->hugepage_activelist);
-       spin_unlock(&hugetlb_lock);
+       spin_unlock_irq(&hugetlb_lock);
         put_page(page);
  }
  
@@ -5679,13 +5880,70 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                 SetHPageTemporary(oldpage);
                 ClearHPageTemporary(newpage);
  
-               spin_lock(&hugetlb_lock);
+               /*
+                * There is no need to transfer the per-node surplus state
+                * when we do not cross the node.
+                */
+               if (new_nid == old_nid)
+                       return;
+               spin_lock_irq(&hugetlb_lock);
                 if (h->surplus_huge_pages_node[old_nid]) {
                         h->surplus_huge_pages_node[old_nid]--;
                         h->surplus_huge_pages_node[new_nid]++;
                 }
-               spin_unlock(&hugetlb_lock);
+               spin_unlock_irq(&hugetlb_lock);
+       }
+}
+
+/*
+ * This function will unconditionally remove all the shared pmd pgtable entries
+ * within the specific vma for a hugetlbfs memory range.
+ */
+void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
+{
+       struct hstate *h = hstate_vma(vma);
+       unsigned long sz = huge_page_size(h);
+       struct mm_struct *mm = vma->vm_mm;
+       struct mmu_notifier_range range;
+       unsigned long address, start, end;
+       spinlock_t *ptl;
+       pte_t *ptep;
+
+       if (!(vma->vm_flags & VM_MAYSHARE))
+               return;
+
+       start = ALIGN(vma->vm_start, PUD_SIZE);
+       end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
+
+       if (start >= end)
+               return;
+
+       /*
+        * No need to call adjust_range_if_pmd_sharing_possible(), because
+        * we have already done the PUD_SIZE alignment.
+        */
+       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+                               start, end);
+       mmu_notifier_invalidate_range_start(&range);
+       i_mmap_lock_write(vma->vm_file->f_mapping);
+       for (address = start; address < end; address += PUD_SIZE) {
+               unsigned long tmp = address;
+
+               ptep = huge_pte_offset(mm, address, sz);
+               if (!ptep)
+                       continue;
+               ptl = huge_pte_lock(h, mm, ptep);
+               /* We don't want 'address' to be changed */
+               huge_pmd_unshare(mm, vma, &tmp, ptep);
+               spin_unlock(ptl);
         }
+       flush_hugetlb_tlb_range(vma, start, end);
+       i_mmap_unlock_write(vma->vm_file->f_mapping);
+       /*
+        * No need to call mmu_notifier_invalidate_range(), see
+        * Documentation/vm/mmu_notifier.rst.
+        */
+       mmu_notifier_invalidate_range_end(&range);
  }
  
  #ifdef CONFIG_CMA