Merge tag 'nfs-for-5.7-2' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
[linux-2.6-microblaze.git] / mm / hugetlb.c
index dd8737a..cd45915 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/jhash.h>
 #include <linux/numa.h>
 #include <linux/llist.h>
+#include <linux/cma.h>
 
 #include <asm/page.h>
 #include <asm/pgtable.h>
@@ -44,6 +45,9 @@
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
 struct hstate hstates[HUGE_MAX_HSTATE];
+
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+
 /*
  * Minimum page order among possible hugepage sizes, set to a proper value
  * at boot time.
@@ -220,132 +224,303 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma)
        return subpool_inode(file_inode(vma->vm_file));
 }
 
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- *                    across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock.  The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map.  The from and to elements are huge page
- * indicies into the associated mapping.  from indicates the starting index
- * of the region.  to represents the first index past the end of  the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping.  It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
  */
-struct file_region {
-       struct list_head link;
-       long from;
-       long to;
-};
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
+{
+       struct file_region *nrg = NULL;
+
+       VM_BUG_ON(resv->region_cache_count <= 0);
+
+       resv->region_cache_count--;
+       nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+       VM_BUG_ON(!nrg);
+       list_del(&nrg->link);
+
+       nrg->from = from;
+       nrg->to = to;
+
+       return nrg;
+}
+
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+                                             struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       nrg->reservation_counter = rg->reservation_counter;
+       nrg->css = rg->css;
+       if (rg->css)
+               css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+                                               struct hstate *h,
+                                               struct resv_map *resv,
+                                               struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (h_cg) {
+               nrg->reservation_counter =
+                       &h_cg->rsvd_hugepage[hstate_index(h)];
+               nrg->css = &h_cg->css;
+               if (!resv->pages_per_hpage)
+                       resv->pages_per_hpage = pages_per_huge_page(h);
+               /* pages_per_hpage should be the same for all entries in
+                * a resv_map.
+                */
+               VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+       } else {
+               nrg->reservation_counter = NULL;
+               nrg->css = NULL;
+       }
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+                                  struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       return rg && org &&
+              rg->reservation_counter == org->reservation_counter &&
+              rg->css == org->css;
+
+#else
+       return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+       struct file_region *nrg = NULL, *prg = NULL;
+
+       prg = list_prev_entry(rg, link);
+       if (&prg->link != &resv->regions && prg->to == rg->from &&
+           has_same_uncharge_info(prg, rg)) {
+               prg->to = rg->to;
+
+               list_del(&rg->link);
+               kfree(rg);
+
+               coalesce_file_region(resv, prg);
+               return;
+       }
+
+       nrg = list_next_entry(rg, link);
+       if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+           has_same_uncharge_info(nrg, rg)) {
+               nrg->from = rg->from;
+
+               list_del(&rg->link);
+               kfree(rg);
+
+               coalesce_file_region(resv, nrg);
+               return;
+       }
+}
 
 /* Must be called with resv->lock held. Calling this with count_only == true
  * will count the number of pages to be added but will not modify the linked
- * list.
+ * list. If regions_needed != NULL and count_only == true, then regions_needed
+ * will indicate the number of file_regions needed in the cache to carry out to
+ * add the regions for this range.
  */
 static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+                                    struct hugetlb_cgroup *h_cg,
+                                    struct hstate *h, long *regions_needed,
                                     bool count_only)
 {
-       long chg = 0;
+       long add = 0;
        struct list_head *head = &resv->regions;
+       long last_accounted_offset = f;
        struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
 
-       /* Locate the region we are before or in. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
-
-       /* Round our left edge to the current segment if it encloses us. */
-       if (f > rg->from)
-               f = rg->from;
+       if (regions_needed)
+               *regions_needed = 0;
 
-       chg = t - f;
+       /* In this loop, we essentially handle an entry for the range
+        * [last_accounted_offset, rg->from), at every iteration, with some
+        * bounds checking.
+        */
+       list_for_each_entry_safe(rg, trg, head, link) {
+               /* Skip irrelevant regions that start before our range. */
+               if (rg->from < f) {
+                       /* If this region ends after the last accounted offset,
+                        * then we need to update last_accounted_offset.
+                        */
+                       if (rg->to > last_accounted_offset)
+                               last_accounted_offset = rg->to;
+                       continue;
+               }
 
-       /* Check for and consume any regions we now overlap with. */
-       nrg = rg;
-       list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
-               if (&rg->link == head)
-                       break;
+               /* When we find a region that starts beyond our range, we've
+                * finished.
+                */
                if (rg->from > t)
                        break;
 
-               /* We overlap with this area, if it extends further than
-                * us then we must extend ourselves.  Account for its
-                * existing reservation.
+               /* Add an entry for last_accounted_offset -> rg->from, and
+                * update last_accounted_offset.
                 */
-               if (rg->to > t) {
-                       chg += rg->to - t;
-                       t = rg->to;
+               if (rg->from > last_accounted_offset) {
+                       add += rg->from - last_accounted_offset;
+                       if (!count_only) {
+                               nrg = get_file_region_entry_from_cache(
+                                       resv, last_accounted_offset, rg->from);
+                               record_hugetlb_cgroup_uncharge_info(h_cg, h,
+                                                                   resv, nrg);
+                               list_add(&nrg->link, rg->link.prev);
+                               coalesce_file_region(resv, nrg);
+                       } else if (regions_needed)
+                               *regions_needed += 1;
                }
-               chg -= rg->to - rg->from;
 
-               if (!count_only && rg != nrg) {
+               last_accounted_offset = rg->to;
+       }
+
+       /* Handle the case where our range extends beyond
+        * last_accounted_offset.
+        */
+       if (last_accounted_offset < t) {
+               add += t - last_accounted_offset;
+               if (!count_only) {
+                       nrg = get_file_region_entry_from_cache(
+                               resv, last_accounted_offset, t);
+                       record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+                       list_add(&nrg->link, rg->link.prev);
+                       coalesce_file_region(resv, nrg);
+               } else if (regions_needed)
+                       *regions_needed += 1;
+       }
+
+       VM_BUG_ON(add < 0);
+       return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+                                       int regions_needed)
+       __must_hold(&resv->lock)
+{
+       struct list_head allocated_regions;
+       int to_allocate = 0, i = 0;
+       struct file_region *trg = NULL, *rg = NULL;
+
+       VM_BUG_ON(regions_needed < 0);
+
+       INIT_LIST_HEAD(&allocated_regions);
+
+       /*
+        * Check for sufficient descriptors in the cache to accommodate
+        * the number of in progress add operations plus regions_needed.
+        *
+        * This is a while loop because when we drop the lock, some other call
+        * to region_add or region_del may have consumed some region_entries,
+        * so we keep looping here until we finally have enough entries for
+        * (adds_in_progress + regions_needed).
+        */
+       while (resv->region_cache_count <
+              (resv->adds_in_progress + regions_needed)) {
+               to_allocate = resv->adds_in_progress + regions_needed -
+                             resv->region_cache_count;
+
+               /* At this point, we should have enough entries in the cache
+                * for all the existings adds_in_progress. We should only be
+                * needing to allocate for regions_needed.
+                */
+               VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+               spin_unlock(&resv->lock);
+               for (i = 0; i < to_allocate; i++) {
+                       trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+                       if (!trg)
+                               goto out_of_memory;
+                       list_add(&trg->link, &allocated_regions);
+               }
+
+               spin_lock(&resv->lock);
+
+               list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
                        list_del(&rg->link);
-                       kfree(rg);
+                       list_add(&rg->link, &resv->region_cache);
+                       resv->region_cache_count++;
                }
        }
 
-       if (!count_only) {
-               nrg->from = f;
-               nrg->to = t;
-       }
+       return 0;
 
-       return chg;
+out_of_memory:
+       list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+               list_del(&rg->link);
+               kfree(rg);
+       }
+       return -ENOMEM;
 }
 
 /*
  * Add the huge page range represented by [f, t) to the reserve
- * map.  Existing regions will be expanded to accommodate the specified
- * range, or a region will be taken from the cache.  Sufficient regions
- * must exist in the cache due to the previous call to region_chg with
- * the same range.
+ * map.  Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del.  The extra needed entries will be allocated.
  *
- * Return the number of new huge pages added to the map.  This
- * number is greater than or equal to zero.
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map.  This number is greater
+ * than or equal to zero.  If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it ruturns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
  */
-static long region_add(struct resv_map *resv, long f, long t)
+static long region_add(struct resv_map *resv, long f, long t,
+                      long in_regions_needed, struct hstate *h,
+                      struct hugetlb_cgroup *h_cg)
 {
-       struct list_head *head = &resv->regions;
-       struct file_region *rg, *nrg;
-       long add = 0;
+       long add = 0, actual_regions_needed = 0;
 
        spin_lock(&resv->lock);
-       /* Locate the region we are either in or before. */
-       list_for_each_entry(rg, head, link)
-               if (f <= rg->to)
-                       break;
+retry:
+
+       /* Count how many regions are actually needed to execute this add. */
+       add_reservation_in_range(resv, f, t, NULL, NULL, &actual_regions_needed,
+                                true);
 
        /*
-        * If no region exists which can be expanded to include the
-        * specified range, pull a region descriptor from the cache
-        * and use it for this range.
+        * Check for sufficient descriptors in the cache to accommodate
+        * this add operation. Note that actual_regions_needed may be greater
+        * than in_regions_needed, as the resv_map may have been modified since
+        * the region_chg call. In this case, we need to make sure that we
+        * allocate extra entries, such that we have enough for all the
+        * existing adds_in_progress, plus the excess needed for this
+        * operation.
         */
-       if (&rg->link == head || t < rg->from) {
-               VM_BUG_ON(resv->region_cache_count <= 0);
-
-               resv->region_cache_count--;
-               nrg = list_first_entry(&resv->region_cache, struct file_region,
-                                       link);
-               list_del(&nrg->link);
+       if (actual_regions_needed > in_regions_needed &&
+           resv->region_cache_count <
+                   resv->adds_in_progress +
+                           (actual_regions_needed - in_regions_needed)) {
+               /* region_add operation of range 1 should never need to
+                * allocate file_region entries.
+                */
+               VM_BUG_ON(t - f <= 1);
 
-               nrg->from = f;
-               nrg->to = t;
-               list_add(&nrg->link, rg->link.prev);
+               if (allocate_file_region_entries(
+                           resv, actual_regions_needed - in_regions_needed)) {
+                       return -ENOMEM;
+               }
 
-               add += t - f;
-               goto out_locked;
+               goto retry;
        }
 
-       add = add_reservation_in_range(resv, f, t, false);
+       add = add_reservation_in_range(resv, f, t, h_cg, h, NULL, false);
+
+       resv->adds_in_progress -= in_regions_needed;
 
-out_locked:
-       resv->adds_in_progress--;
        spin_unlock(&resv->lock);
        VM_BUG_ON(add < 0);
        return add;
@@ -358,46 +533,37 @@ out_locked:
  * call to region_add that will actually modify the reserve
  * map to add the specified range [f, t).  region_chg does
  * not change the number of huge pages represented by the
- * map.  A new file_region structure is added to the cache
- * as a placeholder, so that the subsequent region_add
- * call will have all the regions it needs and will not fail.
+ * map.  A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
+ *
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress.  This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
  *
  * Returns the number of huge pages that need to be added to the existing
  * reservation map for the range [f, t).  This number is greater or equal to
  * zero.  -ENOMEM is returned if a new file_region structure or cache entry
  * is needed and can not be allocated.
  */
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+                      long *out_regions_needed)
 {
        long chg = 0;
 
        spin_lock(&resv->lock);
-retry_locked:
-       resv->adds_in_progress++;
 
-       /*
-        * Check for sufficient descriptors in the cache to accommodate
-        * the number of in progress add operations.
-        */
-       if (resv->adds_in_progress > resv->region_cache_count) {
-               struct file_region *trg;
+       /* Count how many hugepages in this range are NOT respresented. */
+       chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+                                      out_regions_needed, true);
 
-               VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
-               /* Must drop lock to allocate a new descriptor. */
-               resv->adds_in_progress--;
-               spin_unlock(&resv->lock);
+       if (*out_regions_needed == 0)
+               *out_regions_needed = 1;
 
-               trg = kmalloc(sizeof(*trg), GFP_KERNEL);
-               if (!trg)
-                       return -ENOMEM;
-
-               spin_lock(&resv->lock);
-               list_add(&trg->link, &resv->region_cache);
-               resv->region_cache_count++;
-               goto retry_locked;
-       }
+       if (allocate_file_region_entries(resv, *out_regions_needed))
+               return -ENOMEM;
 
-       chg = add_reservation_in_range(resv, f, t, true);
+       resv->adds_in_progress += *out_regions_needed;
 
        spin_unlock(&resv->lock);
        return chg;
@@ -408,17 +574,20 @@ retry_locked:
  * of the resv_map keeps track of the operations in progress between
  * calls to region_chg and region_add.  Operations are sometimes
  * aborted after the call to region_chg.  In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
  *
  * NOTE: The range arguments [f, t) are not needed or used in this
  * routine.  They are kept to make reading the calling code easier as
  * arguments will match the associated region_chg call.
  */
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+                        long regions_needed)
 {
        spin_lock(&resv->lock);
        VM_BUG_ON(!resv->region_cache_count);
-       resv->adds_in_progress--;
+       resv->adds_in_progress -= regions_needed;
        spin_unlock(&resv->lock);
 }
 
@@ -486,11 +655,17 @@ retry:
                        /* New entry for end of split region */
                        nrg->from = t;
                        nrg->to = rg->to;
+
+                       copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
                        INIT_LIST_HEAD(&nrg->link);
 
                        /* Original entry is trimmed */
                        rg->to = f;
 
+                       hugetlb_cgroup_uncharge_file_region(
+                               resv, rg, nrg->to - nrg->from);
+
                        list_add(&nrg->link, &rg->link);
                        nrg = NULL;
                        break;
@@ -498,6 +673,8 @@ retry:
 
                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
+                       hugetlb_cgroup_uncharge_file_region(resv, rg,
+                                                           rg->to - rg->from);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
@@ -506,9 +683,15 @@ retry:
                if (f <= rg->from) {    /* Trim beginning of region */
                        del += t - rg->from;
                        rg->from = t;
+
+                       hugetlb_cgroup_uncharge_file_region(resv, rg,
+                                                           t - rg->from);
                } else {                /* Trim end of region */
                        del += rg->to - f;
                        rg->to = f;
+
+                       hugetlb_cgroup_uncharge_file_region(resv, rg,
+                                                           rg->to - f);
                }
        }
 
@@ -650,6 +833,25 @@ static void set_vma_private_data(struct vm_area_struct *vma,
        vma->vm_private_data = (void *)value;
 }
 
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+                                         struct hugetlb_cgroup *h_cg,
+                                         struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (!h_cg || !h) {
+               resv_map->reservation_counter = NULL;
+               resv_map->pages_per_hpage = 0;
+               resv_map->css = NULL;
+       } else {
+               resv_map->reservation_counter =
+                       &h_cg->rsvd_hugepage[hstate_index(h)];
+               resv_map->pages_per_hpage = pages_per_huge_page(h);
+               resv_map->css = &h_cg->css;
+       }
+#endif
+}
+
 struct resv_map *resv_map_alloc(void)
 {
        struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -666,6 +868,13 @@ struct resv_map *resv_map_alloc(void)
        INIT_LIST_HEAD(&resv_map->regions);
 
        resv_map->adds_in_progress = 0;
+       /*
+        * Initialize these to 0. On shared mappings, 0's here indicate these
+        * fields don't do cgroup accounting. On private mappings, these will be
+        * re-initialized to the proper values, to indicate that hugetlb cgroup
+        * reservations are to be un-charged from here.
+        */
+       resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
 
        INIT_LIST_HEAD(&resv_map->region_cache);
        list_add(&rg->link, &resv_map->region_cache);
@@ -1009,6 +1218,9 @@ static void destroy_compound_gigantic_page(struct page *page,
        struct page *p = page + 1;
 
        atomic_set(compound_mapcount_ptr(page), 0);
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
+
        for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                clear_compound_head(p);
                set_page_refcounted(p);
@@ -1020,6 +1232,14 @@ static void destroy_compound_gigantic_page(struct page *page,
 
 static void free_gigantic_page(struct page *page, unsigned int order)
 {
+       /*
+        * If the page isn't allocated using the cma allocator,
+        * cma_release() returns false.
+        */
+       if (IS_ENABLED(CONFIG_CMA) &&
+           cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+               return;
+
        free_contig_range(page_to_pfn(page), 1 << order);
 }
 
@@ -1029,6 +1249,21 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
 {
        unsigned long nr_pages = 1UL << huge_page_order(h);
 
+       if (IS_ENABLED(CONFIG_CMA)) {
+               struct page *page;
+               int node;
+
+               for_each_node_mask(node, *nodemask) {
+                       if (!hugetlb_cma[node])
+                               continue;
+
+                       page = cma_alloc(hugetlb_cma[node], nr_pages,
+                                        huge_page_order(h), true);
+                       if (page)
+                               return page;
+               }
+       }
+
        return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
@@ -1069,11 +1304,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                1 << PG_writeback);
        }
        VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
        set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
        set_page_refcounted(page);
        if (hstate_is_gigantic(h)) {
+               /*
+                * Temporarily drop the hugetlb_lock, because
+                * we might block in free_gigantic_page().
+                */
+               spin_unlock(&hugetlb_lock);
                destroy_compound_gigantic_page(page, huge_page_order(h));
                free_gigantic_page(page, huge_page_order(h));
+               spin_lock(&hugetlb_lock);
        } else {
                __free_pages(page, huge_page_order(h));
        }
@@ -1180,6 +1422,8 @@ static void __free_huge_page(struct page *page)
        clear_page_huge_active(page);
        hugetlb_cgroup_uncharge_page(hstate_index(h),
                                     pages_per_huge_page(h), page);
+       hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+                                         pages_per_huge_page(h), page);
        if (restore_reserve)
                h->resv_huge_pages++;
 
@@ -1254,6 +1498,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        spin_lock(&hugetlb_lock);
        set_hugetlb_cgroup(page, NULL);
+       set_hugetlb_cgroup_rsvd(page, NULL);
        h->nr_huge_pages++;
        h->nr_huge_pages_node[nid]++;
        spin_unlock(&hugetlb_lock);
@@ -1287,6 +1532,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
+
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
 }
 
 /*
@@ -1313,7 +1561,107 @@ int PageHeadHuge(struct page *page_head)
        if (!PageHead(page_head))
                return 0;
 
-       return get_compound_page_dtor(page_head) == free_huge_page;
+       return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find address_space associated with hugetlbfs page.
+ * Upon entry page is locked and page 'was' mapped although mapped state
+ * could change.  If necessary, use anon_vma to find vma and associated
+ * address space.  The returned mapping may be stale, but it can not be
+ * invalid as page lock (which is held) is required to destroy mapping.
+ */
+static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
+{
+       struct anon_vma *anon_vma;
+       pgoff_t pgoff_start, pgoff_end;
+       struct anon_vma_chain *avc;
+       struct address_space *mapping = page_mapping(hpage);
+
+       /* Simple file based mapping */
+       if (mapping)
+               return mapping;
+
+       /*
+        * Even anonymous hugetlbfs mappings are associated with an
+        * underlying hugetlbfs file (see hugetlb_file_setup in mmap
+        * code).  Find a vma associated with the anonymous vma, and
+        * use the file pointer to get address_space.
+        */
+       anon_vma = page_lock_anon_vma_read(hpage);
+       if (!anon_vma)
+               return mapping;  /* NULL */
+
+       /* Use first found vma */
+       pgoff_start = page_to_pgoff(hpage);
+       pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                                       pgoff_start, pgoff_end) {
+               struct vm_area_struct *vma = avc->vma;
+
+               mapping = vma->vm_file->f_mapping;
+               break;
+       }
+
+       anon_vma_unlock_read(anon_vma);
+       return mapping;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which allows us to find the mapping
+ * even in the case of an anon page.  However, locking order dictates
+ * the i_mmap_rwsem be acquired BEFORE the page lock.  This is hugetlbfs
+ * specific.  So, we first try to lock the sema while still holding the
+ * page lock.  If this works, great!  If not, then we need to drop the
+ * page lock and then acquire i_mmap_rwsem and reacquire page lock.  Of
+ * course, need to revalidate state along the way.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+       struct address_space *mapping, *mapping2;
+
+       mapping = _get_hugetlb_page_mapping(hpage);
+retry:
+       if (!mapping)
+               return mapping;
+
+       /*
+        * If no contention, take lock and return
+        */
+       if (i_mmap_trylock_write(mapping))
+               return mapping;
+
+       /*
+        * Must drop page lock and wait on mapping sema.
+        * Note:  Once page lock is dropped, mapping could become invalid.
+        * As a hack, increase map count until we lock page again.
+        */
+       atomic_inc(&hpage->_mapcount);
+       unlock_page(hpage);
+       i_mmap_lock_write(mapping);
+       lock_page(hpage);
+       atomic_add_negative(-1, &hpage->_mapcount);
+
+       /* verify page is still mapped */
+       if (!page_mapped(hpage)) {
+               i_mmap_unlock_write(mapping);
+               return NULL;
+       }
+
+       /*
+        * Get address space again and verify it is the same one
+        * we locked.  If not, drop lock and retry.
+        */
+       mapping2 = _get_hugetlb_page_mapping(hpage);
+       if (mapping2 != mapping) {
+               i_mmap_unlock_write(mapping);
+               mapping = mapping2;
+               goto retry;
+       }
+
+       return mapping;
 }
 
 pgoff_t __basepage_index(struct page *page)
@@ -1695,6 +2043,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
  * of size 'delta'.
  */
 static int gather_surplus_pages(struct hstate *h, int delta)
+       __must_hold(&hugetlb_lock)
 {
        struct list_head surplus_list;
        struct page *page, *tmp;
@@ -1870,6 +2219,7 @@ static long __vma_reservation_common(struct hstate *h,
        struct resv_map *resv;
        pgoff_t idx;
        long ret;
+       long dummy_out_regions_needed;
 
        resv = vma_resv_map(vma);
        if (!resv)
@@ -1878,20 +2228,29 @@ static long __vma_reservation_common(struct hstate *h,
        idx = vma_hugecache_offset(h, vma, addr);
        switch (mode) {
        case VMA_NEEDS_RESV:
-               ret = region_chg(resv, idx, idx + 1);
+               ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+               /* We assume that vma_reservation_* routines always operate on
+                * 1 page, and that adding to resv map a 1 page entry can only
+                * ever require 1 region.
+                */
+               VM_BUG_ON(dummy_out_regions_needed != 1);
                break;
        case VMA_COMMIT_RESV:
-               ret = region_add(resv, idx, idx + 1);
+               ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+               /* region_add calls of range 1 should never fail. */
+               VM_BUG_ON(ret < 0);
                break;
        case VMA_END_RESV:
-               region_abort(resv, idx, idx + 1);
+               region_abort(resv, idx, idx + 1, 1);
                ret = 0;
                break;
        case VMA_ADD_RESV:
-               if (vma->vm_flags & VM_MAYSHARE)
-                       ret = region_add(resv, idx, idx + 1);
-               else {
-                       region_abort(resv, idx, idx + 1);
+               if (vma->vm_flags & VM_MAYSHARE) {
+                       ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+                       /* region_add calls of range 1 should never fail. */
+                       VM_BUG_ON(ret < 0);
+               } else {
+                       region_abort(resv, idx, idx + 1, 1);
                        ret = region_del(resv, idx, idx + 1);
                }
                break;
@@ -2002,6 +2361,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
        long gbl_chg;
        int ret, idx;
        struct hugetlb_cgroup *h_cg;
+       bool deferred_reserve;
 
        idx = hstate_index(h);
        /*
@@ -2039,9 +2399,19 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                        gbl_chg = 1;
        }
 
+       /* If this allocation is not consuming a reservation, charge it now.
+        */
+       deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+       if (deferred_reserve) {
+               ret = hugetlb_cgroup_charge_cgroup_rsvd(
+                       idx, pages_per_huge_page(h), &h_cg);
+               if (ret)
+                       goto out_subpool_put;
+       }
+
        ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
        if (ret)
-               goto out_subpool_put;
+               goto out_uncharge_cgroup_reservation;
 
        spin_lock(&hugetlb_lock);
        /*
@@ -2064,6 +2434,14 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
                /* Fall through */
        }
        hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+       /* If allocation is not consuming a reservation, also store the
+        * hugetlb_cgroup pointer on the page.
+        */
+       if (deferred_reserve) {
+               hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
+                                                 h_cg, page);
+       }
+
        spin_unlock(&hugetlb_lock);
 
        set_page_private(page, (unsigned long)spool);
@@ -2088,6 +2466,10 @@ struct page *alloc_huge_page(struct vm_area_struct *vma,
 
 out_uncharge_cgroup:
        hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_uncharge_cgroup_reservation:
+       if (deferred_reserve)
+               hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
+                                                   h_cg);
 out_subpool_put:
        if (map_chg || avoid_reserve)
                hugepage_subpool_put_pages(spool, 1);
@@ -2190,6 +2572,10 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
 
        for (i = 0; i < h->max_huge_pages; ++i) {
                if (hstate_is_gigantic(h)) {
+                       if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) {
+                               pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+                               break;
+                       }
                        if (!alloc_bootmem_huge_page(h))
                                break;
                } else if (!alloc_pool_huge_page(h,
@@ -2845,6 +3231,7 @@ static int __init hugetlb_init(void)
                        default_hstate.max_huge_pages = default_hstate_max_huge_pages;
        }
 
+       hugetlb_cma_check();
        hugetlb_init_hstates();
        gather_bootmem_prealloc();
        report_hugepages();
@@ -3188,9 +3575,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
        end = vma_hugecache_offset(h, vma, vma->vm_end);
 
        reserve = (end - start) - region_count(resv, start, end);
-
-       kref_put(&resv->refs, resv_map_release);
-
+       hugetlb_cgroup_uncharge_counter(resv, start, end);
        if (reserve) {
                /*
                 * Decrement reserve counts.  The global reserve count may be
@@ -3199,6 +3584,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
                gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
                hugetlb_acct_memory(h, -gbl_reserve);
        }
+
+       kref_put(&resv->refs, resv_map_release);
 }
 
 static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -3306,6 +3693,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
        int cow;
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
+       struct address_space *mapping = vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
 
@@ -3316,6 +3704,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                        vma->vm_start,
                                        vma->vm_end);
                mmu_notifier_invalidate_range_start(&range);
+       } else {
+               /*
+                * For shared mappings i_mmap_rwsem must be held to call
+                * huge_pte_alloc, otherwise the returned ptep could go
+                * away if part of a shared pmd and another thread calls
+                * huge_pmd_unshare.
+                */
+               i_mmap_lock_read(mapping);
        }
 
        for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
@@ -3393,6 +3789,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 
        if (cow)
                mmu_notifier_invalidate_range_end(&range);
+       else
+               i_mmap_unlock_read(mapping);
 
        return ret;
 }
@@ -3812,16 +4210,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        }
 
        /*
-        * Use page lock to guard against racing truncation
-        * before we get page_table_lock.
+        * We can not race with truncation due to holding i_mmap_rwsem.
+        * i_size is modified when holding i_mmap_rwsem, so check here
+        * once for faults beyond end of file.
         */
+       size = i_size_read(mapping->host) >> huge_page_shift(h);
+       if (idx >= size)
+               goto out;
+
 retry:
        page = find_lock_page(mapping, idx);
        if (!page) {
-               size = i_size_read(mapping->host) >> huge_page_shift(h);
-               if (idx >= size)
-                       goto out;
-
                /*
                 * Check for page in userfault range
                 */
@@ -3841,13 +4240,15 @@ retry:
                        };
 
                        /*
-                        * hugetlb_fault_mutex must be dropped before
-                        * handling userfault.  Reacquire after handling
-                        * fault to make calling code simpler.
+                        * hugetlb_fault_mutex and i_mmap_rwsem must be
+                        * dropped before handling userfault.  Reacquire
+                        * after handling fault to make calling code simpler.
                         */
                        hash = hugetlb_fault_mutex_hash(mapping, idx);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       i_mmap_unlock_read(mapping);
                        ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+                       i_mmap_lock_read(mapping);
                        mutex_lock(&hugetlb_fault_mutex_table[hash]);
                        goto out;
                }
@@ -3925,10 +4326,6 @@ retry:
        }
 
        ptl = huge_pte_lock(h, mm, ptep);
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto backout;
-
        ret = 0;
        if (!huge_pte_none(huge_ptep_get(ptep)))
                goto backout;
@@ -4012,6 +4409,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
        if (ptep) {
+               /*
+                * Since we hold no locks, ptep could be stale.  That is
+                * OK as we are only making decisions based on content and
+                * not actually modifying content here.
+                */
                entry = huge_ptep_get(ptep);
                if (unlikely(is_hugetlb_entry_migration(entry))) {
                        migration_entry_wait_huge(vma, mm, ptep);
@@ -4025,14 +4427,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                        return VM_FAULT_OOM;
        }
 
+       /*
+        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+        * until finished with ptep.  This serves two purposes:
+        * 1) It prevents huge_pmd_unshare from being called elsewhere
+        *    and making the ptep no longer valid.
+        * 2) It synchronizes us with i_size modifications during truncation.
+        *
+        * ptep could have already be assigned via huge_pte_offset.  That
+        * is OK, as huge_pte_alloc will return the same value unless
+        * something has changed.
+        */
        mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, haddr);
+       i_mmap_lock_read(mapping);
+       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       if (!ptep) {
+               i_mmap_unlock_read(mapping);
+               return VM_FAULT_OOM;
+       }
 
        /*
         * Serialize hugepage allocation and instantiation, so that we don't
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
+       idx = vma_hugecache_offset(h, vma, haddr);
        hash = hugetlb_fault_mutex_hash(mapping, idx);
        mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
@@ -4120,6 +4539,7 @@ out_ptl:
        }
 out_mutex:
        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
        /*
         * Generally it's safe to hold refcount during waiting page lock. But
         * here we just wait to defer the next page fault to avoid busy loop and
@@ -4266,7 +4686,7 @@ out_release_nounlock:
 long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         struct page **pages, struct vm_area_struct **vmas,
                         unsigned long *position, unsigned long *nr_pages,
-                        long i, unsigned int flags, int *nonblocking)
+                        long i, unsigned int flags, int *locked)
 {
        unsigned long pfn_offset;
        unsigned long vaddr = *position;
@@ -4337,14 +4757,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                spin_unlock(ptl);
                        if (flags & FOLL_WRITE)
                                fault_flags |= FAULT_FLAG_WRITE;
-                       if (nonblocking)
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+                       if (locked)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+                                       FAULT_FLAG_KILLABLE;
                        if (flags & FOLL_NOWAIT)
                                fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                        FAULT_FLAG_RETRY_NOWAIT;
                        if (flags & FOLL_TRIED) {
-                               VM_WARN_ON_ONCE(fault_flags &
-                                               FAULT_FLAG_ALLOW_RETRY);
+                               /*
+                                * Note: FAULT_FLAG_ALLOW_RETRY and
+                                * FAULT_FLAG_TRIED can co-exist
+                                */
                                fault_flags |= FAULT_FLAG_TRIED;
                        }
                        ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
@@ -4354,9 +4777,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                break;
                        }
                        if (ret & VM_FAULT_RETRY) {
-                               if (nonblocking &&
+                               if (locked &&
                                    !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-                                       *nonblocking = 0;
+                                       *locked = 0;
                                *nr_pages = 0;
                                /*
                                 * VM_FAULT_RETRY must not return an
@@ -4375,19 +4798,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
                page = pte_page(huge_ptep_get(pte));
 
-               /*
-                * Instead of doing 'try_get_page()' below in the same_page
-                * loop, just check the count once here.
-                */
-               if (unlikely(page_count(page) <= 0)) {
-                       if (pages) {
-                               spin_unlock(ptl);
-                               remainder = 0;
-                               err = -ENOMEM;
-                               break;
-                       }
-               }
-
                /*
                 * If subpage information not requested, update counters
                 * and skip the same_page loop below.
@@ -4405,7 +4815,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 same_page:
                if (pages) {
                        pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page(pages[i]);
+                       /*
+                        * try_grab_page() should always succeed here, because:
+                        * a) we hold the ptl lock, and b) we've just checked
+                        * that the huge page is present in the page tables. If
+                        * the huge page is present, then the tail pages must
+                        * also be present. The ptl prevents the head page and
+                        * tail pages from being rearranged in any way. So this
+                        * page must be available at this point, unless the page
+                        * refcount overflowed:
+                        */
+                       if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+                               spin_unlock(ptl);
+                               remainder = 0;
+                               err = -ENOMEM;
+                               break;
+                       }
                }
 
                if (vmas)
@@ -4541,11 +4966,12 @@ int hugetlb_reserve_pages(struct inode *inode,
                                        struct vm_area_struct *vma,
                                        vm_flags_t vm_flags)
 {
-       long ret, chg;
+       long ret, chg, add = -1;
        struct hstate *h = hstate_inode(inode);
        struct hugepage_subpool *spool = subpool_inode(inode);
        struct resv_map *resv_map;
-       long gbl_reserve;
+       struct hugetlb_cgroup *h_cg = NULL;
+       long gbl_reserve, regions_needed = 0;
 
        /* This should never happen */
        if (from > to) {
@@ -4575,9 +5001,10 @@ int hugetlb_reserve_pages(struct inode *inode,
                 */
                resv_map = inode_resv_map(inode);
 
-               chg = region_chg(resv_map, from, to);
+               chg = region_chg(resv_map, from, to, &regions_needed);
 
        } else {
+               /* Private mapping. */
                resv_map = resv_map_alloc();
                if (!resv_map)
                        return -ENOMEM;
@@ -4593,6 +5020,21 @@ int hugetlb_reserve_pages(struct inode *inode,
                goto out_err;
        }
 
+       ret = hugetlb_cgroup_charge_cgroup_rsvd(
+               hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
+
+       if (ret < 0) {
+               ret = -ENOMEM;
+               goto out_err;
+       }
+
+       if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+               /* For private mappings, the hugetlb_cgroup uncharge info hangs
+                * of the resv_map.
+                */
+               resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+       }
+
        /*
         * There must be enough pages in the subpool for the mapping. If
         * the subpool has a minimum size, there may be some global
@@ -4601,7 +5043,7 @@ int hugetlb_reserve_pages(struct inode *inode,
        gbl_reserve = hugepage_subpool_get_pages(spool, chg);
        if (gbl_reserve < 0) {
                ret = -ENOSPC;
-               goto out_err;
+               goto out_uncharge_cgroup;
        }
 
        /*
@@ -4610,9 +5052,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         */
        ret = hugetlb_acct_memory(h, gbl_reserve);
        if (ret < 0) {
-               /* put back original number of pages, chg */
-               (void)hugepage_subpool_put_pages(spool, chg);
-               goto out_err;
+               goto out_put_pages;
        }
 
        /*
@@ -4627,9 +5067,12 @@ int hugetlb_reserve_pages(struct inode *inode,
         * else has to be done for private mappings here
         */
        if (!vma || vma->vm_flags & VM_MAYSHARE) {
-               long add = region_add(resv_map, from, to);
+               add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
-               if (unlikely(chg > add)) {
+               if (unlikely(add < 0)) {
+                       hugetlb_acct_memory(h, -gbl_reserve);
+                       goto out_put_pages;
+               } else if (unlikely(chg > add)) {
                        /*
                         * pages in this range were added to the reserve
                         * map between region_chg and region_add.  This
@@ -4639,17 +5082,29 @@ int hugetlb_reserve_pages(struct inode *inode,
                         */
                        long rsv_adjust;
 
+                       hugetlb_cgroup_uncharge_cgroup_rsvd(
+                               hstate_index(h),
+                               (chg - add) * pages_per_huge_page(h), h_cg);
+
                        rsv_adjust = hugepage_subpool_put_pages(spool,
                                                                chg - add);
                        hugetlb_acct_memory(h, -rsv_adjust);
                }
        }
        return 0;
+out_put_pages:
+       /* put back original number of pages, chg */
+       (void)hugepage_subpool_put_pages(spool, chg);
+out_uncharge_cgroup:
+       hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
+                                           chg * pages_per_huge_page(h), h_cg);
 out_err:
        if (!vma || vma->vm_flags & VM_MAYSHARE)
-               /* Don't call region_abort if region_chg failed */
-               if (chg >= 0)
-                       region_abort(resv_map, from, to);
+               /* Only call region_abort if the region_chg succeeded but the
+                * region_add failed or didn't run.
+                */
+               if (chg >= 0 && add < 0)
+                       region_abort(resv_map, from, to, regions_needed);
        if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
                kref_put(&resv_map->refs, resv_map_release);
        return ret;
@@ -4740,7 +5195,7 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
 void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
                                unsigned long *start, unsigned long *end)
 {
-       unsigned long check_addr = *start;
+       unsigned long check_addr;
 
        if (!(vma->vm_flags & VM_MAYSHARE))
                return;
@@ -4765,10 +5220,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
  * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
  * and returns the corresponding pte. While this is not necessary for the
  * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space.  This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
  */
 pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 {
@@ -4785,7 +5242,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (!vma_shareable(vma, addr))
                return (pte_t *)pmd_alloc(mm, pud, addr);
 
-       i_mmap_lock_read(mapping);
        vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                if (svma == vma)
                        continue;
@@ -4815,7 +5271,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        spin_unlock(ptl);
 out:
        pte = (pte_t *)pmd_alloc(mm, pud, addr);
-       i_mmap_unlock_read(mapping);
        return pte;
 }
 
@@ -4826,7 +5281,7 @@ out:
  * indicated by page_count > 1, unmap is achieved by clearing pud and
  * decrementing the ref count. If count == 1, the pte page is not shared.
  *
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
  *
  * returns: 1 successfully unmapped a shared pte page
  *         0 the underlying pte page is not shared, or it is the last user
@@ -4965,6 +5420,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
        struct page *page = NULL;
        spinlock_t *ptl;
        pte_t pte;
+
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+                        (FOLL_PIN | FOLL_GET)))
+               return NULL;
+
 retry:
        ptl = pmd_lockptr(mm, pmd);
        spin_lock(ptl);
@@ -4977,8 +5438,18 @@ retry:
        pte = huge_ptep_get((pte_t *)pmd);
        if (pte_present(pte)) {
                page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-               if (flags & FOLL_GET)
-                       get_page(page);
+               /*
+                * try_grab_page() should always succeed here, because: a) we
+                * hold the pmd (ptl) lock, and b) we've just checked that the
+                * huge pmd (head) page is present in the page tables. The ptl
+                * prevents the head page and tail pages from being rearranged
+                * in any way. So this page must be available at this point,
+                * unless the page refcount overflowed:
+                */
+               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+                       page = NULL;
+                       goto out;
+               }
        } else {
                if (is_hugetlb_entry_migration(pte)) {
                        spin_unlock(ptl);
@@ -4999,7 +5470,7 @@ struct page * __weak
 follow_huge_pud(struct mm_struct *mm, unsigned long address,
                pud_t *pud, int flags)
 {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;
 
        return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5008,7 +5479,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 struct page * __weak
 follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
 {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                return NULL;
 
        return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -5073,3 +5544,74 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
                spin_unlock(&hugetlb_lock);
        }
 }
+
+#ifdef CONFIG_CMA
+static unsigned long hugetlb_cma_size __initdata;
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+       hugetlb_cma_size = memparse(p, &p);
+       return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+void __init hugetlb_cma_reserve(int order)
+{
+       unsigned long size, reserved, per_node;
+       int nid;
+
+       cma_reserve_called = true;
+
+       if (!hugetlb_cma_size)
+               return;
+
+       if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+               pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+                       (PAGE_SIZE << order) / SZ_1M);
+               return;
+       }
+
+       /*
+        * If 3 GB area is requested on a machine with 4 numa nodes,
+        * let's allocate 1 GB on first three nodes and ignore the last one.
+        */
+       per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+       pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+               hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+
+       reserved = 0;
+       for_each_node_state(nid, N_ONLINE) {
+               int res;
+
+               size = min(per_node, hugetlb_cma_size - reserved);
+               size = round_up(size, PAGE_SIZE << order);
+
+               res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+                                                0, false, "hugetlb",
+                                                &hugetlb_cma[nid], nid);
+               if (res) {
+                       pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+                               res, nid);
+                       continue;
+               }
+
+               reserved += size;
+               pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+                       size / SZ_1M, nid);
+
+               if (reserved >= hugetlb_cma_size)
+                       break;
+       }
+}
+
+void __init hugetlb_cma_check(void)
+{
+       if (!hugetlb_cma_size || cma_reserve_called)
+               return;
+
+       pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+#endif /* CONFIG_CMA */