diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 103f118..8ea35ba 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -30,6 +30,7 @@
 #include <linux/numa.h>
 #include <linux/llist.h>
 #include <linux/cma.h>
+#include <linux/migrate.h>
 
 #include <asm/page.h>
 #include <asm/pgalloc.h>
@@ -41,6 +42,7 @@
 #include <linux/node.h>
 #include <linux/page_owner.h>
 #include "internal.h"
+#include "hugetlb_vmemmap.h"
 
 int hugetlb_max_hstate __read_mostly;
 unsigned int default_hstate_idx;
@@ -1318,8 +1320,6 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
        return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
 }
 
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
 #else /* !CONFIG_CONTIG_ALLOC */
 static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
                                        int nid, nodemask_t *nodemask)
@@ -1375,7 +1375,40 @@ static void remove_hugetlb_page(struct hstate *h, struct page *page,
        h->nr_huge_pages_node[nid]--;
 }
 
-static void update_and_free_page(struct hstate *h, struct page *page)
+static void add_hugetlb_page(struct hstate *h, struct page *page,
+                            bool adjust_surplus)
+{
+       int zeroed;
+       int nid = page_to_nid(page);
+
+       VM_BUG_ON_PAGE(!HPageVmemmapOptimized(page), page);
+
+       lockdep_assert_held(&hugetlb_lock);
+
+       INIT_LIST_HEAD(&page->lru);
+       h->nr_huge_pages++;
+       h->nr_huge_pages_node[nid]++;
+
+       if (adjust_surplus) {
+               h->surplus_huge_pages++;
+               h->surplus_huge_pages_node[nid]++;
+       }
+
+       set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
+       set_page_private(page, 0);
+       SetHPageVmemmapOptimized(page);
+
+       /*
+        * This page is now managed by the hugetlb allocator and has
+        * no users -- drop the last reference.
+        */
+       zeroed = put_page_testzero(page);
+       VM_BUG_ON_PAGE(!zeroed, page);
+       arch_clear_hugepage_flags(page);
+       enqueue_huge_page(h, page);
+}
+
+static void __update_and_free_page(struct hstate *h, struct page *page)
 {
        int i;
        struct page *subpage = page;
@@ -1383,6 +1416,18 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
                return;
 
+       if (alloc_huge_page_vmemmap(h, page)) {
+               spin_lock_irq(&hugetlb_lock);
+               /*
+                * If we cannot allocate vmemmap pages, just refuse to free the
+                * page, put it back on the hugetlb free list, and treat it as
+                * a surplus page.
+                */
+               add_hugetlb_page(h, page, true);
+               spin_unlock_irq(&hugetlb_lock);
+               return;
+       }
+
        for (i = 0; i < pages_per_huge_page(h);
             i++, subpage = mem_map_next(subpage, page, i)) {
                subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
@@ -1398,12 +1443,79 @@ static void update_and_free_page(struct hstate *h, struct page *page)
        }
 }
 
+/*
+ * update_and_free_page() can be called from any context, so we cannot use
+ * GFP_KERNEL to allocate vmemmap pages. However, we can defer the actual
+ * freeing to a workqueue to avoid having to use GFP_ATOMIC to allocate the
+ * vmemmap pages.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to be
+ * freed and frees them one-by-one. As the page->mapping pointer is going
+ * to be cleared in free_hpage_workfn() anyway, it is reused as the llist_node
+ * structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+       struct llist_node *node;
+
+       node = llist_del_all(&hpage_freelist);
+
+       while (node) {
+               struct page *page;
+               struct hstate *h;
+
+               page = container_of((struct address_space **)node,
+                                    struct page, mapping);
+               node = node->next;
+               page->mapping = NULL;
+               /*
+                * The VM_BUG_ON_PAGE(!PageHuge(page), page) in page_hstate()
+                * is going to trigger because a previous call to
+                * remove_hugetlb_page() will set_compound_page_dtor(page,
+                * NULL_COMPOUND_DTOR), so do not use page_hstate() directly.
+                */
+               h = size_to_hstate(page_size(page));
+
+               __update_and_free_page(h, page);
+
+               cond_resched();
+       }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+static inline void flush_free_hpage_work(struct hstate *h)
+{
+       if (free_vmemmap_pages_per_hpage(h))
+               flush_work(&free_hpage_work);
+}
+
+static void update_and_free_page(struct hstate *h, struct page *page,
+                                bool atomic)
+{
+       if (!HPageVmemmapOptimized(page) || !atomic) {
+               __update_and_free_page(h, page);
+               return;
+       }
+
+       /*
+        * Defer freeing to avoid using GFP_ATOMIC to allocate vmemmap pages.
+        *
+        * Only call schedule_work() if hpage_freelist was previously
+        * empty. Otherwise, schedule_work() has already been called but the
+        * workfn hasn't retrieved the list yet.
+        */
+       if (llist_add((struct llist_node *)&page->mapping, &hpage_freelist))
+               schedule_work(&free_hpage_work);
+}
+
 static void update_and_free_pages_bulk(struct hstate *h, struct list_head *list)
 {
        struct page *page, *t_page;
 
        list_for_each_entry_safe(page, t_page, list, lru) {
-               update_and_free_page(h, page);
+               update_and_free_page(h, page, false);
                cond_resched();
        }
 }
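
For reference, the page->mapping reuse above works because struct llist_node is a single pointer that fits in the mapping slot. A minimal sketch of the two conversions as hypothetical helpers (illustration only, not code from this patch; assumes the mm/hugetlb.c context where <linux/llist.h> and <linux/mm_types.h> are already available):

static inline struct llist_node *hpage_freelist_node(struct page *page)
{
        /* llist_node is a single 'next' pointer, so it fits in page->mapping. */
        BUILD_BUG_ON(sizeof(struct llist_node) > sizeof(page->mapping));
        return (struct llist_node *)&page->mapping;
}

static inline struct page *hpage_from_freelist_node(struct llist_node *node)
{
        /* Inverse of the cast above; matches what free_hpage_workfn() does. */
        return container_of((struct address_space **)node, struct page, mapping);
}
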
@@ -1470,12 +1582,12 @@ void free_huge_page(struct page *page)
        if (HPageTemporary(page)) {
                remove_hugetlb_page(h, page, false);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
-               update_and_free_page(h, page);
+               update_and_free_page(h, page, true);
        } else if (h->surplus_huge_pages_node[nid]) {
                /* remove the page from active list */
                remove_hugetlb_page(h, page, true);
                spin_unlock_irqrestore(&hugetlb_lock, flags);
-               update_and_free_page(h, page);
+               update_and_free_page(h, page, true);
        } else {
                arch_clear_hugepage_flags(page);
                enqueue_huge_page(h, page);
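
A summary of the new 'atomic' calling convention, drawn from the call sites in this patch (annotation only, not code added by the patch):

/*
 * update_and_free_page(h, page, atomic):
 *
 *  atomic == true   free_huge_page() -- it can run in any context, so the
 *                   vmemmap allocation is deferred to free_hpage_work.
 *  atomic == false  update_and_free_pages_bulk(), dissolve_free_huge_page()
 *                   and alloc_and_dissolve_huge_page() -- sleepable contexts
 *                   that allocate the vmemmap synchronously.  In addition,
 *                   set_max_huge_pages() calls flush_free_hpage_work() so
 *                   deferred frees finish before the counters are examined.
 */
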
@@ -1493,8 +1605,9 @@ static void __prep_account_new_huge_page(struct hstate *h, int nid)
        h->nr_huge_pages_node[nid]++;
 }
 
-static void __prep_new_huge_page(struct page *page)
+static void __prep_new_huge_page(struct hstate *h, struct page *page)
 {
+       free_huge_page_vmemmap(h, page);
        INIT_LIST_HEAD(&page->lru);
        set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
        hugetlb_set_page_subpool(page, NULL);
@@ -1504,15 +1617,15 @@ static void __prep_new_huge_page(struct page *page)
 
 static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
 {
-       __prep_new_huge_page(page);
+       __prep_new_huge_page(h, page);
        spin_lock_irq(&hugetlb_lock);
        __prep_account_new_huge_page(h, nid);
        spin_unlock_irq(&hugetlb_lock);
 }
 
-static void prep_compound_gigantic_page(struct page *page, unsigned int order)
+static bool prep_compound_gigantic_page(struct page *page, unsigned int order)
 {
-       int i;
+       int i, j;
        int nr_pages = 1 << order;
        struct page *p = page + 1;
 
@@ -1534,11 +1647,48 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                 * after get_user_pages().
                 */
                __ClearPageReserved(p);
+               /*
+                * Subtle and very unlikely
+                *
+                * Gigantic 'page allocators' such as memblock or cma will
+                * return a set of pages with each page ref counted.  We need
+                * to turn this set of pages into a compound page with tail
+                * page ref counts set to zero.  Code such as speculative page
+                * cache adding could take a ref on a 'to be' tail page.
+                * We need to respect any increased ref count, and only set
+                * the ref count to zero if count is currently 1.  If count
+                * is not 1, we call synchronize_rcu() in the hope that an RCU
+                * grace period will cause the ref count to drop; we then retry.
+                * If count is still inflated on retry we return an error and
+                * must discard the pages.
+                */
+               if (!page_ref_freeze(p, 1)) {
+                       pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n");
+                       synchronize_rcu();
+                       if (!page_ref_freeze(p, 1))
+                               goto out_error;
+               }
                set_page_count(p, 0);
                set_compound_head(p, page);
        }
        atomic_set(compound_mapcount_ptr(page), -1);
        atomic_set(compound_pincount_ptr(page), 0);
+       return true;
+
+out_error:
+       /* undo tail page modifications made above */
+       p = page + 1;
+       for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) {
+               clear_compound_head(p);
+               set_page_refcounted(p);
+       }
+       /* need to clear PG_reserved on remaining tail pages  */
+       for (; j < nr_pages; j++, p = mem_map_next(p, page, j))
+               __ClearPageReserved(p);
+       set_compound_order(page, 0);
+       page[1].compound_nr = 0;
+       __ClearPageHead(page);
+       return false;
 }
 
 /*
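
The retry above relies on the exact semantics of page_ref_freeze(p, 1): it atomically replaces a refcount of exactly 1 with 0 and returns non-zero only in that case. The same freeze-or-retry idiom, factored into a hypothetical helper as a minimal sketch (not part of the patch):

static bool freeze_fresh_tail_page(struct page *p)
{
        if (page_ref_freeze(p, 1))
                return true;            /* refcount went 1 -> 0; the page is ours */
        /* A speculative reference exists; wait for an RCU grace period. */
        synchronize_rcu();
        return page_ref_freeze(p, 1);   /* one retry; caller discards on failure */
}
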
@@ -1658,7 +1808,9 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
                nodemask_t *node_alloc_noretry)
 {
        struct page *page;
+       bool retry = false;
 
+retry:
        if (hstate_is_gigantic(h))
                page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
        else
@@ -1667,8 +1819,21 @@ static struct page *alloc_fresh_huge_page(struct hstate *h,
        if (!page)
                return NULL;
 
-       if (hstate_is_gigantic(h))
-               prep_compound_gigantic_page(page, huge_page_order(h));
+       if (hstate_is_gigantic(h)) {
+               if (!prep_compound_gigantic_page(page, huge_page_order(h))) {
+                       /*
+                        * Rare failure to convert pages to compound page.
+                        * Free pages and try again - ONCE!
+                        */
+                       free_gigantic_page(page, huge_page_order(h));
+                       if (!retry) {
+                               retry = true;
+                               goto retry;
+                       }
+                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+                       return NULL;
+               }
+       }
        prep_new_huge_page(h, page, page_to_nid(page));
 
        return page;
@@ -1737,10 +1902,14 @@ static struct page *remove_pool_huge_page(struct hstate *h,
  * nothing for in-use hugepages and non-hugepages.
  * This function returns values like below:
  *
- *  -EBUSY: failed to dissolved free hugepages or the hugepage is in-use
- *          (allocated or reserved.)
- *       0: successfully dissolved free hugepages or the page is not a
- *          hugepage (considered as already dissolved)
+ *  -ENOMEM: failed to allocate the vmemmap pages needed to free the hugepage
+ *           when the system is under memory pressure and the feature of
+ *           freeing unused vmemmap pages associated with each hugetlb page
+ *           is enabled.
+ *  -EBUSY:  failed to dissolve free hugepages or the hugepage is in-use
+ *           (allocated or reserved).
+ *       0:  successfully dissolved free hugepages or the page is not a
+ *           hugepage (considered as already dissolved)
  */
 int dissolve_free_huge_page(struct page *page)
 {
@@ -1782,19 +1951,38 @@ retry:
                        goto retry;
                }
 
-               /*
-                * Move PageHWPoison flag from head page to the raw error page,
-                * which makes any subpages rather than the error page reusable.
-                */
-               if (PageHWPoison(head) && page != head) {
-                       SetPageHWPoison(page);
-                       ClearPageHWPoison(head);
-               }
                remove_hugetlb_page(h, head, false);
                h->max_huge_pages--;
                spin_unlock_irq(&hugetlb_lock);
-               update_and_free_page(h, head);
-               return 0;
+
+               /*
+                * Normally update_and_free_page will allocate required vmemmap
+                * before freeing the page.  update_and_free_page will fail to
+                * free the page if it cannot allocate required vmemmap.  We
+                * need to adjust max_huge_pages if the page is not freed.
+                * Attempt to allocate vmemmap here so that we can take
+                * appropriate action on failure.
+                */
+               rc = alloc_huge_page_vmemmap(h, head);
+               if (!rc) {
+                       /*
+                        * Move PageHWPoison flag from head page to the raw
+                        * error page, which makes any subpages rather than
+                        * the error page reusable.
+                        */
+                       if (PageHWPoison(head) && page != head) {
+                               SetPageHWPoison(page);
+                               ClearPageHWPoison(head);
+                       }
+                       update_and_free_page(h, head, false);
+               } else {
+                       spin_lock_irq(&hugetlb_lock);
+                       add_hugetlb_page(h, head, false);
+                       h->max_huge_pages++;
+                       spin_unlock_irq(&hugetlb_lock);
+               }
+
+               return rc;
        }
 out:
        spin_unlock_irq(&hugetlb_lock);
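
With the new -ENOMEM case, callers can treat a vmemmap allocation failure as transient: the page is put back on the free list and max_huge_pages is restored, so the operation can simply be retried later. A hypothetical caller sketch (names are illustrative, not from this patch):

static int try_dissolve_one(struct page *page)
{
        int rc = dissolve_free_huge_page(page);

        if (rc == -ENOMEM)
                return rc;      /* vmemmap not restored; hugepage kept, retry later */
        if (rc == -EBUSY)
                return rc;      /* allocated or reserved; cannot be dissolved */
        return 0;               /* dissolved, or not a hugepage at all */
}
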
@@ -2288,7 +2476,7 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
                if (!rc) {
                        /*
                         * This indicates there is an entry in the reserve map
-                        * added by alloc_huge_page.  We know it was added
+                        * not added by alloc_huge_page.  We know it was added
                         * before the alloc_huge_page call, otherwise
                         * HPageRestoreReserve would be set on the page.
                         * Remove the entry so that a subsequent allocation
@@ -2351,14 +2539,15 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page,
 
        /*
         * Before dissolving the page, we need to allocate a new one for the
-        * pool to remain stable. Using alloc_buddy_huge_page() allows us to
-        * not having to deal with prep_new_huge_page() and avoids dealing of any
-        * counters. This simplifies and let us do the whole thing under the
-        * lock.
+        * pool to remain stable.  Here, we allocate the page and 'prep' it
+        * by doing everything but actually updating counters and adding to
+        * the pool.  This simplifies and lets us do most of the processing
+        * under the lock.
         */
        new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL);
        if (!new_page)
                return -ENOMEM;
+       __prep_new_huge_page(h, new_page);
 
 retry:
        spin_lock_irq(&hugetlb_lock);
@@ -2397,14 +2586,9 @@ retry:
                remove_hugetlb_page(h, old_page, false);
 
                /*
-                * new_page needs to be initialized with the standard hugetlb
-                * state. This is normally done by prep_new_huge_page() but
-                * that takes hugetlb_lock which is already held so we need to
-                * open code it here.
                 * Reference count trick is needed because allocator gives us
                 * referenced page but the pool requires pages with 0 refcount.
                 */
-               __prep_new_huge_page(new_page);
                __prep_account_new_huge_page(h, nid);
                page_ref_dec(new_page);
                enqueue_huge_page(h, new_page);
@@ -2413,14 +2597,14 @@ retry:
                 * Pages have been replaced, we can safely free the old one.
                 */
                spin_unlock_irq(&hugetlb_lock);
-               update_and_free_page(h, old_page);
+               update_and_free_page(h, old_page, false);
        }
 
        return ret;
 
 free_new:
        spin_unlock_irq(&hugetlb_lock);
-       __free_pages(new_page, huge_page_order(h));
+       update_and_free_page(h, new_page, false);
 
        return ret;
 }
@@ -2625,16 +2809,10 @@ found:
        return 1;
 }
 
-static void __init prep_compound_huge_page(struct page *page,
-               unsigned int order)
-{
-       if (unlikely(order > (MAX_ORDER - 1)))
-               prep_compound_gigantic_page(page, order);
-       else
-               prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order >= MAX_ORDER) pages.
+ */
 static void __init gather_bootmem_prealloc(void)
 {
        struct huge_bootmem_page *m;
@@ -2643,20 +2821,23 @@ static void __init gather_bootmem_prealloc(void)
                struct page *page = virt_to_page(m);
                struct hstate *h = m->hstate;
 
+               VM_BUG_ON(!hstate_is_gigantic(h));
                WARN_ON(page_count(page) != 1);
-               prep_compound_huge_page(page, huge_page_order(h));
-               WARN_ON(PageReserved(page));
-               prep_new_huge_page(h, page, page_to_nid(page));
-               put_page(page); /* free it into the hugepage allocator */
+               if (prep_compound_gigantic_page(page, huge_page_order(h))) {
+                       WARN_ON(PageReserved(page));
+                       prep_new_huge_page(h, page, page_to_nid(page));
+                       put_page(page); /* add to the hugepage allocator */
+               } else {
+                       free_gigantic_page(page, huge_page_order(h));
+                       pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
+               }
 
                /*
-                * If we had gigantic hugepages allocated at boot time, we need
-                * to restore the 'stolen' pages to totalram_pages in order to
-                * fix confusing memory reports from free(1) and another
-                * side-effects, like CommitLimit going negative.
+                * We need to restore the 'stolen' pages to totalram_pages
+                * in order to fix confusing memory reports from free(1) and
+                * other side-effects, like CommitLimit going negative.
                 */
-               if (hstate_is_gigantic(h))
-                       adjust_managed_page_count(page, pages_per_huge_page(h));
+               adjust_managed_page_count(page, pages_per_huge_page(h));
                cond_resched();
        }
 }
@@ -2834,6 +3015,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
         * pages in hstate via the proc/sysfs interfaces.
         */
        mutex_lock(&h->resize_lock);
+       flush_free_hpage_work(h);
        spin_lock_irq(&hugetlb_lock);
 
        /*
@@ -2943,6 +3125,7 @@ static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
        /* free the pages after dropping lock */
        spin_unlock_irq(&hugetlb_lock);
        update_and_free_pages_bulk(h, &page_list);
+       flush_free_hpage_work(h);
        spin_lock_irq(&hugetlb_lock);
 
        while (count < persistent_huge_pages(h)) {
@@ -3450,6 +3633,7 @@ void __init hugetlb_add_hstate(unsigned int order)
        h->next_nid_to_free = first_memory_node;
        snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB",
                                        huge_page_size(h)/1024);
+       hugetlb_vmemmap_init(h);
 
        parsed_hstate = h;
 }
@@ -3924,6 +4108,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
                                int writable)
 {
        pte_t entry;
+       unsigned int shift = huge_page_shift(hstate_vma(vma));
 
        if (writable) {
                entry = huge_pte_mkwrite(huge_pte_mkdirty(mk_huge_pte(page,
@@ -3934,7 +4119,7 @@ static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page,
        }
        entry = pte_mkyoung(entry);
        entry = pte_mkhuge(entry);
-       entry = arch_make_huge_pte(entry, vma, page, writable);
+       entry = arch_make_huge_pte(entry, shift, vma->vm_flags);
 
        return entry;
 }
@@ -4057,12 +4242,13 @@ again:
                                    is_hugetlb_entry_hwpoisoned(entry))) {
                        swp_entry_t swp_entry = pte_to_swp_entry(entry);
 
-                       if (is_write_migration_entry(swp_entry) && cow) {
+                       if (is_writable_migration_entry(swp_entry) && cow) {
                                /*
                                 * COW mappings require pages in both
                                 * parent and child to be set to read.
                                 */
-                               make_migration_entry_read(&swp_entry);
+                               swp_entry = make_readable_migration_entry(
+                                                       swp_offset(swp_entry));
                                entry = swp_entry_to_pte(swp_entry);
                                set_huge_swap_pte_at(src, addr, src_pte,
                                                     entry, sz);
@@ -4474,7 +4660,9 @@ retry_avoidcopy:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(&range);
 out_release_all:
-       restore_reserve_on_error(h, vma, haddr, new_page);
+       /* No restore in case of successful pagetable update (Break COW) */
+       if (new_page != old_page)
+               restore_reserve_on_error(h, vma, haddr, new_page);
        put_page(new_page);
 out_release_old:
        put_page(old_page);
@@ -4590,7 +4778,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
        pte_t new_pte;
        spinlock_t *ptl;
        unsigned long haddr = address & huge_page_mask(h);
-       bool new_page = false;
+       bool new_page, new_pagecache_page = false;
 
        /*
         * Currently, we are forced to kill the process in the event the
@@ -4613,6 +4801,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
                goto out;
 
 retry:
+       new_page = false;
        page = find_lock_page(mapping, idx);
        if (!page) {
                /* Check for page in userfault range */
@@ -4656,6 +4845,7 @@ retry:
                                        goto retry;
                                goto out;
                        }
+                       new_pagecache_page = true;
                } else {
                        lock_page(page);
                        if (unlikely(anon_vma_prepare(vma))) {
@@ -4740,7 +4930,9 @@ backout:
        spin_unlock(ptl);
 backout_unlocked:
        unlock_page(page);
-       restore_reserve_on_error(h, vma, haddr, page);
+       /* restore reserve for newly allocated pages not in page cache */
+       if (new_page && !new_pagecache_page)
+               restore_reserve_on_error(h, vma, haddr, page);
        put_page(page);
        goto out;
 }
@@ -4939,19 +5131,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                            struct page **pagep)
 {
        bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
-       struct address_space *mapping;
-       pgoff_t idx;
+       struct hstate *h = hstate_vma(dst_vma);
+       struct address_space *mapping = dst_vma->vm_file->f_mapping;
+       pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
        unsigned long size;
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
-       struct hstate *h = hstate_vma(dst_vma);
        pte_t _dst_pte;
        spinlock_t *ptl;
-       int ret;
+       int ret = -ENOMEM;
        struct page *page;
        int writable;
-
-       mapping = dst_vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+       bool new_pagecache_page = false;
 
        if (is_continue) {
                ret = -EFAULT;
@@ -4981,12 +5171,44 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
+                       /* Free the allocated page which may have
+                        * consumed a reservation.
+                        */
+                       restore_reserve_on_error(h, dst_vma, dst_addr, page);
+                       put_page(page);
+
+                       /* Allocate a temporary page to hold the copied
+                        * contents.
+                        */
+                       page = alloc_huge_page_vma(h, dst_vma, dst_addr);
+                       if (!page) {
+                               ret = -ENOMEM;
+                               goto out;
+                       }
                        *pagep = page;
-                       /* don't free the page */
+                       /* Set the outparam pagep and return to the caller to
+                        * copy the contents outside the lock. Don't free the
+                        * page.
+                        */
                        goto out;
                }
        } else {
-               page = *pagep;
+               if (vm_shared &&
+                   hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+                       put_page(*pagep);
+                       ret = -EEXIST;
+                       *pagep = NULL;
+                       goto out;
+               }
+
+               page = alloc_huge_page(dst_vma, dst_addr, 0);
+               if (IS_ERR(page)) {
+                       ret = -ENOMEM;
+                       *pagep = NULL;
+                       goto out;
+               }
+               copy_huge_page(page, *pagep);
+               put_page(*pagep);
                *pagep = NULL;
        }
 
@@ -5013,6 +5235,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
                ret = huge_add_to_page_cache(page, mapping, idx);
                if (ret)
                        goto out_release_nounlock;
+               new_pagecache_page = true;
        }
 
        ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
@@ -5076,7 +5299,8 @@ out_release_unlock:
        if (vm_shared || is_continue)
                unlock_page(page);
 out_release_nounlock:
-       restore_reserve_on_error(h, dst_vma, dst_addr, page);
+       if (!new_pagecache_page)
+               restore_reserve_on_error(h, dst_vma, dst_addr, page);
        put_page(page);
        goto out;
 }
@@ -5225,8 +5449,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        continue;
                }
 
-               refs = min3(pages_per_huge_page(h) - pfn_offset,
-                           (vma->vm_end - vaddr) >> PAGE_SHIFT, remainder);
+               /* vaddr may not be aligned to PAGE_SIZE */
+               refs = min3(pages_per_huge_page(h) - pfn_offset, remainder,
+                   (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT);
 
                if (pages || vmas)
                        record_subpages_vmas(mem_map_offset(page, pfn_offset),
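
A worked example of why the alignment matters (illustrative numbers, assuming 4K base pages): with vaddr = 0x200400 and vma->vm_end = 0x203000, the subpages recorded start at the page containing vaddr (0x200000), and three of them (0x200000, 0x201000, 0x202000) lie below vm_end:

        (vm_end - vaddr) >> PAGE_SHIFT                        = 0x2c00 >> 12 = 2
        (vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT = 0x3000 >> 12 = 3

The unaligned form truncates and can undercount by one the number of subpages actually covered starting from pfn_offset.
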
@@ -5318,10 +5543,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                if (unlikely(is_hugetlb_entry_migration(pte))) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
 
-                       if (is_write_migration_entry(entry)) {
+                       if (is_writable_migration_entry(entry)) {
                                pte_t newpte;
 
-                               make_migration_entry_read(&entry);
+                               entry = make_readable_migration_entry(
+                                                       swp_offset(entry));
                                newpte = swp_entry_to_pte(entry);
                                set_huge_swap_pte_at(mm, address, ptep,
                                                     newpte, huge_page_size(h));
@@ -5332,10 +5558,11 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
                }
                if (!huge_pte_none(pte)) {
                        pte_t old_pte;
+                       unsigned int shift = huge_page_shift(hstate_vma(vma));
 
                        old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
                        pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
-                       pte = arch_make_huge_pte(pte, vma, NULL, 0);
+                       pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
                        huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
                        pages++;
                }