[linux-2.6-microblaze.git]
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 11fe0b4..78c84be 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -522,7 +522,7 @@ void prep_transhuge_page(struct page *page)
 bool is_transparent_hugepage(struct page *page)
 {
        if (!PageCompound(page))
-               return 0;
+               return false;
 
        page = compound_head(page);
        return is_huge_zero_page(page) ||
@@ -587,19 +587,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        struct page *page, gfp_t gfp)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct mem_cgroup *memcg;
        pgtable_t pgtable;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
        vm_fault_t ret = 0;
 
        VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-       if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+       if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
                put_page(page);
                count_vm_event(THP_FAULT_FALLBACK);
                count_vm_event(THP_FAULT_FALLBACK_CHARGE);
                return VM_FAULT_FALLBACK;
        }
+       cgroup_throttle_swaprate(page, gfp);
 
        pgtable = pte_alloc_one(vma->vm_mm);
        if (unlikely(!pgtable)) {
@@ -630,7 +630,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                        vm_fault_t ret2;
 
                        spin_unlock(vmf->ptl);
-                       mem_cgroup_cancel_charge(page, memcg, true);
                        put_page(page);
                        pte_free(vma->vm_mm, pgtable);
                        ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -641,7 +640,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                entry = mk_huge_pmd(page, vma->vm_page_prot);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                page_add_new_anon_rmap(page, vma, haddr, true);
-               mem_cgroup_commit_charge(page, memcg, false, true);
                lru_cache_add_active_or_unevictable(page, vma);
                pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -649,7 +647,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
                mm_inc_nr_ptes(vma->vm_mm);
                spin_unlock(vmf->ptl);
                count_vm_event(THP_FAULT_ALLOC);
-               count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
+               count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
        }
 
        return 0;
@@ -658,7 +656,6 @@ unlock_release:
 release:
        if (pgtable)
                pte_free(vma->vm_mm, pgtable);
-       mem_cgroup_cancel_charge(page, memcg, true);
        put_page(page);
        return ret;
 
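Taken together, the hunks above switch __do_huge_pmd_anonymous_page() from the old three-step memcg protocol (mem_cgroup_try_charge_delay(), then mem_cgroup_commit_charge() on success or mem_cgroup_cancel_charge() on every error path) to a single up-front mem_cgroup_charge(), with cgroup_throttle_swaprate() supplying the swap throttling that the _delay variant used to fold in. A minimal sketch of the resulting fault-path shape, using only calls visible in the diff and with error unwinding trimmed:

	if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
		/* charge failed: drop the page and fall back to small pages */
		put_page(page);
		count_vm_event(THP_FAULT_FALLBACK);
		count_vm_event(THP_FAULT_FALLBACK_CHARGE);
		return VM_FAULT_FALLBACK;
	}
	cgroup_throttle_swaprate(page, gfp);

	/*
	 * Map the page. The charge now travels with the page itself, so the
	 * commit/cancel bookkeeping (and the local memcg pointer) disappears
	 * from every exit path, including the userfault and OOM unwinds.
	 */
	page_add_new_anon_rmap(page, vma, haddr, true);
	lru_cache_add_active_or_unevictable(page, vma);

Per-memcg THP_FAULT_ALLOC accounting correspondingly moves from count_memcg_events(memcg, ...) to count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC), since no memcg pointer remains in scope.
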
@@ -1255,263 +1252,63 @@ unlock:
        spin_unlock(vmf->ptl);
 }
 
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
-                       pmd_t orig_pmd, struct page *page)
-{
-       struct vm_area_struct *vma = vmf->vma;
-       unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       struct mem_cgroup *memcg;
-       pgtable_t pgtable;
-       pmd_t _pmd;
-       int i;
-       vm_fault_t ret = 0;
-       struct page **pages;
-       struct mmu_notifier_range range;
-
-       pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
-                             GFP_KERNEL);
-       if (unlikely(!pages)) {
-               ret |= VM_FAULT_OOM;
-               goto out;
-       }
-
-       for (i = 0; i < HPAGE_PMD_NR; i++) {
-               pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
-                                              vmf->address, page_to_nid(page));
-               if (unlikely(!pages[i] ||
-                            mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
-                                    GFP_KERNEL, &memcg, false))) {
-                       if (pages[i])
-                               put_page(pages[i]);
-                       while (--i >= 0) {
-                               memcg = (void *)page_private(pages[i]);
-                               set_page_private(pages[i], 0);
-                               mem_cgroup_cancel_charge(pages[i], memcg,
-                                               false);
-                               put_page(pages[i]);
-                       }
-                       kfree(pages);
-                       ret |= VM_FAULT_OOM;
-                       goto out;
-               }
-               set_page_private(pages[i], (unsigned long)memcg);
-       }
-
-       for (i = 0; i < HPAGE_PMD_NR; i++) {
-               copy_user_highpage(pages[i], page + i,
-                                  haddr + PAGE_SIZE * i, vma);
-               __SetPageUptodate(pages[i]);
-               cond_resched();
-       }
-
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
-       mmu_notifier_invalidate_range_start(&range);
-
-       vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-               goto out_free_pages;
-       VM_BUG_ON_PAGE(!PageHead(page), page);
-
-       /*
-        * Leave pmd empty until pte is filled note we must notify here as
-        * concurrent CPU thread might write to new page before the call to
-        * mmu_notifier_invalidate_range_end() happens which can lead to a
-        * device seeing memory write in different order than CPU.
-        *
-        * See Documentation/vm/mmu_notifier.rst
-        */
-       pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
-       pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
-       pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
-       for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
-               pte_t entry;
-               entry = mk_pte(pages[i], vma->vm_page_prot);
-               entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               memcg = (void *)page_private(pages[i]);
-               set_page_private(pages[i], 0);
-               page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
-               mem_cgroup_commit_charge(pages[i], memcg, false, false);
-               lru_cache_add_active_or_unevictable(pages[i], vma);
-               vmf->pte = pte_offset_map(&_pmd, haddr);
-               VM_BUG_ON(!pte_none(*vmf->pte));
-               set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
-               pte_unmap(vmf->pte);
-       }
-       kfree(pages);
-
-       smp_wmb(); /* make pte visible before pmd */
-       pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
-       page_remove_rmap(page, true);
-       spin_unlock(vmf->ptl);
-
-       /*
-        * No need to double call mmu_notifier->invalidate_range() callback as
-        * the above pmdp_huge_clear_flush_notify() did already call it.
-        */
-       mmu_notifier_invalidate_range_only_end(&range);
-
-       ret |= VM_FAULT_WRITE;
-       put_page(page);
-
-out:
-       return ret;
-
-out_free_pages:
-       spin_unlock(vmf->ptl);
-       mmu_notifier_invalidate_range_end(&range);
-       for (i = 0; i < HPAGE_PMD_NR; i++) {
-               memcg = (void *)page_private(pages[i]);
-               set_page_private(pages[i], 0);
-               mem_cgroup_cancel_charge(pages[i], memcg, false);
-               put_page(pages[i]);
-       }
-       kfree(pages);
-       goto out;
-}
-
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct page *page = NULL, *new_page;
-       struct mem_cgroup *memcg;
+       struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       struct mmu_notifier_range range;
-       gfp_t huge_gfp;                 /* for allocation and charge */
-       vm_fault_t ret = 0;
 
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
        if (is_huge_zero_pmd(orig_pmd))
-               goto alloc;
+               goto fallback;
+
        spin_lock(vmf->ptl);
-       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
-               goto out_unlock;
+
+       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+               spin_unlock(vmf->ptl);
+               return 0;
+       }
 
        page = pmd_page(orig_pmd);
        VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
-       /*
-        * We can only reuse the page if nobody else maps the huge page or it's
-        * part.
-        */
+
+       /* Lock page for reuse_swap_page() */
        if (!trylock_page(page)) {
                get_page(page);
                spin_unlock(vmf->ptl);
                lock_page(page);
                spin_lock(vmf->ptl);
                if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+                       spin_unlock(vmf->ptl);
                        unlock_page(page);
                        put_page(page);
-                       goto out_unlock;
+                       return 0;
                }
                put_page(page);
        }
+
+       /*
+        * We can only reuse the page if nobody else maps the huge page or it's
+        * part.
+        */
        if (reuse_swap_page(page, NULL)) {
                pmd_t entry;
                entry = pmd_mkyoung(orig_pmd);
                entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
+               if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                        update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-               ret |= VM_FAULT_WRITE;
                unlock_page(page);
-               goto out_unlock;
-       }
-       unlock_page(page);
-       get_page(page);
-       spin_unlock(vmf->ptl);
-alloc:
-       if (__transparent_hugepage_enabled(vma) &&
-           !transparent_hugepage_debug_cow()) {
-               huge_gfp = alloc_hugepage_direct_gfpmask(vma);
-               new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
-       } else
-               new_page = NULL;
-
-       if (likely(new_page)) {
-               prep_transhuge_page(new_page);
-       } else {
-               if (!page) {
-                       split_huge_pmd(vma, vmf->pmd, vmf->address);
-                       ret |= VM_FAULT_FALLBACK;
-               } else {
-                       ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
-                       if (ret & VM_FAULT_OOM) {
-                               split_huge_pmd(vma, vmf->pmd, vmf->address);
-                               ret |= VM_FAULT_FALLBACK;
-                       }
-                       put_page(page);
-               }
-               count_vm_event(THP_FAULT_FALLBACK);
-               goto out;
-       }
-
-       if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
-                                       huge_gfp, &memcg, true))) {
-               put_page(new_page);
-               split_huge_pmd(vma, vmf->pmd, vmf->address);
-               if (page)
-                       put_page(page);
-               ret |= VM_FAULT_FALLBACK;
-               count_vm_event(THP_FAULT_FALLBACK);
-               count_vm_event(THP_FAULT_FALLBACK_CHARGE);
-               goto out;
-       }
-
-       count_vm_event(THP_FAULT_ALLOC);
-       count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
-       if (!page)
-               clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
-       else
-               copy_user_huge_page(new_page, page, vmf->address,
-                                   vma, HPAGE_PMD_NR);
-       __SetPageUptodate(new_page);
-
-       mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
-                               haddr, haddr + HPAGE_PMD_SIZE);
-       mmu_notifier_invalidate_range_start(&range);
-
-       spin_lock(vmf->ptl);
-       if (page)
-               put_page(page);
-       if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
                spin_unlock(vmf->ptl);
-               mem_cgroup_cancel_charge(new_page, memcg, true);
-               put_page(new_page);
-               goto out_mn;
-       } else {
-               pmd_t entry;
-               entry = mk_huge_pmd(new_page, vma->vm_page_prot);
-               entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-               pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-               page_add_new_anon_rmap(new_page, vma, haddr, true);
-               mem_cgroup_commit_charge(new_page, memcg, false, true);
-               lru_cache_add_active_or_unevictable(new_page, vma);
-               set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
-               update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-               if (!page) {
-                       add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
-               } else {
-                       VM_BUG_ON_PAGE(!PageHead(page), page);
-                       page_remove_rmap(page, true);
-                       put_page(page);
-               }
-               ret |= VM_FAULT_WRITE;
+               return VM_FAULT_WRITE;
        }
+
+       unlock_page(page);
        spin_unlock(vmf->ptl);
-out_mn:
-       /*
-        * No need to double call mmu_notifier->invalidate_range() callback as
-        * the above pmdp_huge_clear_flush_notify() did already call it.
-        */
-       mmu_notifier_invalidate_range_only_end(&range);
-out:
-       return ret;
-out_unlock:
-       spin_unlock(vmf->ptl);
-       return ret;
+fallback:
+       __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+       return VM_FAULT_FALLBACK;
 }
 
 /*
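
With do_huge_pmd_wp_page_fallback() gone, a write fault on an anonymous THP no longer allocates a replacement huge page (or an HPAGE_PMD_NR array of small ones) from inside the fault handler. The rewritten do_huge_pmd_wp_page() boils down to two outcomes; a condensed sketch of the control flow above (not a verbatim copy, reuse-path locking elided):

	if (is_huge_zero_pmd(orig_pmd))
		goto fallback;			/* never reuse the huge zero page */

	/* revalidate *vmf->pmd under the ptl, lock the page for reuse_swap_page() */

	if (reuse_swap_page(page, NULL)) {
		/* sole mapper: keep the THP and just make the PMD writable */
		entry = maybe_pmd_mkwrite(pmd_mkdirty(pmd_mkyoung(orig_pmd)), vma);
		if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
			update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
		return VM_FAULT_WRITE;
	}

fallback:
	/* shared: split to PTEs; the PTE-level CoW path then copies only the
	 * single 4K page that was actually written */
	__split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
	return VM_FAULT_FALLBACK;

Returning VM_FAULT_FALLBACK makes the fault retry at PTE level, so the old per-page copy loop, its HPAGE_PMD_NR memcg charges, and the mmu_notifier choreography around them all disappear.
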
@@ -1581,7 +1378,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
                        goto skip_mlock;
                if (!trylock_page(page))
                        goto skip_mlock;
-               lru_add_drain();
                if (page->mapping && !PageDoubleMap(page))
                        mlock_vma_page(page);
                unlock_page(page);
@@ -1851,8 +1647,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         * pgtable_trans_huge_withdraw after finishing pmdp related
         * operations.
         */
-       orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
-                       tlb->fullmm);
+       orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+                                               tlb->fullmm);
        tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
        if (vma_is_special_huge(vma)) {
                if (arch_needs_pgtable_deposit())
@@ -1950,7 +1746,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
 
        /*
         * We don't have to worry about the ordering of src and dst
-        * ptlocks because exclusive mmap_sem prevents deadlock.
+        * ptlocks because exclusive mmap_lock prevents deadlock.
         */
        old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
        if (old_ptl) {
@@ -2037,9 +1833,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                goto unlock;
 
        /*
-        * In case prot_numa, we are under down_read(mmap_sem). It's critical
+        * In case prot_numa, we are under mmap_read_lock(mm). It's critical
         * to not clear pmd intermittently to avoid race with MADV_DONTNEED
-        * which is also under down_read(mmap_sem):
+        * which is also under mmap_read_lock(mm):
         *
         *      CPU0:                           CPU1:
         *                              change_huge_pmd(prot_numa=1)
@@ -2359,15 +2155,17 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        atomic_inc(&page[i]._mapcount);
        }
 
+       lock_page_memcg(page);
        if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
                /* Last compound_mapcount is gone. */
-               __dec_node_page_state(page, NR_ANON_THPS);
+               __dec_lruvec_page_state(page, NR_ANON_THPS);
                if (TestClearPageDoubleMap(page)) {
                        /* No need in mapcount reference anymore */
                        for (i = 0; i < HPAGE_PMD_NR; i++)
                                atomic_dec(&page[i]._mapcount);
                }
        }
+       unlock_page_memcg(page);
 
        smp_wmb(); /* make pte visible before pmd */
        pmd_populate(mm, pmd, pgtable);
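
__dec_lruvec_page_state() charges the NR_ANON_THPS drop to the page's own memcg rather than only to the node counter, and it relies on the page's memcg association staying stable for the duration of the update; hence the compound-mapcount drop is now bracketed by lock_page_memcg()/unlock_page_memcg(). Reduced to the essentials (a sketch, the DoubleMap handling omitted):

	lock_page_memcg(page);		/* keep the page's memcg stable */
	if (atomic_add_negative(-1, compound_mapcount_ptr(page)))
		__dec_lruvec_page_state(page, NR_ANON_THPS);
	unlock_page_memcg(page);
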
@@ -2385,6 +2183,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 {
        spinlock_t *ptl;
        struct mmu_notifier_range range;
+       bool was_locked = false;
+       pmd_t _pmd;
 
        mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
                                address & HPAGE_PMD_MASK,
@@ -2397,11 +2197,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         * pmd against. Otherwise we can end up replacing wrong page.
         */
        VM_BUG_ON(freeze && !page);
-       if (page && page != pmd_page(*pmd))
-               goto out;
+       if (page) {
+               VM_WARN_ON_ONCE(!PageLocked(page));
+               was_locked = true;
+               if (page != pmd_page(*pmd))
+                       goto out;
+       }
 
+repeat:
        if (pmd_trans_huge(*pmd)) {
-               page = pmd_page(*pmd);
+               if (!page) {
+                       page = pmd_page(*pmd);
+                       if (unlikely(!trylock_page(page))) {
+                               get_page(page);
+                               _pmd = *pmd;
+                               spin_unlock(ptl);
+                               lock_page(page);
+                               spin_lock(ptl);
+                               if (unlikely(!pmd_same(*pmd, _pmd))) {
+                                       unlock_page(page);
+                                       put_page(page);
+                                       page = NULL;
+                                       goto repeat;
+                               }
+                               put_page(page);
+                       }
+               }
                if (PageMlocked(page))
                        clear_page_mlock(page);
        } else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
@@ -2409,6 +2230,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        __split_huge_pmd_locked(vma, pmd, range.start, freeze);
 out:
        spin_unlock(ptl);
+       if (!was_locked && page)
+               unlock_page(page);
        /*
         * No need to double call mmu_notifier->invalidate_range() callback.
         * There are 3 cases to consider inside __split_huge_pmd_locked():
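
When __split_huge_pmd() is not handed an already-locked page (the non-freeze case), the new code above must take the page lock itself before splitting, and it cannot sleep in lock_page() while holding the pmd spinlock. Hence the trylock-first, then drop-ptl/sleep/relock/revalidate sequence, with was_locked recording whether the lock belongs to the caller or to us. Condensed and annotated (names as in the hunk, Mlocked handling omitted):

repeat:
	if (pmd_trans_huge(*pmd) && !page) {
		page = pmd_page(*pmd);
		if (unlikely(!trylock_page(page))) {
			get_page(page);			/* keep the page alive */
			_pmd = *pmd;			/* remember what we saw */
			spin_unlock(ptl);		/* lock_page() may sleep */
			lock_page(page);
			spin_lock(ptl);
			if (unlikely(!pmd_same(*pmd, _pmd))) {
				/* pmd changed while we slept: unwind and retry */
				unlock_page(page);
				put_page(page);
				page = NULL;
				goto repeat;
			}
			put_page(page);			/* the mapped pmd holds it now */
		}
	}

On the way out, unlock_page() is only called when was_locked is false, so freeze callers that passed in a locked page keep their lock.
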
@@ -2783,7 +2606,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        struct anon_vma *anon_vma = NULL;
        struct address_space *mapping = NULL;
        int count, mapcount, extra_pins, ret;
-       bool mlocked;
        unsigned long flags;
        pgoff_t end;
 
@@ -2796,7 +2618,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
 
        if (PageAnon(head)) {
                /*
-                * The caller does not necessarily hold an mmap_sem that would
+                * The caller does not necessarily hold an mmap_lock that would
                 * prevent the anon_vma disappearing so we first take a
                 * reference to it and then lock the anon_vma for write. This
                 * is similar to page_lock_anon_vma_read except the write lock
@@ -2842,14 +2664,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
                goto out_unlock;
        }
 
-       mlocked = PageMlocked(head);
        unmap_page(head);
        VM_BUG_ON_PAGE(compound_mapcount(head), head);
 
-       /* Make sure the page is not on per-CPU pagevec as it takes pin */
-       if (mlocked)
-               lru_add_drain();
-
        /* prevent PageLRU to go away from under us, and freeze lru stats */
        spin_lock_irqsave(&pgdata->lru_lock, flags);