Merge tag 's390-5.15-1' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6d2a011..afff3ac 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -64,7 +64,14 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+       return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+              !inode_is_open_for_write(vma->vm_file->f_inode) &&
+              (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
 {
        /* The addr is used to check if the vma size fits */
        unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -75,6 +82,8 @@ bool transparent_hugepage_enabled(struct vm_area_struct *vma)
                return __transparent_hugepage_enabled(vma);
        if (vma_is_shmem(vma))
                return shmem_huge_enabled(vma);
+       if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+               return file_thp_enabled(vma);
 
        return false;
 }
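
transparent_hugepage_active() now also reports file-backed VMAs as eligible when CONFIG_READ_ONLY_THP_FOR_FS is enabled and file_thp_enabled() passes: the mapping must be executable and its inode must not be open for write anywhere. The kind of mapping this targets is an ordinary read-only text segment; a minimal userspace sketch of creating one (path and length are hypothetical, error handling trimmed):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>

/*
 * Map a file read-only and executable. With CONFIG_READ_ONLY_THP_FOR_FS,
 * khugepaged may collapse such a VM_EXEC, not-open-for-write mapping
 * into huge pages. Purely illustrative.
 */
int map_text_segment(const char *path, size_t len, void **out)
{
        int fd = open(path, O_RDONLY);  /* no writer => !inode_is_open_for_write() */

        if (fd < 0)
                return -1;
        *out = mmap(NULL, len, PROT_READ | PROT_EXEC, MAP_PRIVATE, fd, 0);
        close(fd);
        return *out == MAP_FAILED ? -1 : 0;
}
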
@@ -1017,7 +1026,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
 
 int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                  pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
-                 struct vm_area_struct *vma)
+                 struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
 {
        spinlock_t *dst_ptl, *src_ptl;
        struct page *src_page;
@@ -1026,7 +1035,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        int ret = -ENOMEM;
 
        /* Skip if can be re-fill on fault */
-       if (!vma_is_anonymous(vma))
+       if (!vma_is_anonymous(dst_vma))
                return 0;
 
        pgtable = pte_alloc_one(dst_mm);
@@ -1040,29 +1049,26 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        ret = -EAGAIN;
        pmd = *src_pmd;
 
-       /*
-        * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
-        * does not have the VM_UFFD_WP, which means that the uffd
-        * fork event is not enabled.
-        */
-       if (!(vma->vm_flags & VM_UFFD_WP))
-               pmd = pmd_clear_uffd_wp(pmd);
-
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
        if (unlikely(is_swap_pmd(pmd))) {
                swp_entry_t entry = pmd_to_swp_entry(pmd);
 
                VM_BUG_ON(!is_pmd_migration_entry(pmd));
-               if (is_write_migration_entry(entry)) {
-                       make_migration_entry_read(&entry);
+               if (is_writable_migration_entry(entry)) {
+                       entry = make_readable_migration_entry(
+                                                       swp_offset(entry));
                        pmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*src_pmd))
                                pmd = pmd_swp_mksoft_dirty(pmd);
+                       if (pmd_swp_uffd_wp(*src_pmd))
+                               pmd = pmd_swp_mkuffd_wp(pmd);
                        set_pmd_at(src_mm, addr, src_pmd, pmd);
                }
                add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
                mm_inc_nr_ptes(dst_mm);
                pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+               if (!userfaultfd_wp(dst_vma))
+                       pmd = pmd_swp_clear_uffd_wp(pmd);
                set_pmd_at(dst_mm, addr, dst_pmd, pmd);
                ret = 0;
                goto out_unlock;
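
The migration-entry helpers changed shape in this path: instead of flipping the existing entry in place with make_migration_entry_read(), the fork copy now rebuilds a readable entry from swp_offset() of the old one and then re-applies the soft-dirty and uffd-wp bits. A self-contained toy model of that conversion (stand-in types, not the kernel's real swp_entry_t encoding):

#include <stdbool.h>
#include <stdio.h>

/* Toy encoding: low bit marks a writable migration entry. */
typedef struct { unsigned long val; } swp_entry_t;
#define WRITABLE_BIT 0x1UL

static swp_entry_t make_writable_migration_entry(unsigned long pfn)
{
        return (swp_entry_t){ (pfn << 1) | WRITABLE_BIT };
}

static swp_entry_t make_readable_migration_entry(unsigned long pfn)
{
        return (swp_entry_t){ pfn << 1 };
}

static bool is_writable_migration_entry(swp_entry_t e)
{
        return e.val & WRITABLE_BIT;
}

static unsigned long swp_offset(swp_entry_t e)
{
        return e.val >> 1;
}

int main(void)
{
        swp_entry_t entry = make_writable_migration_entry(0x1234);

        /* Fork path: downgrade to read-only, keeping the same offset. */
        if (is_writable_migration_entry(entry))
                entry = make_readable_migration_entry(swp_offset(entry));

        printf("writable=%d offset=%#lx\n",
               is_writable_migration_entry(entry), swp_offset(entry));
        return 0;
}
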
@@ -1079,17 +1085,13 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * a page table.
         */
        if (is_huge_zero_pmd(pmd)) {
-               struct page *zero_page;
                /*
                 * get_huge_zero_page() will never allocate a new page here,
                 * since we already have a zero page to copy. It just takes a
                 * reference.
                 */
-               zero_page = mm_get_huge_zero_page(dst_mm);
-               set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
-                               zero_page);
-               ret = 0;
-               goto out_unlock;
+               mm_get_huge_zero_page(dst_mm);
+               goto out_zero_page;
        }
 
        src_page = pmd_page(pmd);
@@ -1102,21 +1104,23 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * best effort that the pinned pages won't be replaced by another
         * random page during the coming copy-on-write.
         */
-       if (unlikely(page_needs_cow_for_dma(vma, src_page))) {
+       if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
                pte_free(dst_mm, pgtable);
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
-               __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+               __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                return -EAGAIN;
        }
 
        get_page(src_page);
        page_dup_rmap(src_page, true);
        add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
        mm_inc_nr_ptes(dst_mm);
        pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
        pmdp_set_wrprotect(src_mm, addr, src_pmd);
+       if (!userfaultfd_wp(dst_vma))
+               pmd = pmd_clear_uffd_wp(pmd);
        pmd = pmd_mkold(pmd_wrprotect(pmd));
        set_pmd_at(dst_mm, addr, dst_pmd, pmd);
 
@@ -1254,11 +1258,12 @@ unlock:
 }
 #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
 
-void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd)
+void huge_pmd_set_accessed(struct vm_fault *vmf)
 {
        pmd_t entry;
        unsigned long haddr;
        bool write = vmf->flags & FAULT_FLAG_WRITE;
+       pmd_t orig_pmd = vmf->orig_pmd;
 
        vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
        if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1275,11 +1280,12 @@ unlock:
        spin_unlock(vmf->ptl);
 }
 
-vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
+vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+       pmd_t orig_pmd = vmf->orig_pmd;
 
        vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
        VM_BUG_ON_VMA(!vma->anon_vma, vma);
@@ -1415,96 +1421,25 @@ out:
 }
 
 /* NUMA hinting page fault entry point for trans huge pmds */
-vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
+vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
-       struct anon_vma *anon_vma = NULL;
+       pmd_t oldpmd = vmf->orig_pmd;
+       pmd_t pmd;
        struct page *page;
        unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
-       int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
+       int page_nid = NUMA_NO_NODE;
        int target_nid, last_cpupid = -1;
-       bool page_locked;
        bool migrated = false;
-       bool was_writable;
+       bool was_writable = pmd_savedwrite(oldpmd);
        int flags = 0;
 
        vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
-       if (unlikely(!pmd_same(pmd, *vmf->pmd)))
-               goto out_unlock;
-
-       /*
-        * If there are potential migrations, wait for completion and retry
-        * without disrupting NUMA hinting information. Do not relock and
-        * check_same as the page may no longer be mapped.
-        */
-       if (unlikely(pmd_trans_migrating(*vmf->pmd))) {
-               page = pmd_page(*vmf->pmd);
-               if (!get_page_unless_zero(page))
-                       goto out_unlock;
+       if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
                spin_unlock(vmf->ptl);
-               put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
                goto out;
        }
 
-       page = pmd_page(pmd);
-       BUG_ON(is_huge_zero_page(page));
-       page_nid = page_to_nid(page);
-       last_cpupid = page_cpupid_last(page);
-       count_vm_numa_event(NUMA_HINT_FAULTS);
-       if (page_nid == this_nid) {
-               count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
-               flags |= TNF_FAULT_LOCAL;
-       }
-
-       /* See similar comment in do_numa_page for explanation */
-       if (!pmd_savedwrite(pmd))
-               flags |= TNF_NO_GROUP;
-
-       /*
-        * Acquire the page lock to serialise THP migrations but avoid dropping
-        * page_table_lock if at all possible
-        */
-       page_locked = trylock_page(page);
-       target_nid = mpol_misplaced(page, vma, haddr);
-       /* Migration could have started since the pmd_trans_migrating check */
-       if (!page_locked) {
-               page_nid = NUMA_NO_NODE;
-               if (!get_page_unless_zero(page))
-                       goto out_unlock;
-               spin_unlock(vmf->ptl);
-               put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
-               goto out;
-       } else if (target_nid == NUMA_NO_NODE) {
-               /* There are no parallel migrations and page is in the right
-                * node. Clear the numa hinting info in this pmd.
-                */
-               goto clear_pmdnuma;
-       }
-
-       /*
-        * Page is misplaced. Page lock serialises migrations. Acquire anon_vma
-        * to serialises splits
-        */
-       get_page(page);
-       spin_unlock(vmf->ptl);
-       anon_vma = page_lock_anon_vma_read(page);
-
-       /* Confirm the PMD did not change while page_table_lock was released */
-       spin_lock(vmf->ptl);
-       if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
-               unlock_page(page);
-               put_page(page);
-               page_nid = NUMA_NO_NODE;
-               goto out_unlock;
-       }
-
-       /* Bail if we fail to protect against THP splits for any reason */
-       if (unlikely(!anon_vma)) {
-               put_page(page);
-               page_nid = NUMA_NO_NODE;
-               goto clear_pmdnuma;
-       }
-
        /*
         * Since we took the NUMA fault, we must have observed the !accessible
         * bit. Make sure all other CPUs agree with that, to avoid them
@@ -1531,43 +1466,58 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
                                              haddr + HPAGE_PMD_SIZE);
        }
 
-       /*
-        * Migrate the THP to the requested node, returns with page unlocked
-        * and access rights restored.
-        */
+       pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+       page = vm_normal_page_pmd(vma, haddr, pmd);
+       if (!page)
+               goto out_map;
+
+       /* See similar comment in do_numa_page for explanation */
+       if (!was_writable)
+               flags |= TNF_NO_GROUP;
+
+       page_nid = page_to_nid(page);
+       last_cpupid = page_cpupid_last(page);
+       target_nid = numa_migrate_prep(page, vma, haddr, page_nid,
+                                      &flags);
+
+       if (target_nid == NUMA_NO_NODE) {
+               put_page(page);
+               goto out_map;
+       }
+
        spin_unlock(vmf->ptl);
 
-       migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma,
-                               vmf->pmd, pmd, vmf->address, page, target_nid);
+       migrated = migrate_misplaced_page(page, vma, target_nid);
        if (migrated) {
                flags |= TNF_MIGRATED;
                page_nid = target_nid;
-       } else
+       } else {
                flags |= TNF_MIGRATE_FAIL;
-
-       goto out;
-clear_pmdnuma:
-       BUG_ON(!PageLocked(page));
-       was_writable = pmd_savedwrite(pmd);
-       pmd = pmd_modify(pmd, vma->vm_page_prot);
-       pmd = pmd_mkyoung(pmd);
-       if (was_writable)
-               pmd = pmd_mkwrite(pmd);
-       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
-       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-       unlock_page(page);
-out_unlock:
-       spin_unlock(vmf->ptl);
+               vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+               if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+                       spin_unlock(vmf->ptl);
+                       goto out;
+               }
+               goto out_map;
+       }
 
 out:
-       if (anon_vma)
-               page_unlock_anon_vma_read(anon_vma);
-
        if (page_nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
                                flags);
 
        return 0;
+
+out_map:
+       /* Restore the PMD */
+       pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+       pmd = pmd_mkyoung(pmd);
+       if (was_writable)
+               pmd = pmd_mkwrite(pmd);
+       set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
+       update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+       spin_unlock(vmf->ptl);
+       goto out;
 }
 
 /*
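
The rewritten NUMA hinting fault either restores the PMD in place (out_map) or hands the page to migrate_misplaced_page() for the target node chosen by numa_migrate_prep(). Whether a migration actually happened can be observed from userspace with move_pages(2) in query mode, which reports the node currently backing an address (sketch; assumes libnuma's numaif.h and an already-mapped address):

#include <numaif.h>

/* Return the NUMA node currently backing addr, or -1 on error. */
int node_of(void *addr)
{
        void *pages[1] = { addr };
        int status[1] = { -1 };

        /* nodes == NULL turns move_pages() into a pure status query. */
        if (move_pages(0 /* self */, 1, pages, NULL, status, 0) != 0)
                return -1;
        return status[0];
}
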
@@ -1604,7 +1554,7 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
         * If other processes are mapping this page, we couldn't discard
         * the page unless they all do MADV_FREE so let's skip the page.
         */
-       if (page_mapcount(page) != 1)
+       if (total_mapcount(page) != 1)
                goto out;
 
        if (!trylock_page(page))
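
madvise_free_huge_pmd() now backs off when total_mapcount() exceeds one, i.e. when any mapping of the compound page exists elsewhere, not just additional mappings of the head page. This path is reached through MADV_FREE on an anonymous range, e.g. (sketch, 2 MiB THP-sized region assumed):

#include <sys/mman.h>

/*
 * Mark a huge-page-sized anonymous region as lazily freeable; the kernel
 * may later discard it via madvise_free_huge_pmd() while it is still
 * mapped only once.
 */
int lazy_free(void *addr)
{
        return madvise(addr, 2UL << 20, MADV_FREE);
}
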
@@ -1677,12 +1627,9 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
                if (arch_needs_pgtable_deposit())
                        zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
-               if (is_huge_zero_pmd(orig_pmd))
-                       tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
        } else if (is_huge_zero_pmd(orig_pmd)) {
                zap_deposited_table(tlb->mm, pmd);
                spin_unlock(ptl);
-               tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE);
        } else {
                struct page *page = NULL;
                int flush_needed = 1;
@@ -1697,7 +1644,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
 
                        VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
                        entry = pmd_to_swp_entry(orig_pmd);
-                       page = migration_entry_to_page(entry);
+                       page = pfn_swap_entry_to_page(entry);
                        flush_needed = 0;
                } else
                        WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -1796,6 +1743,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  * Returns
  *  - 0 if PMD could not be locked
  *  - 1 if PMD was locked but protections unchanged and TLB flush unnecessary
+ *      or if prot_numa but THP migration is not supported
  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
  */
 int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
@@ -1810,6 +1758,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
        bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
        bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
+       if (prot_numa && !thp_migration_supported())
+               return 1;
+
        ptl = __pmd_trans_huge_lock(pmd, vma);
        if (!ptl)
                return 0;
@@ -1822,16 +1773,19 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                swp_entry_t entry = pmd_to_swp_entry(*pmd);
 
                VM_BUG_ON(!is_pmd_migration_entry(*pmd));
-               if (is_write_migration_entry(entry)) {
+               if (is_writable_migration_entry(entry)) {
                        pmd_t newpmd;
                        /*
                         * A protection check is difficult so
                         * just be safe and disable write
                         */
-                       make_migration_entry_read(&entry);
+                       entry = make_readable_migration_entry(
+                                                       swp_offset(entry));
                        newpmd = swp_entry_to_pmd(entry);
                        if (pmd_swp_soft_dirty(*pmd))
                                newpmd = pmd_swp_mksoft_dirty(newpmd);
+                       if (pmd_swp_uffd_wp(*pmd))
+                               newpmd = pmd_swp_mkuffd_wp(newpmd);
                        set_pmd_at(mm, addr, pmd, newpmd);
                }
                goto unlock;
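
As in copy_huge_pmd(), downgrading a writable migration entry here now re-applies the uffd-wp bit alongside soft-dirty on the rebuilt entry. Soft-dirty is the bit that checkpoint/restore tooling reads back through /proc/pid/pagemap (bit 55 of each 64-bit entry), so losing it across the rewrite would make a written page look clean. A userspace sketch of reading it for one address:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define PAGEMAP_SOFT_DIRTY (1ULL << 55)

/* Return 1 if the page backing addr is soft-dirty, 0 if clean, -1 on error. */
int soft_dirty(void *addr)
{
        uint64_t entry;
        long pagesize = sysconf(_SC_PAGESIZE);
        off_t off = (off_t)((uintptr_t)addr / pagesize) * sizeof(entry);
        int fd = open("/proc/self/pagemap", O_RDONLY);

        if (fd < 0)
                return -1;
        if (pread(fd, &entry, sizeof(entry), off) != sizeof(entry)) {
                close(fd);
                return -1;
        }
        close(fd);
        return !!(entry & PAGEMAP_SOFT_DIRTY);
}
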
@@ -2060,7 +2014,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                        swp_entry_t entry;
 
                        entry = pmd_to_swp_entry(old_pmd);
-                       page = migration_entry_to_page(entry);
+                       page = pfn_swap_entry_to_page(entry);
                } else {
                        page = pmd_page(old_pmd);
                        if (!PageDirty(page) && pmd_dirty(old_pmd))
@@ -2114,8 +2068,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                swp_entry_t entry;
 
                entry = pmd_to_swp_entry(old_pmd);
-               page = migration_entry_to_page(entry);
-               write = is_write_migration_entry(entry);
+               page = pfn_swap_entry_to_page(entry);
+               write = is_writable_migration_entry(entry);
                young = false;
                soft_dirty = pmd_swp_soft_dirty(old_pmd);
                uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2147,7 +2101,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 */
                if (freeze || pmd_migration) {
                        swp_entry_t swp_entry;
-                       swp_entry = make_migration_entry(page + i, write);
+                       if (write)
+                               swp_entry = make_writable_migration_entry(
+                                                       page_to_pfn(page + i));
+                       else
+                               swp_entry = make_readable_migration_entry(
+                                                       page_to_pfn(page + i));
                        entry = swp_entry_to_pte(swp_entry);
                        if (soft_dirty)
                                entry = pte_swp_mksoft_dirty(entry);
@@ -2350,15 +2309,20 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
 
 static void unmap_page(struct page *page)
 {
-       enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
-               TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
+       enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD |
+               TTU_SYNC;
 
        VM_BUG_ON_PAGE(!PageHead(page), page);
 
+       /*
+        * Anon pages need migration entries to preserve them, but file
+        * pages can simply be left unmapped, then faulted back on demand.
+        * If that is ever changed (perhaps for mlock), update remap_page().
+        */
        if (PageAnon(page))
-               ttu_flags |= TTU_SPLIT_FREEZE;
-
-       try_to_unmap(page, ttu_flags);
+               try_to_migrate(page, ttu_flags);
+       else
+               try_to_unmap(page, ttu_flags | TTU_IGNORE_MLOCK);
 
        VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
 }
@@ -2366,6 +2330,10 @@ static void unmap_page(struct page *page)
 static void remap_page(struct page *page, unsigned int nr)
 {
        int i;
+
+       /* If unmap_page() uses try_to_migrate() on file, remove this check */
+       if (!PageAnon(page))
+               return;
        if (PageTransHuge(page)) {
                remove_migration_ptes(page, page, true);
        } else {
@@ -2870,7 +2838,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
        spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
        /* Take pin on all head pages to avoid freeing them under us */
        list_for_each_safe(pos, next, &ds_queue->split_queue) {
-               page = list_entry((void *)pos, struct page, mapping);
+               page = list_entry((void *)pos, struct page, deferred_list);
                page = compound_head(page);
                if (get_page_unless_zero(page)) {
                        list_move(page_deferred_list(page), &list);
@@ -2885,7 +2853,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
        spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
 
        list_for_each_safe(pos, next, &list) {
-               page = list_entry((void *)pos, struct page, mapping);
+               page = list_entry((void *)pos, struct page, deferred_list);
                if (!trylock_page(page))
                        goto next;
                /* split_huge_page() removes page from list on success */
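
The shrinker walks ds_queue->split_queue, whose nodes are the deferred_list member of struct page; spelling that member out (rather than mapping, which merely happens to share its offset in the struct page unions) keeps list_entry() correct if the layout ever changes. list_entry() is container_of() underneath; a minimal standalone illustration with a toy struct:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct list_head { struct list_head *next, *prev; };

struct item {
        int id;
        struct list_head node;  /* the member actually linked into the list */
};

int main(void)
{
        struct item it = { .id = 42 };
        struct list_head *pos = &it.node;

        /* Recover the containing struct from the embedded list node. */
        struct item *back = container_of(pos, struct item, node);

        printf("id=%d\n", back->id);
        return 0;
}
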
@@ -3144,7 +3112,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf,
 
                tok = strsep(&buf, ",");
                if (tok) {
-                       strncpy(file_path, tok, MAX_INPUT_BUF_SZ);
+                       strcpy(file_path, tok);
                } else {
                        ret = -EINVAL;
                        goto out;
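
The token returned by strsep() points into the function's own bounded, NUL-terminated input buffer, and the destination file_path[] in that function is sized MAX_INPUT_BUF_SZ as well, so a plain strcpy() cannot overflow here. A standalone sketch of the same parsing pattern (buffer contents hypothetical):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>

#define MAX_INPUT_BUF_SZ 255

int main(void)
{
        char input[MAX_INPUT_BUF_SZ] = "/tmp/testfile,0x0,0x200000";
        char file_path[MAX_INPUT_BUF_SZ];
        char *buf = input, *tok;

        tok = strsep(&buf, ",");        /* tok points inside input[], already bounded */
        if (!tok)
                return 1;
        strcpy(file_path, tok);         /* at most MAX_INPUT_BUF_SZ - 1 chars plus NUL */

        printf("path=%s rest=%s\n", file_path, buf ? buf : "");
        return 0;
}
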
@@ -3214,7 +3182,10 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
        pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
        if (pmd_dirty(pmdval))
                set_page_dirty(page);
-       entry = make_migration_entry(page, pmd_write(pmdval));
+       if (pmd_write(pmdval))
+               entry = make_writable_migration_entry(page_to_pfn(page));
+       else
+               entry = make_readable_migration_entry(page_to_pfn(page));
        pmdswp = swp_entry_to_pmd(entry);
        if (pmd_soft_dirty(pmdval))
                pmdswp = pmd_swp_mksoft_dirty(pmdswp);
@@ -3240,8 +3211,10 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
        pmde = pmd_mkold(mk_huge_pmd(new, vma->vm_page_prot));
        if (pmd_swp_soft_dirty(*pvmw->pmd))
                pmde = pmd_mksoft_dirty(pmde);
-       if (is_write_migration_entry(entry))
+       if (is_writable_migration_entry(entry))
                pmde = maybe_pmd_mkwrite(pmde, vma);
+       if (pmd_swp_uffd_wp(*pvmw->pmd))
+               pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
 
        flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
        if (PageAnon(new))