diff --git a/mm/memory.c b/mm/memory.c
index 550405f..747a01d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -90,8 +90,7 @@
 #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
 #endif
 
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-/* use the per-pgdat data instead for discontigmem - mbligh */
+#ifndef CONFIG_NUMA
 unsigned long max_mapnr;
 EXPORT_SYMBOL(max_mapnr);
 
@@ -700,6 +699,68 @@ out:
 }
 #endif
 
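+/*
+ * Replace a device exclusive swap entry at @ptep with an ordinary present
+ * pte mapping @page again.  Both the page lock and the pte lock must be
+ * held by the caller.
+ */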
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+                                 struct page *page, unsigned long address,
+                                 pte_t *ptep)
+{
+       pte_t pte;
+       swp_entry_t entry;
+
+       pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+       if (pte_swp_soft_dirty(*ptep))
+               pte = pte_mksoft_dirty(pte);
+
+       entry = pte_to_swp_entry(*ptep);
+       if (pte_swp_uffd_wp(*ptep))
+               pte = pte_mkuffd_wp(pte);
+       else if (is_writable_device_exclusive_entry(entry))
+               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+       set_pte_at(vma->vm_mm, address, ptep, pte);
+
+       /*
+        * No need to take a page reference as one was already
+        * created when the swap entry was made.
+        */
+       if (PageAnon(page))
+               page_add_anon_rmap(page, vma, address, false);
+       else
+               /*
+                * Currently device exclusive access only supports anonymous
+                * memory, so the entry shouldn't point to a file-backed page.
+                */
+               WARN_ON_ONCE(!PageAnon(page));
+
+       if (vma->vm_flags & VM_LOCKED)
+               mlock_vma_page(page);
+
+       /*
+        * No need to invalidate - it was non-present before. However,
+        * secondary CPUs may have mappings that need invalidating.
+        */
+       update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+                       unsigned long addr)
+{
+       swp_entry_t entry = pte_to_swp_entry(*src_pte);
+       struct page *page = pfn_swap_entry_to_page(entry);
+
+       if (trylock_page(page)) {
+               restore_exclusive_pte(vma, page, addr, src_pte);
+               unlock_page(page);
+               return 0;
+       }
+
+       return -EBUSY;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -708,17 +769,17 @@ out:
 
 static unsigned long
 copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-               unsigned long addr, int *rss)
+               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
-       unsigned long vm_flags = vma->vm_flags;
+       unsigned long vm_flags = dst_vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(pte);
 
        if (likely(!non_swap_entry(entry))) {
                if (swap_duplicate(entry) < 0)
-                       return entry.val;
+                       return -EIO;
 
                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -730,17 +791,18 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
-               page = migration_entry_to_page(entry);
+               page = pfn_swap_entry_to_page(entry);
 
                rss[mm_counter(page)]++;
 
-               if (is_write_migration_entry(entry) &&
+               if (is_writable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both
                         * parent and child to be set to read.
                         */
-                       make_migration_entry_read(&entry);
+                       entry = make_readable_migration_entry(
+                                                       swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(*src_pte))
                                pte = pte_swp_mksoft_dirty(pte);
@@ -749,7 +811,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
-               page = device_private_entry_to_page(entry);
+               page = pfn_swap_entry_to_page(entry);
 
                /*
                 * Update rss count even for unaddressable pages, as
@@ -771,15 +833,29 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
-               if (is_write_device_private_entry(entry) &&
+               if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
-                       make_device_private_entry_read(&entry);
+                       entry = make_readable_device_private_entry(
+                                                       swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
+       } else if (is_device_exclusive_entry(entry)) {
+               /*
+                * Make device exclusive entries present by restoring the
+                * original entry, then copying it as for a present pte. Device
+                * exclusive entries currently only support private writable
+                * (i.e. COW) mappings.
+                */
+               VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+               if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+                       return -EBUSY;
+               return -ENOENT;
        }
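+       /*
+        * If the destination vma is not registered for uffd write protection,
+        * don't carry the uffd-wp marker over into the child's swap pte.
+        */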
+       if (!userfaultfd_wp(dst_vma))
+               pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
 }
@@ -845,6 +921,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
        /* All done, just insert the new page copy in the child */
        pte = mk_pte(new_page, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+       if (userfaultfd_pte_wp(dst_vma, *src_pte))
+               /* Uffd-wp needs to be delivered to dest pte as well */
+               pte = pte_wrprotect(pte_mkuffd_wp(pte));
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
 }
@@ -894,12 +973,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);
 
-       /*
-        * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
-        * does not have the VM_UFFD_WP, which means that the uffd
-        * fork event is not enabled.
-        */
-       if (!(vm_flags & VM_UFFD_WP))
+       if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);
 
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
@@ -972,13 +1046,25 @@ again:
                        continue;
                }
                if (unlikely(!pte_present(*src_pte))) {
-                       entry.val = copy_nonpresent_pte(dst_mm, src_mm,
-                                                       dst_pte, src_pte,
-                                                       src_vma, addr, rss);
-                       if (entry.val)
+                       ret = copy_nonpresent_pte(dst_mm, src_mm,
+                                                 dst_pte, src_pte,
+                                                 dst_vma, src_vma,
+                                                 addr, rss);
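+                       /*
+                        * -EIO: the swap entry needs a swap count continuation,
+                        * allocated below once the page table locks are dropped.
+                        * -EBUSY: a device exclusive entry's page could not be
+                        * locked without sleeping; give up on the copy.
+                        */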
+                       if (ret == -EIO) {
+                               entry = pte_to_swp_entry(*src_pte);
                                break;
-                       progress += 8;
-                       continue;
+                       } else if (ret == -EBUSY) {
+                               break;
+                       } else if (!ret) {
+                               progress += 8;
+                               continue;
+                       }
+
+                       /*
+                        * Device exclusive entry restored, continue by copying
+                        * the now present pte.
+                        */
+                       WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1009,20 +1095,26 @@ again:
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
 
-       if (entry.val) {
+       if (ret == -EIO) {
+               VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
-       } else if (ret) {
-               WARN_ON_ONCE(ret != -EAGAIN);
+       } else if (ret == -EBUSY) {
+               goto out;
+       } else if (ret == -EAGAIN) {
                prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                if (!prealloc)
                        return -ENOMEM;
-               /* We've captured and resolved the error. Reset, try again. */
-               ret = 0;
+       } else if (ret) {
+               VM_WARN_ON_ONCE(1);
        }
+
+       /* We've captured and resolved the error. Reset, try again. */
+       ret = 0;
+
        if (addr != end)
                goto again;
 out:
@@ -1051,8 +1143,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
-                       err = copy_huge_pmd(dst_mm, src_mm,
-                                           dst_pmd, src_pmd, addr, src_vma);
+                       err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+                                           addr, dst_vma, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        if (!err)
@@ -1279,8 +1371,9 @@ again:
                }
 
                entry = pte_to_swp_entry(ptent);
-               if (is_device_private_entry(entry)) {
-                       struct page *page = device_private_entry_to_page(entry);
+               if (is_device_private_entry(entry) ||
+                   is_device_exclusive_entry(entry)) {
+                       struct page *page = pfn_swap_entry_to_page(entry);
 
                        if (unlikely(details && details->check_mapping)) {
                                /*
@@ -1295,7 +1388,10 @@ again:
 
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, false);
+
+                       if (is_device_private_entry(entry))
+                               page_remove_rmap(page, false);
+
                        put_page(page);
                        continue;
                }
@@ -1309,7 +1405,7 @@ again:
                else if (is_migration_entry(entry)) {
                        struct page *page;
 
-                       page = migration_entry_to_page(entry);
+                       page = pfn_swap_entry_to_page(entry);
                        rss[mm_counter(page)]--;
                }
                if (unlikely(!free_swap_and_cache(entry)))
@@ -1361,7 +1457,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
+               } else if (details && details->single_page &&
+                          PageTransCompound(details->single_page) &&
+                          next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+                       spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+                       /*
+                        * Take and drop THP pmd lock so that we cannot return
+                        * prematurely, while zap_huge_pmd() has cleared *pmd,
+                        * but not yet decremented compound_mapcount().
+                        */
+                       spin_unlock(ptl);
                }
+
                /*
                 * Here there can be other concurrent MADV_DONTNEED or
                 * trans huge page faults running, and if the pmd is
@@ -2260,26 +2367,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
        return 0;
 }
 
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
+ * must have pre-validated the caching bits of the pgprot_t.
  */
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-                   unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+               unsigned long pfn, unsigned long size, pgprot_t prot)
 {
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long remap_pfn = pfn;
        int err;
 
        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2309,10 +2407,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                vma->vm_pgoff = pfn;
        }
 
-       err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
-       if (err)
-               return -EINVAL;
-
        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
        BUG_ON(addr >= end);
@@ -2324,12 +2418,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
-                       break;
+                       return err;
        } while (pgd++, addr = next, addr != end);
 
+       return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+                   unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+       int err;
+
+       err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
        if (err)
-               untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+               return -EINVAL;
 
+       err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
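+       /*
+        * If the mapping itself failed, undo the PFN tracking that
+        * track_pfn_remap() set up above.
+        */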
+       if (err)
+               untrack_pfn(vma, pfn, PAGE_ALIGN(size));
        return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
@@ -2446,13 +2564,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
        }
        do {
                next = pmd_addr_end(addr, end);
-               if (create || !pmd_none_or_clear_bad(pmd)) {
-                       err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (pmd_none(*pmd) && !create)
+                       continue;
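+               /*
+                * A leaf (huge) pmd has no pte level below it, so fn cannot
+                * be applied to individual ptes here; warn and bail out.
+                */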
+               if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+                       return -EINVAL;
+               if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+                       if (!create)
+                               continue;
+                       pmd_clear_bad(pmd);
                }
+               err = apply_to_pte_range(mm, pmd, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (pmd++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2474,13 +2600,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
        }
        do {
                next = pud_addr_end(addr, end);
-               if (create || !pud_none_or_clear_bad(pud)) {
-                       err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (pud_none(*pud) && !create)
+                       continue;
+               if (WARN_ON_ONCE(pud_leaf(*pud)))
+                       return -EINVAL;
+               if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+                       if (!create)
+                               continue;
+                       pud_clear_bad(pud);
                }
+               err = apply_to_pmd_range(mm, pud, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (pud++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2502,13 +2636,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
        }
        do {
                next = p4d_addr_end(addr, end);
-               if (create || !p4d_none_or_clear_bad(p4d)) {
-                       err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (p4d_none(*p4d) && !create)
+                       continue;
+               if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+                       return -EINVAL;
+               if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+                       if (!create)
+                               continue;
+                       p4d_clear_bad(p4d);
                }
+               err = apply_to_pud_range(mm, p4d, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (p4d++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2528,9 +2670,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
-               if (!create && pgd_none_or_clear_bad(pgd))
+               if (pgd_none(*pgd) && !create)
                        continue;
-               err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+               if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+                       return -EINVAL;
+               if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+                       if (!create)
+                               continue;
+                       pgd_clear_bad(pgd);
+               }
+               err = apply_to_p4d_range(mm, pgd, addr, next,
+                                        fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
@@ -2896,6 +3046,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                }
                flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
+               entry = pte_sw_mkyoung(entry);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
                /*
@@ -2968,6 +3119,8 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
                                munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
+               if (page_copied)
+                       free_swap_cache(old_page);
                put_page(old_page);
        }
        return page_copied ? VM_FAULT_WRITE : 0;
@@ -2992,7 +3145,7 @@ oom:
  * The function expects the page to be locked or other protection against
  * concurrent faults / writeback (such as DAX radix tree locks).
  *
- * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * Return: %0 on success, %VM_FAULT_NOPAGE when PTE got changed before
  * we acquired PTE lock.
  */
 vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
@@ -3192,6 +3345,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
        }
 }
 
+/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range().  But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again, and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+       struct address_space *mapping = page->mapping;
+       struct zap_details details = { };
+
+       VM_BUG_ON(!PageLocked(page));
+       VM_BUG_ON(PageTail(page));
+
+       details.check_mapping = mapping;
+       details.first_index = page->index;
+       details.last_index = page->index + thp_nr_pages(page) - 1;
+       details.single_page = page;
+
+       i_mmap_lock_write(mapping);
+       if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+               unmap_mapping_range_tree(&mapping->i_mmap, &details);
+       i_mmap_unlock_write(mapping);
+}
+
 /**
  * unmap_mapping_pages() - Unmap pages from processes.
  * @mapping: The address space containing pages to be unmapped.
@@ -3256,6 +3439,34 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
+/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+       struct page *page = vmf->page;
+       struct vm_area_struct *vma = vmf->vma;
+       struct mmu_notifier_range range;
+
+       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+               return VM_FAULT_RETRY;
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+                               vma->vm_mm, vmf->address & PAGE_MASK,
+                               (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+       mmu_notifier_invalidate_range_start(&range);
+
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                               &vmf->ptl);
+       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+               restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       unlock_page(page);
+
+       mmu_notifier_invalidate_range_end(&range);
+       return 0;
+}
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3268,6 +3479,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 {
        struct vm_area_struct *vma = vmf->vma;
        struct page *page = NULL, *swapcache;
+       struct swap_info_struct *si = NULL;
        swp_entry_t entry;
        pte_t pte;
        int locked;
@@ -3283,8 +3495,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_exclusive_entry(entry)) {
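+                       /*
+                        * A device driver has exclusive access to this page;
+                        * restore the original pte so the access can proceed.
+                        */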
+                       vmf->page = pfn_swap_entry_to_page(entry);
+                       ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
-                       vmf->page = device_private_entry_to_page(entry);
+                       vmf->page = pfn_swap_entry_to_page(entry);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
@@ -3295,42 +3510,42 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                goto out;
        }
 
+       /* Prevent swapoff from happening to us. */
+       si = get_swap_device(entry);
+       if (unlikely(!si))
+               goto out;
 
-       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;
 
        if (!page) {
-               struct swap_info_struct *si = swp_swap_info(entry);
-
                if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
                    __swap_count(entry) == 1) {
                        /* skip swapcache */
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                                        vmf->address);
                        if (page) {
-                               int err;
-
                                __SetPageLocked(page);
                                __SetPageSwapBacked(page);
-                               set_page_private(page, entry.val);
 
-                               /* Tell memcg to use swap ownership records */
-                               SetPageSwapCache(page);
-                               err = mem_cgroup_charge(page, vma->vm_mm,
-                                                       GFP_KERNEL);
-                               ClearPageSwapCache(page);
-                               if (err) {
+                               if (mem_cgroup_swapin_charge_page(page,
+                                       vma->vm_mm, GFP_KERNEL, entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
+                               mem_cgroup_swapin_uncharge_swap(entry);
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(page, shadow);
 
                                lru_cache_add(page);
+
+                               /* To provide entry to swap_readpage() */
+                               set_page_private(page, entry.val);
                                swap_readpage(page, true);
+                               set_page_private(page, 0);
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
@@ -3347,7 +3562,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
-                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }
 
@@ -3361,13 +3576,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                goto out_release;
        }
 
        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
@@ -3473,6 +3688,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 unlock:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
 out:
+       if (si)
+               put_swap_device(si);
        return ret;
 out_nomap:
        pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3484,6 +3701,8 @@ out_release:
                unlock_page(swapcache);
                put_page(swapcache);
        }
+       if (si)
+               put_swap_device(si);
        return ret;
 }
 
@@ -3561,6 +3780,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
        __SetPageUptodate(page);
 
        entry = mk_pte(page, vma->vm_page_prot);
+       entry = pte_sw_mkyoung(entry);
        if (vma->vm_flags & VM_WRITE)
                entry = pte_mkwrite(pte_mkdirty(entry));
 
@@ -3686,7 +3906,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
-        * Archs like ppc64 need additonal space to store information
+        * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -3745,6 +3965,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr)
 
        if (prefault && arch_wants_old_prefaulted_pte())
                entry = pte_mkold(entry);
+       else
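+               /*
+                * pte_sw_mkyoung() only has an effect on architectures that
+                * maintain the accessed bit in software; marking the pte
+                * young there avoids an immediate follow-up fault to set it.
+                */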
+               entry = pte_sw_mkyoung(entry);
 
        if (write)
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3931,9 +4153,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-               ret = do_fault_around(vmf);
-               if (ret)
-                       return ret;
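+               /*
+                * VMAs registered for userfaultfd minor faults want every
+                * fault reported, so don't let fault-around install ptes
+                * behind userfaultfd's back.
+                */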
+               if (likely(!userfaultfd_minor(vmf->vma))) {
+                       ret = do_fault_around(vmf);
+                       if (ret)
+                               return ret;
+               }
        }
 
        ret = __do_fault(vmf);
@@ -4078,9 +4302,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
        return ret;
 }
 
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int page_nid,
-                               int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                     unsigned long addr, int page_nid, int *flags)
 {
        get_page(page);
 
@@ -4100,7 +4323,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        int page_nid = NUMA_NO_NODE;
        int last_cpupid;
        int target_nid;
-       bool migrated = false;
        pte_t pte, old_pte;
        bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;
@@ -4117,29 +4339,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                goto out;
        }
 
-       /*
-        * Make it present again, Depending on how arch implementes non
-        * accessible ptes, some can allow access by kernel mode.
-        */
-       old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+       /* Get the normal PTE */
+       old_pte = ptep_get(vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
-       pte = pte_mkyoung(pte);
-       if (was_writable)
-               pte = pte_mkwrite(pte);
-       ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-       update_mmu_cache(vma, vmf->address, vmf->pte);
 
        page = vm_normal_page(vma, vmf->address, pte);
-       if (!page) {
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-               return 0;
-       }
+       if (!page)
+               goto out_map;
 
        /* TODO: handle PTE-mapped THP */
-       if (PageCompound(page)) {
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-               return 0;
-       }
+       if (PageCompound(page))
+               goto out_map;
 
        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4149,7 +4359,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
-       if (!pte_write(pte))
+       if (!was_writable)
                flags |= TNF_NO_GROUP;
 
        /*
@@ -4163,24 +4373,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        page_nid = page_to_nid(page);
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
        if (target_nid == NUMA_NO_NODE) {
                put_page(page);
-               goto out;
+               goto out_map;
        }
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 
        /* Migrate to the requested node */
-       migrated = migrate_misplaced_page(page, vma, target_nid);
-       if (migrated) {
+       if (migrate_misplaced_page(page, vma, target_nid)) {
                page_nid = target_nid;
                flags |= TNF_MIGRATED;
-       } else
+       } else {
                flags |= TNF_MIGRATE_FAIL;
+               vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+               spin_lock(vmf->ptl);
+               if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       goto out;
+               }
+               goto out_map;
+       }
 
 out:
        if (page_nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
+out_map:
+       /*
+        * Make it present again. Depending on how the arch implements
+        * non-accessible ptes, some can allow access by kernel mode.
+        */
+       old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+       pte = pte_modify(old_pte, vma->vm_page_prot);
+       pte = pte_mkyoung(pte);
+       if (was_writable)
+               pte = pte_mkwrite(pte);
+       ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       goto out;
 }
 
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
@@ -4193,12 +4424,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 }
 
 /* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
        if (vma_is_anonymous(vmf->vma)) {
-               if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+               if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);
-               return do_huge_pmd_wp_page(vmf, orig_pmd);
+               return do_huge_pmd_wp_page(vmf);
        }
        if (vmf->vma->vm_ops->huge_fault) {
                vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4425,26 +4656,26 @@ retry_pud:
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
-               pmd_t orig_pmd = *vmf.pmd;
+               vmf.orig_pmd = *vmf.pmd;
 
                barrier();
-               if (unlikely(is_swap_pmd(orig_pmd))) {
+               if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
-                                         !is_pmd_migration_entry(orig_pmd));
-                       if (is_pmd_migration_entry(orig_pmd))
+                                         !is_pmd_migration_entry(vmf.orig_pmd));
+                       if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
-               if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-                       if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
-                               return do_huge_pmd_numa_page(&vmf, orig_pmd);
+               if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+                       if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+                               return do_huge_pmd_numa_page(&vmf);
 
-                       if (dirty && !pmd_write(orig_pmd)) {
-                               ret = wp_huge_pmd(&vmf, orig_pmd);
+                       if (dirty && !pmd_write(vmf.orig_pmd)) {
+                               ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
-                               huge_pmd_set_accessed(&vmf, orig_pmd);
+                               huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
@@ -4454,7 +4685,7 @@ retry_pud:
 }
 
 /**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
  *
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
@@ -4463,9 +4694,9 @@ retry_pud:
  * @flags: the fault flags.
  * @ret: the fault retcode.
  *
- * This will take care of most of the page fault accountings.  Meanwhile, it
+ * This will take care of most of the page fault accounting.  Meanwhile, it
  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
 static inline void mm_account_fault(struct pt_regs *regs,
@@ -4799,7 +5030,7 @@ out:
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
  * @buf: buffer to read/write
  * @len: length of transfer
  * @write: set to FOLL_WRITE when writing, otherwise reading
@@ -4891,8 +5122,8 @@ int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
                         * Check if this is a VM_IO | VM_PFNMAP VMA, which
                         * we can access using slightly different code.
                         */
-                       vma = find_vma(mm, addr);
-                       if (!vma || vma->vm_start > addr)
+                       vma = vma_lookup(mm, addr);
+                       if (!vma)
                                break;
                        if (vma->vm_ops && vma->vm_ops->access)
                                ret = vma->vm_ops->access(vma, addr, buf,