diff --git a/mm/memory.c b/mm/memory.c
index 48c4576..747a01d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -699,6 +699,68 @@ out:
 }
 #endif
 
+static void restore_exclusive_pte(struct vm_area_struct *vma,
+                                 struct page *page, unsigned long address,
+                                 pte_t *ptep)
+{
+       pte_t pte;
+       swp_entry_t entry;
+
+       pte = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
+       if (pte_swp_soft_dirty(*ptep))
+               pte = pte_mksoft_dirty(pte);
+
+       entry = pte_to_swp_entry(*ptep);
+       if (pte_swp_uffd_wp(*ptep))
+               pte = pte_mkuffd_wp(pte);
+       else if (is_writable_device_exclusive_entry(entry))
+               pte = maybe_mkwrite(pte_mkdirty(pte), vma);
+
+       set_pte_at(vma->vm_mm, address, ptep, pte);
+
+       /*
+        * No need to take a page reference as one was already
+        * created when the swap entry was made.
+        */
+       if (PageAnon(page))
+               page_add_anon_rmap(page, vma, address, false);
+       else
+               /*
+                * Currently device exclusive access only supports anonymous
+                * memory so the entry shouldn't point to a filebacked page.
+                */
+               WARN_ON_ONCE(!PageAnon(page));
+
+       if (vma->vm_flags & VM_LOCKED)
+               mlock_vma_page(page);
+
+       /*
+        * No need to invalidate - it was non-present before. However
+        * secondary CPUs may have mappings that need invalidating.
+        */
+       update_mmu_cache(vma, address, ptep);
+}
+
+/*
+ * Tries to restore an exclusive pte if the page lock can be acquired without
+ * sleeping.
+ */
+static int
+try_restore_exclusive_pte(pte_t *src_pte, struct vm_area_struct *vma,
+                       unsigned long addr)
+{
+       swp_entry_t entry = pte_to_swp_entry(*src_pte);
+       struct page *page = pfn_swap_entry_to_page(entry);
+
+       if (trylock_page(page)) {
+               restore_exclusive_pte(vma, page, addr, src_pte);
+               unlock_page(page);
+               return 0;
+       }
+
+       return -EBUSY;
+}
+
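
A device exclusive entry is a non-present pfn swap entry, so everything restore_exclusive_pte() needs to rebuild a working pte - the backing page and whether the original mapping was writable - can be recovered from the entry itself; callers that cannot sleep use try_restore_exclusive_pte() and fall back to the fault path on -EBUSY. The sketch below is illustrative only, not part of the patch (it assumes linux/mm.h and linux/swapops.h), and shows just that decoding step using the helpers visible above:

/*
 * Illustrative sketch: recover the page and the original write permission
 * from a device exclusive swap entry, as restore_exclusive_pte() does.
 */
static struct page *example_exclusive_entry_decode(pte_t ptent, bool *writable)
{
	swp_entry_t entry = pte_to_swp_entry(ptent);

	*writable = is_writable_device_exclusive_entry(entry);
	return pfn_swap_entry_to_page(entry);
}
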
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -707,17 +769,17 @@ out:
 
 static unsigned long
 copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-               unsigned long addr, int *rss)
+               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma,
+               struct vm_area_struct *src_vma, unsigned long addr, int *rss)
 {
-       unsigned long vm_flags = vma->vm_flags;
+       unsigned long vm_flags = dst_vma->vm_flags;
        pte_t pte = *src_pte;
        struct page *page;
        swp_entry_t entry = pte_to_swp_entry(pte);
 
        if (likely(!non_swap_entry(entry))) {
                if (swap_duplicate(entry) < 0)
-                       return entry.val;
+                       return -EIO;
 
                /* make sure dst_mm is on swapoff's mmlist. */
                if (unlikely(list_empty(&dst_mm->mmlist))) {
@@ -729,17 +791,18 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                }
                rss[MM_SWAPENTS]++;
        } else if (is_migration_entry(entry)) {
-               page = migration_entry_to_page(entry);
+               page = pfn_swap_entry_to_page(entry);
 
                rss[mm_counter(page)]++;
 
-               if (is_write_migration_entry(entry) &&
+               if (is_writable_migration_entry(entry) &&
                                is_cow_mapping(vm_flags)) {
                        /*
                         * COW mappings require pages in both
                         * parent and child to be set to read.
                         */
-                       make_migration_entry_read(&entry);
+                       entry = make_readable_migration_entry(
+                                                       swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_soft_dirty(*src_pte))
                                pte = pte_swp_mksoft_dirty(pte);
@@ -748,7 +811,7 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
        } else if (is_device_private_entry(entry)) {
-               page = device_private_entry_to_page(entry);
+               page = pfn_swap_entry_to_page(entry);
 
                /*
                 * Update rss count even for unaddressable pages, as
@@ -770,15 +833,29 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 * when a device driver is involved (you cannot easily
                 * save and restore device driver state).
                 */
-               if (is_write_device_private_entry(entry) &&
+               if (is_writable_device_private_entry(entry) &&
                    is_cow_mapping(vm_flags)) {
-                       make_device_private_entry_read(&entry);
+                       entry = make_readable_device_private_entry(
+                                                       swp_offset(entry));
                        pte = swp_entry_to_pte(entry);
                        if (pte_swp_uffd_wp(*src_pte))
                                pte = pte_swp_mkuffd_wp(pte);
                        set_pte_at(src_mm, addr, src_pte, pte);
                }
+       } else if (is_device_exclusive_entry(entry)) {
+               /*
+                * Make device exclusive entries present by restoring the
+                * original entry then copying as for a present pte. Device
+                * exclusive entries currently only support private writable
+                * (ie. COW) mappings.
+                */
+               VM_BUG_ON(!is_cow_mapping(src_vma->vm_flags));
+               if (try_restore_exclusive_pte(src_pte, src_vma, addr))
+                       return -EBUSY;
+               return -ENOENT;
        }
+       if (!userfaultfd_wp(dst_vma))
+               pte = pte_swp_clear_uffd_wp(pte);
        set_pte_at(dst_mm, addr, dst_pte, pte);
        return 0;
 }
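
copy_nonpresent_pte() now reports a small set of error codes instead of returning a raw swp_entry_t value: 0 means the non-present entry was copied, -EIO means swap_duplicate() needs a swap count continuation, -EBUSY means the device exclusive page lock could not be taken without sleeping, and -ENOENT means the exclusive entry was restored to a present pte that the caller should copy with copy_present_pte(). The classifier below is a hypothetical, illustrative restatement of that contract (the enum and function names are not kernel code) and mirrors the copy_pte_range() handling further down:

/* Hypothetical summary of the copy_nonpresent_pte() return contract. */
enum copy_np_action {
	NP_NEXT_PTE,		/* 0: entry copied, move to the next pte */
	NP_SWAP_CONTINUATION,	/* -EIO: add_swap_count_continuation(), then retry */
	NP_BACK_OFF,		/* -EBUSY: exclusive page lock busy, bail out */
	NP_COPY_PRESENT,	/* -ENOENT: entry restored, copy as a present pte */
};

static enum copy_np_action example_classify_copy_np(int ret)
{
	switch (ret) {
	case 0:
		return NP_NEXT_PTE;
	case -EIO:
		return NP_SWAP_CONTINUATION;
	case -EBUSY:
		return NP_BACK_OFF;
	case -ENOENT:
		return NP_COPY_PRESENT;
	default:
		WARN_ON_ONCE(1);
		return NP_BACK_OFF;
	}
}
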
@@ -844,6 +921,9 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
        /* All done, just insert the new page copy in the child */
        pte = mk_pte(new_page, dst_vma->vm_page_prot);
        pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma);
+       if (userfaultfd_pte_wp(dst_vma, *src_pte))
+               /* Uffd-wp needs to be delivered to dest pte as well */
+               pte = pte_wrprotect(pte_mkuffd_wp(pte));
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
        return 0;
 }
@@ -893,12 +973,7 @@ copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                pte = pte_mkclean(pte);
        pte = pte_mkold(pte);
 
-       /*
-        * Make sure the _PAGE_UFFD_WP bit is cleared if the new VMA
-        * does not have the VM_UFFD_WP, which means that the uffd
-        * fork event is not enabled.
-        */
-       if (!(vm_flags & VM_UFFD_WP))
+       if (!userfaultfd_wp(dst_vma))
                pte = pte_clear_uffd_wp(pte);
 
        set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte);
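
The open-coded VM_UFFD_WP test is replaced by userfaultfd_wp(dst_vma), which wraps the same flag check behind the helper from include/linux/userfaultfd_k.h, so the uffd-wp bit is still cleared whenever the destination VMA has not enabled write protection. A minimal sketch of what the helper amounts to, for illustration only (the real definition lives in userfaultfd_k.h):

/* Illustrative equivalent of userfaultfd_wp(): a VM_UFFD_WP flag test. */
static bool example_userfaultfd_wp(struct vm_area_struct *vma)
{
	return vma->vm_flags & VM_UFFD_WP;
}
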
@@ -971,13 +1046,25 @@ again:
                        continue;
                }
                if (unlikely(!pte_present(*src_pte))) {
-                       entry.val = copy_nonpresent_pte(dst_mm, src_mm,
-                                                       dst_pte, src_pte,
-                                                       src_vma, addr, rss);
-                       if (entry.val)
+                       ret = copy_nonpresent_pte(dst_mm, src_mm,
+                                                 dst_pte, src_pte,
+                                                 dst_vma, src_vma,
+                                                 addr, rss);
+                       if (ret == -EIO) {
+                               entry = pte_to_swp_entry(*src_pte);
                                break;
-                       progress += 8;
-                       continue;
+                       } else if (ret == -EBUSY) {
+                               break;
+                       } else if (!ret) {
+                               progress += 8;
+                               continue;
+                       }
+
+                       /*
+                        * Device exclusive entry restored, continue by copying
+                        * the now present pte.
+                        */
+                       WARN_ON_ONCE(ret != -ENOENT);
                }
                /* copy_present_pte() will clear `*prealloc' if consumed */
                ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte,
@@ -1008,20 +1095,26 @@ again:
        pte_unmap_unlock(orig_dst_pte, dst_ptl);
        cond_resched();
 
-       if (entry.val) {
+       if (ret == -EIO) {
+               VM_WARN_ON_ONCE(!entry.val);
                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
                        ret = -ENOMEM;
                        goto out;
                }
                entry.val = 0;
-       } else if (ret) {
-               WARN_ON_ONCE(ret != -EAGAIN);
+       } else if (ret == -EBUSY) {
+               goto out;
+       } else if (ret == -EAGAIN) {
                prealloc = page_copy_prealloc(src_mm, src_vma, addr);
                if (!prealloc)
                        return -ENOMEM;
-               /* We've captured and resolved the error. Reset, try again. */
-               ret = 0;
+       } else if (ret) {
+               VM_WARN_ON_ONCE(1);
        }
+
+       /* We've captured and resolved the error. Reset, try again. */
+       ret = 0;
+
        if (addr != end)
                goto again;
 out:
@@ -1050,8 +1143,8 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
                        || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma);
-                       err = copy_huge_pmd(dst_mm, src_mm,
-                                           dst_pmd, src_pmd, addr, src_vma);
+                       err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd,
+                                           addr, dst_vma, src_vma);
                        if (err == -ENOMEM)
                                return -ENOMEM;
                        if (!err)
@@ -1278,8 +1371,9 @@ again:
                }
 
                entry = pte_to_swp_entry(ptent);
-               if (is_device_private_entry(entry)) {
-                       struct page *page = device_private_entry_to_page(entry);
+               if (is_device_private_entry(entry) ||
+                   is_device_exclusive_entry(entry)) {
+                       struct page *page = pfn_swap_entry_to_page(entry);
 
                        if (unlikely(details && details->check_mapping)) {
                                /*
@@ -1294,7 +1388,10 @@ again:
 
                        pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
                        rss[mm_counter(page)]--;
-                       page_remove_rmap(page, false);
+
+                       if (is_device_private_entry(entry))
+                               page_remove_rmap(page, false);
+
                        put_page(page);
                        continue;
                }
@@ -1308,7 +1405,7 @@ again:
                else if (is_migration_entry(entry)) {
                        struct page *page;
 
-                       page = migration_entry_to_page(entry);
+                       page = pfn_swap_entry_to_page(entry);
                        rss[mm_counter(page)]--;
                }
                if (unlikely(!free_swap_and_cache(entry)))
@@ -3342,6 +3439,34 @@ void unmap_mapping_range(struct address_space *mapping,
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
+/*
+ * Restore a potential device exclusive pte to a working pte entry
+ */
+static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
+{
+       struct page *page = vmf->page;
+       struct vm_area_struct *vma = vmf->vma;
+       struct mmu_notifier_range range;
+
+       if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags))
+               return VM_FAULT_RETRY;
+       mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma,
+                               vma->vm_mm, vmf->address & PAGE_MASK,
+                               (vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
+       mmu_notifier_invalidate_range_start(&range);
+
+       vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
+                               &vmf->ptl);
+       if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
+               restore_exclusive_pte(vma, page, vmf->address, vmf->pte);
+
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       unlock_page(page);
+
+       mmu_notifier_invalidate_range_end(&range);
+       return 0;
+}
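
remove_device_exclusive_entry() brackets the restore with an MMU_NOTIFY_EXCLUSIVE notifier range whose owner is NULL, so the device holding the exclusive mapping is told to invalidate it once the CPU takes the page back. A driver that creates exclusive entries can use the event and owner fields to skip the invalidation raised by its own make-exclusive call while still honouring this one. The callback below is a hedged, illustrative sketch of that filtering; the example_* names and the struct layout are hypothetical and not part of this patch:

/* Hypothetical driver state, for illustration only. */
struct example_dev {
	struct mmu_notifier notifier;
	void *exclusive_owner;	/* owner token passed when making entries exclusive */
};

/* Hypothetical: tear down device mappings covering [start, end). */
static void example_dev_unmap(struct example_dev *dev,
			      unsigned long start, unsigned long end)
{
}

static int example_invalidate_range_start(struct mmu_notifier *mn,
					  const struct mmu_notifier_range *range)
{
	struct example_dev *dev = container_of(mn, struct example_dev, notifier);

	/*
	 * The driver's own exclusive grab raises MMU_NOTIFY_EXCLUSIVE with
	 * its owner token and may be skipped; the CPU restore above passes
	 * owner == NULL, so it always invalidates the device mapping.
	 */
	if (range->event == MMU_NOTIFY_EXCLUSIVE &&
	    range->owner == dev->exclusive_owner)
		return 0;

	example_dev_unmap(dev, range->start, range->end);
	return 0;
}
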
+
 /*
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), and pte mapped but not yet locked.
@@ -3370,8 +3495,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                if (is_migration_entry(entry)) {
                        migration_entry_wait(vma->vm_mm, vmf->pmd,
                                             vmf->address);
+               } else if (is_device_exclusive_entry(entry)) {
+                       vmf->page = pfn_swap_entry_to_page(entry);
+                       ret = remove_device_exclusive_entry(vmf);
                } else if (is_device_private_entry(entry)) {
-                       vmf->page = device_private_entry_to_page(entry);
+                       vmf->page = pfn_swap_entry_to_page(entry);
                        ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
                } else if (is_hwpoison_entry(entry)) {
                        ret = VM_FAULT_HWPOISON;
@@ -4025,9 +4153,11 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
         * something).
         */
        if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
-               ret = do_fault_around(vmf);
-               if (ret)
-                       return ret;
+               if (likely(!userfaultfd_minor(vmf->vma))) {
+                       ret = do_fault_around(vmf);
+                       if (ret)
+                               return ret;
+               }
        }
 
        ret = __do_fault(vmf);
@@ -4172,9 +4302,8 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
        return ret;
 }
 
-static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
-                               unsigned long addr, int page_nid,
-                               int *flags)
+int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
+                     unsigned long addr, int page_nid, int *flags)
 {
        get_page(page);
 
@@ -4295,12 +4424,12 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 }
 
 /* `inline' is required to avoid gcc 4.1.2 build error */
-static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd)
+static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
        if (vma_is_anonymous(vmf->vma)) {
-               if (userfaultfd_huge_pmd_wp(vmf->vma, orig_pmd))
+               if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
                        return handle_userfault(vmf, VM_UFFD_WP);
-               return do_huge_pmd_wp_page(vmf, orig_pmd);
+               return do_huge_pmd_wp_page(vmf);
        }
        if (vmf->vma->vm_ops->huge_fault) {
                vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PMD);
@@ -4527,26 +4656,26 @@ retry_pud:
                if (!(ret & VM_FAULT_FALLBACK))
                        return ret;
        } else {
-               pmd_t orig_pmd = *vmf.pmd;
+               vmf.orig_pmd = *vmf.pmd;
 
                barrier();
-               if (unlikely(is_swap_pmd(orig_pmd))) {
+               if (unlikely(is_swap_pmd(vmf.orig_pmd))) {
                        VM_BUG_ON(thp_migration_supported() &&
-                                         !is_pmd_migration_entry(orig_pmd));
-                       if (is_pmd_migration_entry(orig_pmd))
+                                         !is_pmd_migration_entry(vmf.orig_pmd));
+                       if (is_pmd_migration_entry(vmf.orig_pmd))
                                pmd_migration_entry_wait(mm, vmf.pmd);
                        return 0;
                }
-               if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
-                       if (pmd_protnone(orig_pmd) && vma_is_accessible(vma))
-                               return do_huge_pmd_numa_page(&vmf, orig_pmd);
+               if (pmd_trans_huge(vmf.orig_pmd) || pmd_devmap(vmf.orig_pmd)) {
+                       if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+                               return do_huge_pmd_numa_page(&vmf);
 
-                       if (dirty && !pmd_write(orig_pmd)) {
-                               ret = wp_huge_pmd(&vmf, orig_pmd);
+                       if (dirty && !pmd_write(vmf.orig_pmd)) {
+                               ret = wp_huge_pmd(&vmf);
                                if (!(ret & VM_FAULT_FALLBACK))
                                        return ret;
                        } else {
-                               huge_pmd_set_accessed(&vmf, orig_pmd);
+                               huge_pmd_set_accessed(&vmf);
                                return 0;
                        }
                }
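
These hunks depend on struct vm_fault carrying the faulting pmd value as vmf->orig_pmd (added by a related patch in this series), which lets wp_huge_pmd(), do_huge_pmd_numa_page() and huge_pmd_set_accessed() drop the extra orig_pmd argument. A simplified, illustrative restatement of the resulting pattern follows; it is not additional kernel code, and the VM_FAULT_FALLBACK handling of the real __handle_mm_fault() is omitted:

/* Illustrative only: snapshot the pmd once, then let helpers read vmf->orig_pmd. */
static vm_fault_t example_handle_huge_pmd(struct vm_fault *vmf, bool write)
{
	vmf->orig_pmd = *vmf->pmd;
	barrier();

	if (unlikely(is_swap_pmd(vmf->orig_pmd))) {
		if (is_pmd_migration_entry(vmf->orig_pmd))
			pmd_migration_entry_wait(vmf->vma->vm_mm, vmf->pmd);
		return 0;
	}

	if (pmd_trans_huge(vmf->orig_pmd) || pmd_devmap(vmf->orig_pmd)) {
		if (pmd_protnone(vmf->orig_pmd) && vma_is_accessible(vmf->vma))
			return do_huge_pmd_numa_page(vmf);
		if (write && !pmd_write(vmf->orig_pmd))
			return wp_huge_pmd(vmf);
		huge_pmd_set_accessed(vmf);
	}
	return 0;
}
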