tools headers UAPI: Sync linux/prctl.h with the kernel sources
[linux-2.6-microblaze.git]

diff --git a/mm/memory.c b/mm/memory.c
index c8e3576..730daa0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -166,7 +166,7 @@ static int __init init_zero_pfn(void)
        zero_pfn = page_to_pfn(ZERO_PAGE(0));
        return 0;
 }
-core_initcall(init_zero_pfn);
+early_initcall(init_zero_pfn);
 
 void mm_trace_rss_stat(struct mm_struct *mm, int member, long count)
 {
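
With the switch to early_initcall(), zero_pfn is cached from do_pre_smp_initcalls(), before any of the leveled initcalls run (core_initcall() is level 1). A minimal sketch of a hypothetical early consumer that relies on that ordering; this function is illustrative only, not part of the patch, and assumes its object file links after mm/memory.o so the two early initcalls run in this order:

static int __init zero_pfn_early_user(void)
{
        /* init_zero_pfn() has already run, so the cached zero_pfn is
         * valid and is_zero_pfn() recognizes the shared zero page. */
        WARN_ON(!is_zero_pfn(page_to_pfn(ZERO_PAGE(0))));
        return 0;
}
early_initcall(zero_pfn_early_user);
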
@@ -809,12 +809,8 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
                  pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss,
                  struct page **prealloc, pte_t pte, struct page *page)
 {
-       struct mm_struct *src_mm = src_vma->vm_mm;
        struct page *new_page;
 
-       if (!is_cow_mapping(src_vma->vm_flags))
-               return 1;
-
        /*
         * What we want to do is to check whether this page may
         * have been pinned by the parent process.  If so,
@@ -828,9 +824,7 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
         * the page count. That might give false positives for
         * pinning, but it will work correctly.
         */
-       if (likely(!atomic_read(&src_mm->has_pinned)))
-               return 1;
-       if (likely(!page_maybe_dma_pinned(page)))
+       if (likely(!page_needs_cow_for_dma(src_vma, page)))
                return 1;
 
        new_page = *prealloc;
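
The three open-coded checks removed above are folded into a single page_needs_cow_for_dma() call. A hedged reconstruction of that helper, pieced together from the removed lines rather than quoted from include/linux/mm.h, so the exact body may differ:

static inline bool page_needs_cow_for_dma(struct vm_area_struct *vma,
                                          struct page *page)
{
        /* Only private COW mappings ever need the early copy. */
        if (!is_cow_mapping(vma->vm_flags))
                return false;
        /* Fast path: this mm has never pinned any pages. */
        if (!atomic_read(&vma->vm_mm->has_pinned))
                return false;
        /* Inexact check; a false positive only costs an extra copy. */
        return page_maybe_dma_pinned(page);
}
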
@@ -2266,26 +2260,17 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
        return 0;
 }
 
-/**
- * remap_pfn_range - remap kernel memory to userspace
- * @vma: user vma to map to
- * @addr: target page aligned user address to start at
- * @pfn: page frame number of kernel physical memory address
- * @size: size of mapping area
- * @prot: page protection flags for this mapping
- *
- * Note: this is only safe if the mm semaphore is held when called.
- *
- * Return: %0 on success, negative error code otherwise.
+/*
+ * Variant of remap_pfn_range that does not call track_pfn_remap.  The caller
+ * must have pre-validated the caching bits of the pgprot_t.
  */
-int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
-                   unsigned long pfn, unsigned long size, pgprot_t prot)
+int remap_pfn_range_notrack(struct vm_area_struct *vma, unsigned long addr,
+               unsigned long pfn, unsigned long size, pgprot_t prot)
 {
        pgd_t *pgd;
        unsigned long next;
        unsigned long end = addr + PAGE_ALIGN(size);
        struct mm_struct *mm = vma->vm_mm;
-       unsigned long remap_pfn = pfn;
        int err;
 
        if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
@@ -2315,10 +2300,6 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                vma->vm_pgoff = pfn;
        }
 
-       err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size));
-       if (err)
-               return -EINVAL;
-
        vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP;
 
        BUG_ON(addr >= end);
@@ -2330,12 +2311,36 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
                err = remap_p4d_range(mm, pgd, addr, next,
                                pfn + (addr >> PAGE_SHIFT), prot);
                if (err)
-                       break;
+                       return err;
        } while (pgd++, addr = next, addr != end);
 
+       return 0;
+}
+
+/**
+ * remap_pfn_range - remap kernel memory to userspace
+ * @vma: user vma to map to
+ * @addr: target page aligned user address to start at
+ * @pfn: page frame number of kernel physical memory address
+ * @size: size of mapping area
+ * @prot: page protection flags for this mapping
+ *
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
+ */
+int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
+                   unsigned long pfn, unsigned long size, pgprot_t prot)
+{
+       int err;
+
+       err = track_pfn_remap(vma, &prot, pfn, addr, PAGE_ALIGN(size));
        if (err)
-               untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size));
+               return -EINVAL;
 
+       err = remap_pfn_range_notrack(vma, addr, pfn, size, prot);
+       if (err)
+               untrack_pfn(vma, pfn, PAGE_ALIGN(size));
        return err;
 }
 EXPORT_SYMBOL(remap_pfn_range);
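
After the split, remap_pfn_range() only adds PAT tracking (and untrack_pfn() cleanup on failure) around remap_pfn_range_notrack(), which now owns the page-table walk. A short sketch of the usual caller, a device mmap handler; the device name and EXAMPLEDEV_PHYS_BASE are hypothetical:

static int exampledev_mmap(struct file *file, struct vm_area_struct *vma)
{
        unsigned long size = vma->vm_end - vma->vm_start;

        /* MMIO should not be cached; EXAMPLEDEV_PHYS_BASE is assumed. */
        vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
        return remap_pfn_range(vma, vma->vm_start,
                               EXAMPLEDEV_PHYS_BASE >> PAGE_SHIFT,
                               size, vma->vm_page_prot);
}

The _notrack variant is reserved for callers that have already validated the pgprot caching bits themselves, as the new comment above spells out.
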
@@ -2452,13 +2457,21 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
        }
        do {
                next = pmd_addr_end(addr, end);
-               if (create || !pmd_none_or_clear_bad(pmd)) {
-                       err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (pmd_none(*pmd) && !create)
+                       continue;
+               if (WARN_ON_ONCE(pmd_leaf(*pmd)))
+                       return -EINVAL;
+               if (!pmd_none(*pmd) && WARN_ON_ONCE(pmd_bad(*pmd))) {
+                       if (!create)
+                               continue;
+                       pmd_clear_bad(pmd);
                }
+               err = apply_to_pte_range(mm, pmd, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (pmd++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2480,13 +2493,21 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
        }
        do {
                next = pud_addr_end(addr, end);
-               if (create || !pud_none_or_clear_bad(pud)) {
-                       err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (pud_none(*pud) && !create)
+                       continue;
+               if (WARN_ON_ONCE(pud_leaf(*pud)))
+                       return -EINVAL;
+               if (!pud_none(*pud) && WARN_ON_ONCE(pud_bad(*pud))) {
+                       if (!create)
+                               continue;
+                       pud_clear_bad(pud);
                }
+               err = apply_to_pmd_range(mm, pud, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (pud++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2508,13 +2529,21 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
        }
        do {
                next = p4d_addr_end(addr, end);
-               if (create || !p4d_none_or_clear_bad(p4d)) {
-                       err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
-                                                create, mask);
-                       if (err)
-                               break;
+               if (p4d_none(*p4d) && !create)
+                       continue;
+               if (WARN_ON_ONCE(p4d_leaf(*p4d)))
+                       return -EINVAL;
+               if (!p4d_none(*p4d) && WARN_ON_ONCE(p4d_bad(*p4d))) {
+                       if (!create)
+                               continue;
+                       p4d_clear_bad(p4d);
                }
+               err = apply_to_pud_range(mm, p4d, addr, next,
+                                        fn, data, create, mask);
+               if (err)
+                       break;
        } while (p4d++, addr = next, addr != end);
+
        return err;
 }
 
@@ -2534,9 +2563,17 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
        pgd = pgd_offset(mm, addr);
        do {
                next = pgd_addr_end(addr, end);
-               if (!create && pgd_none_or_clear_bad(pgd))
+               if (pgd_none(*pgd) && !create)
                        continue;
-               err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
+               if (WARN_ON_ONCE(pgd_leaf(*pgd)))
+                       return -EINVAL;
+               if (!pgd_none(*pgd) && WARN_ON_ONCE(pgd_bad(*pgd))) {
+                       if (!create)
+                               continue;
+                       pgd_clear_bad(pgd);
+               }
+               err = apply_to_p4d_range(mm, pgd, addr, next,
+                                        fn, data, create, &mask);
                if (err)
                        break;
        } while (pgd++, addr = next, addr != end);
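
All four levels of the walk now follow the same pattern: skip empty entries when !create, return -EINVAL (with a one-shot warning) on unexpected leaf entries, and only clear a bad entry when the caller asked to create. The walk backs apply_to_page_range() and apply_to_existing_page_range(); the callback below is an illustrative example of how they are driven, not code from the patch:

/* pte_fn_t callback: count present PTEs in the walked range. */
static int count_present_pte(pte_t *ptep, unsigned long addr, void *data)
{
        unsigned long *count = data;

        if (pte_present(*ptep))
                (*count)++;
        return 0;
}

static unsigned long example_count_present(struct mm_struct *mm,
                                           unsigned long addr,
                                           unsigned long size)
{
        unsigned long count = 0;

        /* create == false: empty entries are skipped and, after this
         * patch, leaf entries fail the walk instead of being cleared. */
        apply_to_existing_page_range(mm, addr, size,
                                     count_present_pte, &count);
        return count;
}
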
@@ -3103,6 +3140,14 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
                return handle_userfault(vmf, VM_UFFD_WP);
        }
 
+       /*
+        * Userfaultfd write-protect can defer flushes. Ensure the TLB
+        * is flushed in this case before copying.
+        */
+       if (unlikely(userfaultfd_wp(vmf->vma) &&
+                    mm_tlb_flush_pending(vmf->vma->vm_mm)))
+               flush_tlb_page(vmf->vma, vmf->address);
+
        vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
        if (!vmf->page) {
                /*
@@ -3294,7 +3339,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
        }
 
 
-       delayacct_set_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
        page = lookup_swap_cache(entry, vma, vmf->address);
        swapcache = page;
 
@@ -3307,28 +3352,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                                        vmf->address);
                        if (page) {
-                               int err;
-
                                __SetPageLocked(page);
                                __SetPageSwapBacked(page);
-                               set_page_private(page, entry.val);
 
-                               /* Tell memcg to use swap ownership records */
-                               SetPageSwapCache(page);
-                               err = mem_cgroup_charge(page, vma->vm_mm,
-                                                       GFP_KERNEL);
-                               ClearPageSwapCache(page);
-                               if (err) {
+                               if (mem_cgroup_swapin_charge_page(page,
+                                       vma->vm_mm, GFP_KERNEL, entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
+                               mem_cgroup_swapin_uncharge_swap(entry);
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(page, shadow);
 
                                lru_cache_add(page);
+
+                               /* To provide entry to swap_readpage() */
+                               set_page_private(page, entry.val);
                                swap_readpage(page, true);
+                               set_page_private(page, 0);
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
@@ -3345,7 +3388,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                                        vmf->address, &vmf->ptl);
                        if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
                                ret = VM_FAULT_OOM;
-                       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+                       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                        goto unlock;
                }
 
@@ -3359,13 +3402,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                 * owner processes (which may be unknown at hwpoison time)
                 */
                ret = VM_FAULT_HWPOISON;
-               delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+               delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
                goto out_release;
        }
 
        locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
 
-       delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+       delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
        if (!locked) {
                ret |= VM_FAULT_RETRY;
                goto out_release;
@@ -3684,7 +3727,7 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
                return ret;
 
        /*
-        * Archs like ppc64 need additonal space to store information
+        * Archs like ppc64 need additional space to store information
         * related to pte entry. Use the preallocated table for that.
         */
        if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
@@ -4098,7 +4141,6 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        int page_nid = NUMA_NO_NODE;
        int last_cpupid;
        int target_nid;
-       bool migrated = false;
        pte_t pte, old_pte;
        bool was_writable = pte_savedwrite(vmf->orig_pte);
        int flags = 0;
@@ -4115,29 +4157,17 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
                goto out;
        }
 
-       /*
-        * Make it present again, Depending on how arch implementes non
-        * accessible ptes, some can allow access by kernel mode.
-        */
-       old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+       /* Get the normal PTE  */
+       old_pte = ptep_get(vmf->pte);
        pte = pte_modify(old_pte, vma->vm_page_prot);
-       pte = pte_mkyoung(pte);
-       if (was_writable)
-               pte = pte_mkwrite(pte);
-       ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
-       update_mmu_cache(vma, vmf->address, vmf->pte);
 
        page = vm_normal_page(vma, vmf->address, pte);
-       if (!page) {
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-               return 0;
-       }
+       if (!page)
+               goto out_map;
 
        /* TODO: handle PTE-mapped THP */
-       if (PageCompound(page)) {
-               pte_unmap_unlock(vmf->pte, vmf->ptl);
-               return 0;
-       }
+       if (PageCompound(page))
+               goto out_map;
 
        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
@@ -4147,7 +4177,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
         * pte_dirty has unpredictable behaviour between PTE scan updates,
         * background writeback, dirty balancing and application behaviour.
         */
-       if (!pte_write(pte))
+       if (!was_writable)
                flags |= TNF_NO_GROUP;
 
        /*
@@ -4161,24 +4191,45 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
        page_nid = page_to_nid(page);
        target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
                        &flags);
-       pte_unmap_unlock(vmf->pte, vmf->ptl);
        if (target_nid == NUMA_NO_NODE) {
                put_page(page);
-               goto out;
+               goto out_map;
        }
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
 
        /* Migrate to the requested node */
-       migrated = migrate_misplaced_page(page, vma, target_nid);
-       if (migrated) {
+       if (migrate_misplaced_page(page, vma, target_nid)) {
                page_nid = target_nid;
                flags |= TNF_MIGRATED;
-       } else
+       } else {
                flags |= TNF_MIGRATE_FAIL;
+               vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
+               spin_lock(vmf->ptl);
+               if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) {
+                       pte_unmap_unlock(vmf->pte, vmf->ptl);
+                       goto out;
+               }
+               goto out_map;
+       }
 
 out:
        if (page_nid != NUMA_NO_NODE)
                task_numa_fault(last_cpupid, page_nid, 1, flags);
        return 0;
+out_map:
+       /*
+        * Make it present again, depending on how arch implements
+        * non-accessible ptes, some can allow access by kernel mode.
+        */
+       old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+       pte = pte_modify(old_pte, vma->vm_page_prot);
+       pte = pte_mkyoung(pte);
+       if (was_writable)
+               pte = pte_mkwrite(pte);
+       ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
+       update_mmu_cache(vma, vmf->address, vmf->pte);
+       pte_unmap_unlock(vmf->pte, vmf->ptl);
+       goto out;
 }
 
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
@@ -4452,7 +4503,7 @@ retry_pud:
 }
 
 /**
- * mm_account_fault - Do page fault accountings
+ * mm_account_fault - Do page fault accounting
  *
  * @regs: the pt_regs struct pointer.  When set to NULL, will skip accounting
  *        of perf event counters, but we'll still do the per-task accounting to
@@ -4461,9 +4512,9 @@ retry_pud:
  * @flags: the fault flags.
  * @ret: the fault retcode.
  *
- * This will take care of most of the page fault accountings.  Meanwhile, it
+ * This will take care of most of the page fault accounting.  Meanwhile, it
  * will also include the PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter
- * updates.  However note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
+ * updates.  However, note that the handling of PERF_COUNT_SW_PAGE_FAULTS should
  * still be in per-arch page fault handlers at the entry of page fault.
  */
 static inline void mm_account_fault(struct pt_regs *regs,
@@ -4797,7 +4848,7 @@ out:
 /**
  * generic_access_phys - generic implementation for iomem mmap access
  * @vma: the vma to access
- * @addr: userspace addres, not relative offset within @vma
+ * @addr: userspace address, not relative offset within @vma
  * @buf: buffer to read/write
  * @len: length of transfer
  * @write: set to FOLL_WRITE when writing, otherwise reading
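
generic_access_phys() is what lets ptrace() and /proc/<pid>/mem accesses reach VM_IO/VM_PFNMAP mappings. A hedged sketch of how a driver would hook it up, mirroring what drivers/char/mem.c does for /dev/mem; the structure name is hypothetical:

static const struct vm_operations_struct exampledev_vm_ops = {
#ifdef CONFIG_HAVE_IOREMAP_PROT
        /* Enables access_process_vm() on this mapping. */
        .access = generic_access_phys,
#endif
};

The driver's mmap handler would set vma->vm_ops = &exampledev_vm_ops alongside its remap_pfn_range() call.
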