[linux-2.6-microblaze.git] mm/memory.c
diff --git a/mm/memory.c b/mm/memory.c
index c387430..ff17850 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/export.h>
 #include <linux/delayacct.h>
 #include <linux/init.h>
+#include <linux/pfn_t.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
 #include <linux/mmu_notifier.h>
@@ -566,7 +567,6 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
 {
        spinlock_t *ptl;
        pgtable_t new = pte_alloc_one(mm, address);
-       int wait_split_huge_page;
        if (!new)
                return -ENOMEM;
 
@@ -586,18 +586,14 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
        smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
 
        ptl = pmd_lock(mm, pmd);
-       wait_split_huge_page = 0;
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                atomic_long_inc(&mm->nr_ptes);
                pmd_populate(mm, pmd, new);
                new = NULL;
-       } else if (unlikely(pmd_trans_splitting(*pmd)))
-               wait_split_huge_page = 1;
+       }
        spin_unlock(ptl);
        if (new)
                pte_free(mm, new);
-       if (wait_split_huge_page)
-               wait_split_huge_page(vma->anon_vma, pmd);
        return 0;
 }
 
@@ -613,8 +609,7 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
        if (likely(pmd_none(*pmd))) {   /* Has another populated it ? */
                pmd_populate_kernel(&init_mm, pmd, new);
                new = NULL;
-       } else
-               VM_BUG_ON(pmd_trans_splitting(*pmd));
+       }
        spin_unlock(&init_mm.page_table_lock);
        if (new)
                pte_free_kernel(&init_mm, new);
@@ -832,10 +827,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                } else if (is_migration_entry(entry)) {
                        page = migration_entry_to_page(entry);
 
-                       if (PageAnon(page))
-                               rss[MM_ANONPAGES]++;
-                       else
-                               rss[MM_FILEPAGES]++;
+                       rss[mm_counter(page)]++;
 
                        if (is_write_migration_entry(entry) &&
                                        is_cow_mapping(vm_flags)) {
@@ -873,11 +865,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
-               page_dup_rmap(page);
-               if (PageAnon(page))
-                       rss[MM_ANONPAGES]++;
-               else
-                       rss[MM_FILEPAGES]++;
+               page_dup_rmap(page, false);
+               rss[mm_counter(page)]++;
        }
 
 out_set_pte:
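
The open-coded MM_ANONPAGES/MM_FILEPAGES accounting collapses into an mm_counter(page) lookup. A minimal sketch of the helpers this assumes (added to include/linux/mm.h by this series; approximate, not verbatim):

    static inline int mm_counter_file(struct page *page)
    {
            /* shmem/tmpfs pages are swap-backed and get their own counter */
            if (PageSwapBacked(page))
                    return MM_SHMEMPAGES;
            return MM_FILEPAGES;
    }

    static inline int mm_counter(struct page *page)
    {
            if (PageAnon(page))
                    return MM_ANONPAGES;
            return mm_counter_file(page);
    }

With this, copy_one_pte(), zap_pte_range() and the fault paths below pick the right rss counter from the page itself instead of branching on PageAnon() at every call site.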
@@ -961,7 +950,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
        src_pmd = pmd_offset(src_pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*src_pmd)) {
+               if (pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) {
                        int err;
                        VM_BUG_ON(next-addr != HPAGE_PMD_SIZE);
                        err = copy_huge_pmd(dst_mm, src_mm,
@@ -1113,9 +1102,8 @@ again:
                        tlb_remove_tlb_entry(tlb, pte, addr);
                        if (unlikely(!page))
                                continue;
-                       if (PageAnon(page))
-                               rss[MM_ANONPAGES]--;
-                       else {
+
+                       if (!PageAnon(page)) {
                                if (pte_dirty(ptent)) {
                                        force_flush = 1;
                                        set_page_dirty(page);
@@ -1123,9 +1111,9 @@ again:
                                if (pte_young(ptent) &&
                                    likely(!(vma->vm_flags & VM_SEQ_READ)))
                                        mark_page_accessed(page);
-                               rss[MM_FILEPAGES]--;
                        }
-                       page_remove_rmap(page);
+                       rss[mm_counter(page)]--;
+                       page_remove_rmap(page, false);
                        if (unlikely(page_mapcount(page) < 0))
                                print_bad_pte(vma, addr, ptent, page);
                        if (unlikely(!__tlb_remove_page(tlb, page))) {
@@ -1146,11 +1134,7 @@ again:
                        struct page *page;
 
                        page = migration_entry_to_page(entry);
-
-                       if (PageAnon(page))
-                               rss[MM_ANONPAGES]--;
-                       else
-                               rss[MM_FILEPAGES]--;
+                       rss[mm_counter(page)]--;
                }
                if (unlikely(!free_swap_and_cache(entry)))
                        print_bad_pte(vma, addr, ptent, NULL);
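
The rmap entry points now take an explicit 'compound' argument so the same functions cover both PTE-mapped and PMD-mapped (THP) pages; every caller in this file passes false. Assumed companion prototypes (include/linux/rmap.h, sketch):

    void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
                                unsigned long address, bool compound);
    void page_remove_rmap(struct page *, bool compound);

    /* and the fork-time helper used in copy_one_pte() above (sketch) */
    static inline void page_dup_rmap(struct page *page, bool compound)
    {
            atomic_inc(compound ? compound_mapcount_ptr(page) : &page->_mapcount);
    }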
@@ -1193,7 +1177,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
-               if (pmd_trans_huge(*pmd)) {
+               if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
                        if (next - addr != HPAGE_PMD_SIZE) {
 #ifdef CONFIG_DEBUG_VM
                                if (!rwsem_is_locked(&tlb->mm->mmap_sem)) {
@@ -1204,7 +1188,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                        BUG();
                                }
 #endif
-                               split_huge_page_pmd(vma, addr, pmd);
+                               split_huge_pmd(vma, pmd, addr);
                        } else if (zap_huge_pmd(tlb, vma, pmd, addr))
                                goto next;
                        /* fall through */
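
split_huge_page_pmd(vma, addr, pmd) is replaced by split_huge_pmd(vma, pmd, addr) (note the argument order), which only splits the PMD into a page table of PTEs; splitting the underlying compound page is now a separate operation. The wrapper assumed here looks roughly like (include/linux/huge_mm.h, sketch):

    #define split_huge_pmd(__vma, __pmd, __address)                         \
            do {                                                            \
                    pmd_t *____pmd = (__pmd);                               \
                    if (pmd_trans_huge(*____pmd) || pmd_devmap(*____pmd))   \
                            __split_huge_pmd(__vma, __pmd, __address);      \
            } while (0)

The added pmd_devmap() checks in copy_pmd_range() and zap_pmd_range() make device-DAX huge mappings take the same huge-PMD paths as transparent huge pages.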
@@ -1460,7 +1444,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 
        /* Ok, finally just insert the thing.. */
        get_page(page);
-       inc_mm_counter_fast(mm, MM_FILEPAGES);
+       inc_mm_counter_fast(mm, mm_counter_file(page));
        page_add_file_rmap(page);
        set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
@@ -1517,7 +1501,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
 EXPORT_SYMBOL(vm_insert_page);
 
 static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
-                       unsigned long pfn, pgprot_t prot)
+                       pfn_t pfn, pgprot_t prot)
 {
        struct mm_struct *mm = vma->vm_mm;
        int retval;
@@ -1533,7 +1517,10 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                goto out_unlock;
 
        /* Ok, finally just insert the thing.. */
-       entry = pte_mkspecial(pfn_pte(pfn, prot));
+       if (pfn_t_devmap(pfn))
+               entry = pte_mkdevmap(pfn_t_pte(pfn, prot));
+       else
+               entry = pte_mkspecial(pfn_t_pte(pfn, prot));
        set_pte_at(mm, addr, pte, entry);
        update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
 
@@ -1580,17 +1567,17 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
 
        if (addr < vma->vm_start || addr >= vma->vm_end)
                return -EFAULT;
-       if (track_pfn_insert(vma, &pgprot, pfn))
+       if (track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)))
                return -EINVAL;
 
-       ret = insert_pfn(vma, addr, pfn, pgprot);
+       ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot);
 
        return ret;
 }
 EXPORT_SYMBOL(vm_insert_pfn);
 
 int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
-                       unsigned long pfn)
+                       pfn_t pfn)
 {
        BUG_ON(!(vma->vm_flags & VM_MIXEDMAP));
 
@@ -1604,10 +1591,10 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
         * than insert_pfn).  If a zero_pfn were inserted into a VM_MIXEDMAP
         * without pte special, it would there be refcounted as a normal page.
         */
-       if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) {
+       if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
                struct page *page;
 
-               page = pfn_to_page(pfn);
+               page = pfn_t_to_page(pfn);
                return insert_page(vma, addr, page, vma->vm_page_prot);
        }
        return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
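
vm_insert_mixed() and insert_pfn() switch from a bare unsigned long pfn to pfn_t, which carries type flags in the otherwise unused high bits so callers such as DAX can mark a mapping as device memory (PFN_DEV) with or without struct pages behind it (PFN_MAP). A rough sketch of the type (include/linux/pfn.h and include/linux/pfn_t.h; approximate):

    typedef struct {
            u64 val;        /* pfn in the low bits, flags in the high bits */
    } pfn_t;

    #define PFN_FLAGS_MASK (((u64) ~PAGE_MASK) << (BITS_PER_LONG_LONG - PAGE_SHIFT))
    #define PFN_DEV (1ULL << (BITS_PER_LONG_LONG - 3))
    #define PFN_MAP (1ULL << (BITS_PER_LONG_LONG - 4))

    static inline pfn_t __pfn_to_pfn_t(unsigned long pfn, u64 flags)
    {
            pfn_t pfn_t = { .val = pfn | (flags & PFN_FLAGS_MASK), };

            return pfn_t;
    }

    /* devmap: device memory that does have struct pages backing it */
    static inline bool pfn_t_devmap(pfn_t pfn)
    {
            const u64 flags = PFN_DEV|PFN_MAP;

            return (pfn.val & flags) == flags;
    }

insert_pfn() uses pfn_t_devmap() to choose between pte_mkdevmap() and pte_mkspecial(), while vm_insert_pfn() keeps its unsigned long interface and wraps the value with __pfn_to_pfn_t(pfn, PFN_DEV).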
@@ -1949,6 +1936,20 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
                copy_user_highpage(dst, src, va, vma);
 }
 
+static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
+{
+       struct file *vm_file = vma->vm_file;
+
+       if (vm_file)
+               return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO;
+
+       /*
+        * Special mappings (e.g. VDSO) do not have any file so fake
+        * a default GFP_KERNEL for them.
+        */
+       return GFP_KERNEL;
+}
+
 /*
  * Notify the address space that the page is about to become writable so that
  * it can prohibit this or wait for the page to get into an appropriate state.
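
__get_fault_gfp_mask() centralizes the allocation mask that the fault paths pass down to ->fault(), ->map_pages() and ->page_mkwrite() handlers, so filesystems and DAX no longer have to rederive it from the mapping. It assumes a companion vm_fault change along these lines (include/linux/mm.h, sketch of the relevant fields only):

    struct vm_fault {
            unsigned int flags;             /* FAULT_FLAG_xxx flags */
            gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
            pgoff_t pgoff;                  /* logical page offset based on vma */
            void __user *virtual_address;   /* faulting virtual address */
            struct page *cow_page;          /* handler may choose to COW */
            struct page *page;              /* ->fault handlers return the page here */
            /* ->map_pages() fields (max_pgoff, pte) omitted from this sketch */
    };

The vmf.gfp_mask assignments added below (do_page_mkwrite(), __do_fault(), do_fault_around()) all feed from this helper.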
@@ -1964,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
        vmf.virtual_address = (void __user *)(address & PAGE_MASK);
        vmf.pgoff = page->index;
        vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+       vmf.gfp_mask = __get_fault_gfp_mask(vma);
        vmf.page = page;
        vmf.cow_page = NULL;
 
@@ -2083,7 +2085,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                cow_user_page(new_page, old_page, address, vma);
        }
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_new;
 
        __SetPageUptodate(new_page);
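
mem_cgroup_try_charge() and friends gain a trailing 'compound' argument (false everywhere in this file, since these paths handle order-0 pages) so that huge pages can be charged as HPAGE_PMD_NR units in a single call. Assumed companion prototypes (include/linux/memcontrol.h, sketch):

    int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
                              gfp_t gfp_mask, struct mem_cgroup **memcgp,
                              bool compound);
    void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
                                  bool lrucare, bool compound);
    void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
                                  bool compound);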
@@ -2097,7 +2099,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
        if (likely(pte_same(*page_table, orig_pte))) {
                if (old_page) {
                        if (!PageAnon(old_page)) {
-                               dec_mm_counter_fast(mm, MM_FILEPAGES);
+                               dec_mm_counter_fast(mm,
+                                               mm_counter_file(old_page));
                                inc_mm_counter_fast(mm, MM_ANONPAGES);
                        }
                } else {
@@ -2113,8 +2116,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 * thread doing COW.
                 */
                ptep_clear_flush_notify(vma, address, page_table);
-               page_add_new_anon_rmap(new_page, vma, address);
-               mem_cgroup_commit_charge(new_page, memcg, false);
+               page_add_new_anon_rmap(new_page, vma, address, false);
+               mem_cgroup_commit_charge(new_page, memcg, false, false);
                lru_cache_add_active_or_unevictable(new_page, vma);
                /*
                 * We call the notify macro here because, when using secondary
@@ -2146,14 +2149,14 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                         * mapcount is visible. So transitively, TLBs to
                         * old page will be flushed before it can be reused.
                         */
-                       page_remove_rmap(old_page);
+                       page_remove_rmap(old_page, false);
                }
 
                /* Free the old page.. */
                new_page = old_page;
                page_copied = 1;
        } else {
-               mem_cgroup_cancel_charge(new_page, memcg);
+               mem_cgroup_cancel_charge(new_page, memcg, false);
        }
 
        if (new_page)
@@ -2168,7 +2171,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
                 */
                if (page_copied && (vma->vm_flags & VM_LOCKED)) {
                        lock_page(old_page);    /* LRU manipulation */
-                       munlock_vma_page(old_page);
+                       if (PageMlocked(old_page))
+                               munlock_vma_page(old_page);
                        unlock_page(old_page);
                }
                page_cache_release(old_page);
@@ -2528,7 +2532,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                goto out_page;
        }
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false)) {
                ret = VM_FAULT_OOM;
                goto out_page;
        }
@@ -2562,7 +2566,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                flags &= ~FAULT_FLAG_WRITE;
                ret |= VM_FAULT_WRITE;
-               exclusive = 1;
+               exclusive = RMAP_EXCLUSIVE;
        }
        flush_icache_page(vma, page);
        if (pte_swp_soft_dirty(orig_pte))
@@ -2570,10 +2574,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
        set_pte_at(mm, address, page_table, pte);
        if (page == swapcache) {
                do_page_add_anon_rmap(page, vma, address, exclusive);
-               mem_cgroup_commit_charge(page, memcg, true);
+               mem_cgroup_commit_charge(page, memcg, true, false);
        } else { /* ksm created a completely new copy */
-               page_add_new_anon_rmap(page, vma, address);
-               mem_cgroup_commit_charge(page, memcg, false);
+               page_add_new_anon_rmap(page, vma, address, false);
+               mem_cgroup_commit_charge(page, memcg, false, false);
                lru_cache_add_active_or_unevictable(page, vma);
        }
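
'exclusive' is no longer a plain 0/1 value but an rmap flag, so do_page_add_anon_rmap() can carry a compound bit through the same argument in other callers. The flag values assumed here are illustrative only (include/linux/rmap.h, hypothetical sketch):

    /* flags for do_page_add_anon_rmap(); values shown for illustration */
    #define RMAP_EXCLUSIVE  0x01    /* page is exclusively mapped by this process */
    #define RMAP_COMPOUND   0x02    /* page is a compound (THP) page */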
 
@@ -2608,7 +2612,7 @@ unlock:
 out:
        return ret;
 out_nomap:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        pte_unmap_unlock(page_table, ptl);
 out_page:
        unlock_page(page);
@@ -2702,7 +2706,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!page)
                goto oom;
 
-       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+       if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg, false))
                goto oom_free_page;
 
        /*
@@ -2723,15 +2727,15 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
        /* Deliver the page fault to userland, check inside PT lock */
        if (userfaultfd_missing(vma)) {
                pte_unmap_unlock(page_table, ptl);
-               mem_cgroup_cancel_charge(page, memcg);
+               mem_cgroup_cancel_charge(page, memcg, false);
                page_cache_release(page);
                return handle_userfault(vma, address, flags,
                                        VM_UFFD_MISSING);
        }
 
        inc_mm_counter_fast(mm, MM_ANONPAGES);
-       page_add_new_anon_rmap(page, vma, address);
-       mem_cgroup_commit_charge(page, memcg, false);
+       page_add_new_anon_rmap(page, vma, address, false);
+       mem_cgroup_commit_charge(page, memcg, false, false);
        lru_cache_add_active_or_unevictable(page, vma);
 setpte:
        set_pte_at(mm, address, page_table, entry);
@@ -2742,7 +2746,7 @@ unlock:
        pte_unmap_unlock(page_table, ptl);
        return 0;
 release:
-       mem_cgroup_cancel_charge(page, memcg);
+       mem_cgroup_cancel_charge(page, memcg, false);
        page_cache_release(page);
        goto unlock;
 oom_free_page:
@@ -2767,6 +2771,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
        vmf.pgoff = pgoff;
        vmf.flags = flags;
        vmf.page = NULL;
+       vmf.gfp_mask = __get_fault_gfp_mask(vma);
        vmf.cow_page = cow_page;
 
        ret = vma->vm_ops->fault(vma, &vmf);
@@ -2818,9 +2823,9 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
        if (anon) {
                inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
-               page_add_new_anon_rmap(page, vma, address);
+               page_add_new_anon_rmap(page, vma, address, false);
        } else {
-               inc_mm_counter_fast(vma->vm_mm, MM_FILEPAGES);
+               inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
                page_add_file_rmap(page);
        }
        set_pte_at(vma->vm_mm, address, pte, entry);
@@ -2933,6 +2938,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address,
        vmf.pgoff = pgoff;
        vmf.max_pgoff = max_pgoff;
        vmf.flags = flags;
+       vmf.gfp_mask = __get_fault_gfp_mask(vma);
        vma->vm_ops->map_pages(vma, &vmf);
 }
 
@@ -2993,7 +2999,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        if (!new_page)
                return VM_FAULT_OOM;
 
-       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
+       if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) {
                page_cache_release(new_page);
                return VM_FAULT_OOM;
        }
@@ -3022,7 +3028,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                goto uncharge_out;
        }
        do_set_pte(vma, address, new_page, pte, true, true);
-       mem_cgroup_commit_charge(new_page, memcg, false);
+       mem_cgroup_commit_charge(new_page, memcg, false, false);
        lru_cache_add_active_or_unevictable(new_page, vma);
        pte_unmap_unlock(pte, ptl);
        if (fault_page) {
@@ -3037,7 +3043,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        }
        return ret;
 uncharge_out:
-       mem_cgroup_cancel_charge(new_page, memcg);
+       mem_cgroup_cancel_charge(new_page, memcg, false);
        page_cache_release(new_page);
        return ret;
 }
@@ -3089,7 +3095,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * pinned by vma->vm_file's reference.  We rely on unlock_page()'s
         * release semantics to prevent the compiler from undoing this copying.
         */
-       mapping = fault_page->mapping;
+       mapping = page_rmapping(fault_page);
        unlock_page(fault_page);
        if ((dirtied || vma->vm_ops->page_mkwrite) && mapping) {
                /*
@@ -3191,6 +3197,12 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
                return 0;
        }
 
+       /* TODO: handle PTE-mapped THP */
+       if (PageCompound(page)) {
+               pte_unmap_unlock(ptep, ptl);
+               return 0;
+       }
+
        /*
         * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
         * much anyway since they can be in shared cache state. This misses
@@ -3363,17 +3375,9 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                int ret;
 
                barrier();
-               if (pmd_trans_huge(orig_pmd)) {
+               if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) {
                        unsigned int dirty = flags & FAULT_FLAG_WRITE;
 
-                       /*
-                        * If the pmd is splitting, return and retry the
-                        * the fault.  Alternative: wait until the split
-                        * is done, and goto retry.
-                        */
-                       if (pmd_trans_splitting(orig_pmd))
-                               return 0;
-
                        if (pmd_protnone(orig_pmd))
                                return do_huge_pmd_numa_page(mm, vma, address,
                                                             orig_pmd, pmd);
@@ -3400,7 +3404,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
            unlikely(__pte_alloc(mm, vma, pmd, address)))
                return VM_FAULT_OOM;
        /* if an huge pmd materialized from under us just retry later */
-       if (unlikely(pmd_trans_huge(*pmd)))
+       if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
                return 0;
        /*
         * A regular pmd is established and it can't morph into a huge pmd
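
The pmd_trans_splitting() checks are gone because the reworked THP refcounting splits a huge PMD in one step under the pmd lock, so there is no transient "splitting" state left to wait for or retry on. The pmd_devmap() tests added alongside pmd_trans_huge() here and above route device-DAX huge mappings through the huge-PMD fault path; on architectures that support it this is roughly a software page-table bit, e.g. on x86 (sketch):

    /* _PAGE_DEVMAP is a software bit that pmd_mkdevmap() sets */
    static inline int pmd_devmap(pmd_t pmd)
    {
            return !!(pmd_val(pmd) & _PAGE_DEVMAP);
    }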