Merge branch 'akpm' (patches from Andrew)
[linux-2.6-microblaze.git] / mm / hugetlb.c
index 8fb42c6..a86a58e 100644 (file)
@@ -280,6 +280,17 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
                nrg->reservation_counter =
                        &h_cg->rsvd_hugepage[hstate_index(h)];
                nrg->css = &h_cg->css;
+               /*
+                * The caller will hold exactly one h_cg->css reference for the
+                * whole contiguous reservation region. But this area might be
+                * scattered when some file_regions already reside in
+                * it. As a result, many file_regions may share only one css
+                * reference. In order to ensure that each file_region holds
+                * exactly one h_cg->css reference, we should do css_get for
+                * each file_region and leave the reference held by caller
+                * untouched.
+                */
+               css_get(&h_cg->css);
                if (!resv->pages_per_hpage)
                        resv->pages_per_hpage = pages_per_huge_page(h);
                /* pages_per_hpage should be the same for all entries in
@@ -293,6 +304,14 @@ static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
 #endif
 }
 
+static void put_uncharge_info(struct file_region *rg) /* drop the css reference recorded in @rg, if any */
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (rg->css)            /* nothing to put when no css is set on the region */
+               css_put(rg->css);
+#endif /* CONFIG_CGROUP_HUGETLB: without it this function does nothing */
+}
+
 static bool has_same_uncharge_info(struct file_region *rg,
                                   struct file_region *org)
 {
@@ -316,6 +335,7 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
                prg->to = rg->to;
 
                list_del(&rg->link);
+               put_uncharge_info(rg);
                kfree(rg);
 
                rg = prg;
@@ -327,10 +347,29 @@ static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
                nrg->from = rg->from;
 
                list_del(&rg->link);
+               put_uncharge_info(rg);
                kfree(rg);
        }
 }
 
+static inline long /* Insert a region for from..to ahead of @rg, or only count it; returns to - from. */
+hugetlb_resv_map_add(struct resv_map *map, struct file_region *rg, long from,
+                    long to, struct hstate *h, struct hugetlb_cgroup *cg,
+                    long *regions_needed)
+{
+       struct file_region *nrg;
+
+       if (!regions_needed) {  /* no counter supplied: really insert the region */
+               nrg = get_file_region_entry_from_cache(map, from, to);
+               record_hugetlb_cgroup_uncharge_info(cg, h, map, nrg); /* stores cg's uncharge info in nrg */
+               list_add(&nrg->link, rg->link.prev); /* links nrg right after rg's predecessor, i.e. just ahead of rg */
+               coalesce_file_region(map, nrg); /* may fold nrg into adjacent regions */
+       } else                  /* counter supplied: only tally one more entry, list untouched */
+               *regions_needed += 1;
+
+       return to - from;       /* same return value in both modes */
+}
+
 /*
  * Must be called with resv->lock held.
  *
@@ -346,7 +385,7 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
        long add = 0;
        struct list_head *head = &resv->regions;
        long last_accounted_offset = f;
-       struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+       struct file_region *rg = NULL, *trg = NULL;
 
        if (regions_needed)
                *regions_needed = 0;
@@ -369,24 +408,17 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
                /* When we find a region that starts beyond our range, we've
                 * finished.
                 */
-               if (rg->from > t)
+               if (rg->from >= t)
                        break;
 
                /* Add an entry for last_accounted_offset -> rg->from, and
                 * update last_accounted_offset.
                 */
-               if (rg->from > last_accounted_offset) {
-                       add += rg->from - last_accounted_offset;
-                       if (!regions_needed) {
-                               nrg = get_file_region_entry_from_cache(
-                                       resv, last_accounted_offset, rg->from);
-                               record_hugetlb_cgroup_uncharge_info(h_cg, h,
-                                                                   resv, nrg);
-                               list_add(&nrg->link, rg->link.prev);
-                               coalesce_file_region(resv, nrg);
-                       } else
-                               *regions_needed += 1;
-               }
+               if (rg->from > last_accounted_offset)
+                       add += hugetlb_resv_map_add(resv, rg,
+                                                   last_accounted_offset,
+                                                   rg->from, h, h_cg,
+                                                   regions_needed);
 
                last_accounted_offset = rg->to;
        }
@@ -394,17 +426,9 @@ static long add_reservation_in_range(struct resv_map *resv, long f, long t,
        /* Handle the case where our range extends beyond
         * last_accounted_offset.
         */
-       if (last_accounted_offset < t) {
-               add += t - last_accounted_offset;
-               if (!regions_needed) {
-                       nrg = get_file_region_entry_from_cache(
-                               resv, last_accounted_offset, t);
-                       record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
-                       list_add(&nrg->link, rg->link.prev);
-                       coalesce_file_region(resv, nrg);
-               } else
-                       *regions_needed += 1;
-       }
+       if (last_accounted_offset < t)
+               add += hugetlb_resv_map_add(resv, rg, last_accounted_offset,
+                                           t, h, h_cg, regions_needed);
 
        VM_BUG_ON(add < 0);
        return add;
@@ -659,7 +683,7 @@ retry:
 
                        del += t - f;
                        hugetlb_cgroup_uncharge_file_region(
-                               resv, rg, t - f);
+                               resv, rg, t - f, false);
 
                        /* New entry for end of split region */
                        nrg->from = t;
@@ -680,7 +704,7 @@ retry:
                if (f <= rg->from && t >= rg->to) { /* Remove entire region */
                        del += rg->to - rg->from;
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           rg->to - rg->from);
+                                                           rg->to - rg->from, true);
                        list_del(&rg->link);
                        kfree(rg);
                        continue;
@@ -688,13 +712,13 @@ retry:
 
                if (f <= rg->from) {    /* Trim beginning of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           t - rg->from);
+                                                           t - rg->from, false);
 
                        del += t - rg->from;
                        rg->from = t;
                } else {                /* Trim end of region */
                        hugetlb_cgroup_uncharge_file_region(resv, rg,
-                                                           rg->to - f);
+                                                           rg->to - f, false);
 
                        del += rg->to - f;
                        rg->to = f;
@@ -3725,21 +3749,32 @@ static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
                return false;
 }
 
+static void /* Map @new_page at @addr through @ptep in vma->vm_mm and update its page flags and counters. */
+hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
+                    struct page *new_page)
+{      /* NOTE(review): targets vma->vm_mm, while the caller's ordinary copy path uses its dst mm — confirm they match */
+       __SetPageUptodate(new_page); /* the visible caller has already copied the data into new_page */
+       set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
+       hugepage_add_new_anon_rmap(new_page, vma, addr);
+       hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); /* adds one huge page's pages_per_huge_page() to vma->vm_mm */
+       ClearHPageRestoreReserve(new_page);
+       SetHPageMigratable(new_page);
+}
+
 int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                            struct vm_area_struct *vma)
 {
        pte_t *src_pte, *dst_pte, entry, dst_entry;
        struct page *ptepage;
        unsigned long addr;
-       int cow;
+       bool cow = is_cow_mapping(vma->vm_flags);
        struct hstate *h = hstate_vma(vma);
        unsigned long sz = huge_page_size(h);
+       unsigned long npages = pages_per_huge_page(h);
        struct address_space *mapping = vma->vm_file->f_mapping;
        struct mmu_notifier_range range;
        int ret = 0;
 
-       cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
-
        if (cow) {
                mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
                                        vma->vm_start,
@@ -3784,6 +3819,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
                entry = huge_ptep_get(src_pte);
                dst_entry = huge_ptep_get(dst_pte);
+again:
                if (huge_pte_none(entry) || !huge_pte_none(dst_entry)) {
                        /*
                         * Skip if src entry none.  Also, skip in the
@@ -3807,6 +3843,52 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                        }
                        set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
                } else {
+                       entry = huge_ptep_get(src_pte);
+                       ptepage = pte_page(entry);
+                       get_page(ptepage);
+
+                       /*
+                        * This is a rare case where we see pinned hugetlb
+                        * pages while they're prone to COW.  We need to do the
+                        * COW earlier during fork.
+                        *
+                        * When pre-allocating the page or copying data, we
+                        * need to be without the pgtable locks since we could
+                        * sleep during the process.
+                        */
+                       if (unlikely(page_needs_cow_for_dma(vma, ptepage))) {
+                               pte_t src_pte_old = entry;
+                               struct page *new;
+
+                               spin_unlock(src_ptl);
+                               spin_unlock(dst_ptl);
+                               /* Do not use reserve as it's privately owned */
+                               new = alloc_huge_page(vma, addr, 1);
+                               if (IS_ERR(new)) {
+                                       put_page(ptepage);
+                                       ret = PTR_ERR(new);
+                                       break;
+                               }
+                               copy_user_huge_page(new, ptepage, addr, vma,
+                                                   npages);
+                               put_page(ptepage);
+
+                               /* Install the new huge page if src pte stable */
+                               dst_ptl = huge_pte_lock(h, dst, dst_pte);
+                               src_ptl = huge_pte_lockptr(h, src, src_pte);
+                               spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+                               entry = huge_ptep_get(src_pte);
+                               if (!pte_same(src_pte_old, entry)) {
+                                       put_page(new);
+                                       /* dst_entry won't change as it is in the child */
+                                       goto again;
+                               }
+                               hugetlb_install_page(vma, dst_pte, addr, new);
+                               spin_unlock(src_ptl);
+                               spin_unlock(dst_ptl);
+                               continue;
+                       }
+
                        if (cow) {
                                /*
                                 * No need to notify as we are downgrading page
@@ -3817,12 +3899,10 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                 */
                                huge_ptep_set_wrprotect(src, addr, src_pte);
                        }
-                       entry = huge_ptep_get(src_pte);
-                       ptepage = pte_page(entry);
-                       get_page(ptepage);
+
                        page_dup_rmap(ptepage, true);
                        set_huge_pte_at(dst, addr, dst_pte, entry);
-                       hugetlb_count_add(pages_per_huge_page(h), dst);
+                       hugetlb_count_add(npages, dst);
                }
                spin_unlock(src_ptl);
                spin_unlock(dst_ptl);
@@ -5128,6 +5208,10 @@ bool hugetlb_reserve_pages(struct inode *inode,
                         */
                        long rsv_adjust;
 
+                       /*
+                        * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+                        * reference to h_cg->css. See comment below for detail.
+                        */
                        hugetlb_cgroup_uncharge_cgroup_rsvd(
                                hstate_index(h),
                                (chg - add) * pages_per_huge_page(h), h_cg);
@@ -5135,6 +5219,14 @@ bool hugetlb_reserve_pages(struct inode *inode,
                        rsv_adjust = hugepage_subpool_put_pages(spool,
                                                                chg - add);
                        hugetlb_acct_memory(h, -rsv_adjust);
+               } else if (h_cg) {
+                       /*
+                        * The file_regions will hold their own reference to
+                        * h_cg->css. So we should release the reference held
+                        * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+                        * done.
+                        */
+                       hugetlb_cgroup_put_rsvd_cgroup(h_cg);
                }
        }
        return true;