Merge branch 'akpm' (patches from Andrew)
[linux-2.6-microblaze.git] / mm / hugetlb.c
index 87e11d8..3edb759 100644 (file)
@@ -567,13 +567,13 @@ retry:
  * appear as a "reserved" entry instead of simply dangling with incorrect
  * counts.
  */
-void hugetlb_fix_reserve_counts(struct inode *inode, bool restore_reserve)
+void hugetlb_fix_reserve_counts(struct inode *inode)
 {
        struct hugepage_subpool *spool = subpool_inode(inode);
        long rsv_adjust;
 
        rsv_adjust = hugepage_subpool_get_pages(spool, 1);
-       if (restore_reserve && rsv_adjust) {
+       if (rsv_adjust) {
                struct hstate *h = hstate_inode(inode);
 
                hugetlb_acct_memory(h, 1);
@@ -1022,7 +1022,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
                ((node = hstate_next_node_to_free(hs, mask)) || 1);     \
                nr_nodes--)
 
-#if (defined(CONFIG_X86_64) || defined(CONFIG_S390)) && \
+#if defined(CONFIG_ARCH_HAS_GIGANTIC_PAGE) && \
        ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || \
        defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
@@ -1437,38 +1437,61 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
 
 /*
  * Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages.
+ * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
+ * number of free hugepages would be reduced below the number of reserved
+ * hugepages.
  */
-static void dissolve_free_huge_page(struct page *page)
+static int dissolve_free_huge_page(struct page *page)
 {
+       int rc = 0;
+
        spin_lock(&hugetlb_lock);
        if (PageHuge(page) && !page_count(page)) {
-               struct hstate *h = page_hstate(page);
-               int nid = page_to_nid(page);
-               list_del(&page->lru);
+               struct page *head = compound_head(page);
+               struct hstate *h = page_hstate(head);
+               int nid = page_to_nid(head);
+               if (h->free_huge_pages - h->resv_huge_pages == 0) {
+                       rc = -EBUSY;
+                       goto out;
+               }
+               list_del(&head->lru);
                h->free_huge_pages--;
                h->free_huge_pages_node[nid]--;
                h->max_huge_pages--;
-               update_and_free_page(h, page);
+               update_and_free_page(h, head);
        }
+out:
        spin_unlock(&hugetlb_lock);
+       return rc;
 }
 
 /*
  * Dissolve free hugepages in a given pfn range. Used by memory hotplug to
  * make specified memory blocks removable from the system.
- * Note that start_pfn should aligned with (minimum) hugepage size.
+ * Note that this will dissolve a free gigantic hugepage completely, if any
+ * part of it lies within the given range.
+ * Also note that if dissolve_free_huge_page() returns with an error, all
+ * free hugepages that were dissolved before that error are lost.
  */
-void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
+int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn)
 {
        unsigned long pfn;
+       struct page *page;
+       int rc = 0;
 
        if (!hugepages_supported())
-               return;
+               return rc;
 
-       VM_BUG_ON(!IS_ALIGNED(start_pfn, 1 << minimum_order));
-       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order)
-               dissolve_free_huge_page(pfn_to_page(pfn));
+       for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
+               page = pfn_to_page(pfn);
+               if (PageHuge(page) && !page_count(page)) {
+                       rc = dissolve_free_huge_page(page);
+                       if (rc)
+                               break;
+               }
+       }
+
+       return rc;
 }
 
 /*
@@ -1803,11 +1826,17 @@ static void return_unused_surplus_pages(struct hstate *h,
  * is not the case is if a reserve map was changed between calls.  It
  * is the responsibility of the caller to notice the difference and
  * take appropriate action.
+ *
+ * vma_add_reservation is used in error paths where a reservation must
+ * be restored when a newly allocated huge page must be freed.  It is
+ * to be called after calling vma_needs_reservation to determine if a
+ * reservation exists.
  */
 enum vma_resv_mode {
        VMA_NEEDS_RESV,
        VMA_COMMIT_RESV,
        VMA_END_RESV,
+       VMA_ADD_RESV,
 };
 static long __vma_reservation_common(struct hstate *h,
                                struct vm_area_struct *vma, unsigned long addr,
@@ -1833,6 +1862,14 @@ static long __vma_reservation_common(struct hstate *h,
                region_abort(resv, idx, idx + 1);
                ret = 0;
                break;
+       case VMA_ADD_RESV:
+               if (vma->vm_flags & VM_MAYSHARE)
+                       ret = region_add(resv, idx, idx + 1);
+               else {
+                       region_abort(resv, idx, idx + 1);
+                       ret = region_del(resv, idx, idx + 1);
+               }
+               break;
        default:
                BUG();
        }
@@ -1880,6 +1917,56 @@ static void vma_end_reservation(struct hstate *h,
        (void)__vma_reservation_common(h, vma, addr, VMA_END_RESV);
 }
 
+static long vma_add_reservation(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long addr)
+{
+       return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV);
+}
+
+/*
+ * This routine is called to restore a reservation on error paths.  In the
+ * specific error paths, a huge page was allocated (via alloc_huge_page)
+ * and is about to be freed.  If a reservation for the page existed,
+ * alloc_huge_page would have consumed the reservation and set PagePrivate
+ * in the newly allocated page.  When the page is freed via free_huge_page,
+ * the global reservation count will be incremented if PagePrivate is set.
+ * However, free_huge_page can not adjust the reserve map.  Adjust the
+ * reserve map here to be consistent with global reserve count adjustments
+ * to be made by free_huge_page.
+ */
+static void restore_reserve_on_error(struct hstate *h,
+                       struct vm_area_struct *vma, unsigned long address,
+                       struct page *page)
+{
+       if (unlikely(PagePrivate(page))) {
+               long rc = vma_needs_reservation(h, vma, address);
+
+               if (unlikely(rc < 0)) {
+                       /*
+                        * Rare out of memory condition in reserve map
+                        * manipulation.  Clear PagePrivate so that
+                        * global reserve count will not be incremented
+                        * by free_huge_page.  This will make it appear
+                        * as though the reservation for this page was
+                        * consumed.  This may prevent the task from
+                        * faulting in the page at a later time.  This
+                        * is better than inconsistent global huge page
+                        * accounting of reserve counts.
+                        */
+                       ClearPagePrivate(page);
+               } else if (rc) {
+                       rc = vma_add_reservation(h, vma, address);
+                       if (unlikely(rc < 0))
+                               /*
+                                * See above comment about rare out of
+                                * memory condition.
+                                */
+                               ClearPagePrivate(page);
+               } else
+                       vma_end_reservation(h, vma, address);
+       }
+}
+
 struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr, int avoid_reserve)
 {
@@ -3199,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
        BUG_ON(start & ~huge_page_mask(h));
        BUG_ON(end & ~huge_page_mask(h));
 
+       /*
+        * This is a hugetlb vma, all the pte entries should point
+        * to huge page.
+        */
+       tlb_remove_check_page_size_change(tlb, sz);
        tlb_start_vma(tlb, vma);
        mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
        address = start;
@@ -3249,7 +3341,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                }
 
                pte = huge_ptep_get_and_clear(mm, address, ptep);
-               tlb_remove_tlb_entry(tlb, ptep, address);
+               tlb_remove_huge_tlb_entry(h, tlb, ptep, address);
                if (huge_pte_dirty(pte))
                        set_page_dirty(page);
 
@@ -3363,15 +3455,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
  * Keep the pte_same checks anyway to make transition from the mutex easier.
  */
 static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
-                       unsigned long address, pte_t *ptep, pte_t pte,
-                       struct page *pagecache_page, spinlock_t *ptl)
+                      unsigned long address, pte_t *ptep,
+                      struct page *pagecache_page, spinlock_t *ptl)
 {
+       pte_t pte;
        struct hstate *h = hstate_vma(vma);
        struct page *old_page, *new_page;
        int ret = 0, outside_reserve = 0;
        unsigned long mmun_start;       /* For mmu_notifiers */
        unsigned long mmun_end;         /* For mmu_notifiers */
 
+       pte = huge_ptep_get(ptep);
        old_page = pte_page(pte);
 
 retry_avoidcopy:
@@ -3475,6 +3569,7 @@ retry_avoidcopy:
        spin_unlock(ptl);
        mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 out_release_all:
+       restore_reserve_on_error(h, vma, address, new_page);
        put_page(new_page);
 out_release_old:
        put_page(old_page);
@@ -3623,8 +3718,7 @@ retry:
                vma_end_reservation(h, vma, address);
        }
 
-       ptl = huge_pte_lockptr(h, mm, ptep);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(h, mm, ptep);
        size = i_size_read(mapping->host) >> huge_page_shift(h);
        if (idx >= size)
                goto backout;
@@ -3645,7 +3739,7 @@ retry:
        hugetlb_count_add(pages_per_huge_page(h), mm);
        if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
-               ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+               ret = hugetlb_cow(mm, vma, address, ptep, page, ptl);
        }
 
        spin_unlock(ptl);
@@ -3657,6 +3751,7 @@ backout:
        spin_unlock(ptl);
 backout_unlocked:
        unlock_page(page);
+       restore_reserve_on_error(h, vma, address, page);
        put_page(page);
        goto out;
 }
@@ -3799,8 +3894,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        if (flags & FAULT_FLAG_WRITE) {
                if (!huge_pte_write(entry)) {
-                       ret = hugetlb_cow(mm, vma, address, ptep, entry,
-                                       pagecache_page, ptl);
+                       ret = hugetlb_cow(mm, vma, address, ptep,
+                                         pagecache_page, ptl);
                        goto out_put_page;
                }
                entry = huge_pte_mkdirty(entry);
@@ -4241,8 +4336,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
        if (!spte)
                goto out;
 
-       ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
-       spin_lock(ptl);
+       ptl = huge_pte_lock(hstate_vma(vma), mm, spte);
        if (pud_none(*pud)) {
                pud_populate(mm, pud,
                                (pmd_t *)((unsigned long)spte & PAGE_MASK));