diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9d666df..63358d9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/vmalloc.h>
 #include <linux/vmstat.h>
 #include <linux/mempolicy.h>
+#include <linux/memremap.h>
 #include <linux/stop_machine.h>
 #include <linux/sort.h>
 #include <linux/pfn.h>
@@ -114,13 +115,6 @@ static DEFINE_SPINLOCK(managed_page_count_lock);
 unsigned long totalram_pages __read_mostly;
 unsigned long totalreserve_pages __read_mostly;
 unsigned long totalcma_pages __read_mostly;
-/*
- * When calculating the number of globally allowed dirty pages, there
- * is a certain number of per-zone reserves that should not be
- * considered dirtyable memory.  This is the sum of those reserves
- * over all existing zones that contribute dirtyable memory.
- */
-unsigned long dirty_balance_reserve __read_mostly;
 
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
@@ -229,13 +223,15 @@ static char * const zone_names[MAX_NR_ZONES] = {
 #endif
 };
 
-static void free_compound_page(struct page *page);
 compound_page_dtor * const compound_page_dtors[] = {
        NULL,
        free_compound_page,
 #ifdef CONFIG_HUGETLB_PAGE
        free_huge_page,
 #endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       free_transhuge_page,
+#endif
 };
 
 int min_free_kbytes = 1024;
@@ -457,7 +453,7 @@ out:
  * This usage means that zero-order pages may not be compound.
  */
 
-static void free_compound_page(struct page *page)
+void free_compound_page(struct page *page)
 {
        __free_pages_ok(page, compound_order(page));
 }
@@ -473,8 +469,10 @@ void prep_compound_page(struct page *page, unsigned int order)
        for (i = 1; i < nr_pages; i++) {
                struct page *p = page + i;
                set_page_count(p, 0);
+               p->mapping = TAIL_MAPPING;
                set_compound_head(p, page);
        }
+       atomic_set(compound_mapcount_ptr(page), -1);
 }
 
 #ifdef CONFIG_DEBUG_PAGEALLOC
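The atomic_set(compound_mapcount_ptr(page), -1) added above uses the same off-by-one convention as the per-page _mapcount: "no mappings" is stored as -1 and readers add the bias back, so a freshly prepared compound head reports a mapcount of zero. A minimal userspace sketch of that biasing, with made-up demo_* names (not part of the patch):

#include <stdatomic.h>
#include <stdio.h>

/* Toy stand-in for a mapcount field; the stored value is count - 1. */
struct demo_page {
	atomic_int _mapcount;
};

static void demo_prep(struct demo_page *p)
{
	atomic_store(&p->_mapcount, -1);	/* mirrors atomic_set(..., -1) above */
}

static int demo_mapcount(struct demo_page *p)
{
	return atomic_load(&p->_mapcount) + 1;	/* reader adds the bias back */
}

int main(void)
{
	struct demo_page head;

	demo_prep(&head);
	printf("after prep:    %d\n", demo_mapcount(&head));	/* 0 */
	atomic_fetch_add(&head._mapcount, 1);			/* one mapping added */
	printf("after mapping: %d\n", demo_mapcount(&head));	/* 1 */
	return 0;
}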
@@ -739,7 +737,7 @@ static inline int free_pages_check(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -812,7 +810,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
                do {
                        int mt; /* migratetype of the to-be-freed page */
 
-                       page = list_entry(list->prev, struct page, lru);
+                       page = list_last_entry(list, struct page, lru);
                        /* must delete as __free_one_page list manipulates */
                        list_del(&page->lru);
 
@@ -863,6 +861,27 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
                ret = 0;
                goto out;
        }
+       switch (page - head_page) {
+       case 1:
+               /* the first tail page: ->mapping is compound_mapcount() */
+               if (unlikely(compound_mapcount(page))) {
+                       bad_page(page, "nonzero compound_mapcount", 0);
+                       goto out;
+               }
+               break;
+       case 2:
+               /*
+                * the second tail page: ->mapping is
+                * page_deferred_list().next -- ignore value.
+                */
+               break;
+       default:
+               if (page->mapping != TAIL_MAPPING) {
+                       bad_page(page, "corrupted mapping in tail page", 0);
+                       goto out;
+               }
+               break;
+       }
        if (unlikely(!PageTail(page))) {
                bad_page(page, "PageTail not set", 0);
                goto out;
@@ -873,6 +892,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
        }
        ret = 0;
 out:
+       page->mapping = NULL;
        clear_compound_head(page);
        return ret;
 }
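The switch added above is needed because the first two tail pages never carry TAIL_MAPPING: their ->mapping slot is reused for the shared compound mapcount and for the deferred-split list, so only later tails can be checked for the poison value. A toy model of that field reuse (the layout and the poison value are illustrative only, not the real struct page):

#include <stdio.h>

#define TOY_TAIL_MAPPING ((void *)0x400)	/* hypothetical poison value */

/* Toy model: three different uses of a tail page's ->mapping storage. */
struct toy_tail {
	union {
		void *mapping;				/* tails 3 and up: poison  */
		int compound_mapcount;			/* first tail: mapcount    */
		struct { void *next, *prev; } deferred;	/* second tail: list links */
	};
};

int main(void)
{
	struct toy_tail tail[4];	/* tail[0] models page[1], and so on */

	tail[0].compound_mapcount = -1;			/* biased "no mappings" */
	tail[1].deferred.next = tail[1].deferred.prev = &tail[1].deferred;
	tail[2].mapping = TOY_TAIL_MAPPING;
	tail[3].mapping = TOY_TAIL_MAPPING;

	printf("tail 3 poison intact: %s\n",
	       tail[2].mapping == TOY_TAIL_MAPPING ? "yes" : "no");
	return 0;
}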
@@ -1336,7 +1356,7 @@ static inline int check_new_page(struct page *page)
        const char *bad_reason = NULL;
        unsigned long bad_flags = 0;
 
-       if (unlikely(page_mapcount(page)))
+       if (unlikely(atomic_read(&page->_mapcount) != -1))
                bad_reason = "nonzero mapcount";
        if (unlikely(page->mapping != NULL))
                bad_reason = "non-NULL mapping";
@@ -1417,11 +1437,10 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
        /* Find a page of the appropriate size in the preferred list */
        for (current_order = order; current_order < MAX_ORDER; ++current_order) {
                area = &(zone->free_area[current_order]);
-               if (list_empty(&area->free_list[migratetype]))
-                       continue;
-
-               page = list_entry(area->free_list[migratetype].next,
+               page = list_first_entry_or_null(&area->free_list[migratetype],
                                                        struct page, lru);
+               if (!page)
+                       continue;
                list_del(&page->lru);
                rmv_page_order(page);
                area->nr_free--;
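This hunk and the two that follow replace open-coded list_empty() plus list_entry() pairs with list_first_entry()/list_first_entry_or_null(). A minimal userspace sketch of the _or_null pattern on a toy intrusive list (the toy_* names are made up; the real helpers live in <linux/list.h>):

#include <stddef.h>
#include <stdio.h>

struct toy_list { struct toy_list *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define toy_first_entry_or_null(head, type, member) \
	((head)->next != (head) ? container_of((head)->next, type, member) : NULL)

struct toy_page { int pfn; struct toy_list lru; };

static void toy_list_init(struct toy_list *head)
{
	head->next = head->prev = head;
}

static void toy_list_add(struct toy_list *item, struct toy_list *head)
{
	item->next = head->next;
	item->prev = head;
	head->next->prev = item;
	head->next = item;
}

int main(void)
{
	struct toy_list free_list;
	struct toy_page pg = { .pfn = 42 };
	struct toy_page *first;

	toy_list_init(&free_list);

	/* Empty list: the helper folds the list_empty() check into one call. */
	first = toy_first_entry_or_null(&free_list, struct toy_page, lru);
	printf("empty list -> %p\n", (void *)first);		/* (nil) */

	toy_list_add(&pg.lru, &free_list);
	first = toy_first_entry_or_null(&free_list, struct toy_page, lru);
	printf("first pfn  -> %d\n", first ? first->pfn : -1);	/* 42 */
	return 0;
}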
@@ -1700,12 +1719,12 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac)
                for (order = 0; order < MAX_ORDER; order++) {
                        struct free_area *area = &(zone->free_area[order]);
 
-                       if (list_empty(&area->free_list[MIGRATE_HIGHATOMIC]))
+                       page = list_first_entry_or_null(
+                                       &area->free_list[MIGRATE_HIGHATOMIC],
+                                       struct page, lru);
+                       if (!page)
                                continue;
 
-                       page = list_entry(area->free_list[MIGRATE_HIGHATOMIC].next,
-                                               struct page, lru);
-
                        /*
                         * It should never happen but changes to locking could
                         * inadvertently allow a per-cpu drain to add pages
@@ -1753,7 +1772,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
                if (fallback_mt == -1)
                        continue;
 
-               page = list_entry(area->free_list[fallback_mt].next,
+               page = list_first_entry(&area->free_list[fallback_mt],
                                                struct page, lru);
                if (can_steal)
                        steal_suitable_fallback(zone, page, start_migratetype);
@@ -1788,7 +1807,7 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
  * Call me with the zone->lock already held.
  */
 static struct page *__rmqueue(struct zone *zone, unsigned int order,
-                               int migratetype, gfp_t gfp_flags)
+                               int migratetype)
 {
        struct page *page;
 
@@ -1818,7 +1837,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
 
        spin_lock(&zone->lock);
        for (i = 0; i < count; ++i) {
-               struct page *page = __rmqueue(zone, order, migratetype, 0);
+               struct page *page = __rmqueue(zone, order, migratetype);
                if (unlikely(page == NULL))
                        break;
 
@@ -1988,7 +2007,7 @@ void mark_free_pages(struct zone *zone)
        unsigned long pfn, max_zone_pfn;
        unsigned long flags;
        unsigned int order, t;
-       struct list_head *curr;
+       struct page *page;
 
        if (zone_is_empty(zone))
                return;
@@ -1998,17 +2017,17 @@ void mark_free_pages(struct zone *zone)
        max_zone_pfn = zone_end_pfn(zone);
        for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
                if (pfn_valid(pfn)) {
-                       struct page *page = pfn_to_page(pfn);
-
+                       page = pfn_to_page(pfn);
                        if (!swsusp_page_is_forbidden(page))
                                swsusp_unset_page_free(page);
                }
 
        for_each_migratetype_order(order, t) {
-               list_for_each(curr, &zone->free_area[order].free_list[t]) {
+               list_for_each_entry(page,
+                               &zone->free_area[order].free_list[t], lru) {
                        unsigned long i;
 
-                       pfn = page_to_pfn(list_entry(curr, struct page, lru));
+                       pfn = page_to_pfn(page);
                        for (i = 0; i < (1UL << order); i++)
                                swsusp_set_page_free(pfn_to_page(pfn + i));
                }
@@ -2212,9 +2231,9 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                }
 
                if (cold)
-                       page = list_entry(list->prev, struct page, lru);
+                       page = list_last_entry(list, struct page, lru);
                else
-                       page = list_entry(list->next, struct page, lru);
+                       page = list_first_entry(list, struct page, lru);
 
                list_del(&page->lru);
                pcp->count--;
@@ -2241,7 +2260,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
                                trace_mm_page_alloc_zone_locked(page, order, migratetype);
                }
                if (!page)
-                       page = __rmqueue(zone, order, migratetype, gfp_flags);
+                       page = __rmqueue(zone, order, migratetype);
                spin_unlock(&zone->lock);
                if (!page)
                        goto failed;
@@ -2740,8 +2759,21 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
                        goto out;
        }
        /* Exhausted what can be done so it's blamo time */
-       if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL))
+       if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
                *did_some_progress = 1;
+
+               if (gfp_mask & __GFP_NOFAIL) {
+                       page = get_page_from_freelist(gfp_mask, order,
+                                       ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+                       /*
+                        * fallback to ignore cpuset restriction if our nodes
+                        * are depleted
+                        */
+                       if (!page)
+                               page = get_page_from_freelist(gfp_mask, order,
+                                       ALLOC_NO_WATERMARKS, ac);
+               }
+       }
 out:
        mutex_unlock(&oom_lock);
        return page;
@@ -2876,28 +2908,6 @@ retry:
        return page;
 }
 
-/*
- * This is called in the allocator slow-path if the allocation request is of
- * sufficient urgency to ignore watermarks and take other desperate measures
- */
-static inline struct page *
-__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
-                               const struct alloc_context *ac)
-{
-       struct page *page;
-
-       do {
-               page = get_page_from_freelist(gfp_mask, order,
-                                               ALLOC_NO_WATERMARKS, ac);
-
-               if (!page && gfp_mask & __GFP_NOFAIL)
-                       wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
-                                                                       HZ/50);
-       } while (!page && (gfp_mask & __GFP_NOFAIL));
-
-       return page;
-}
-
 static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
 {
        struct zoneref *z;
@@ -3042,28 +3052,36 @@ retry:
                 * allocations are system rather than user orientated
                 */
                ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
-
-               page = __alloc_pages_high_priority(gfp_mask, order, ac);
-
-               if (page) {
+               page = get_page_from_freelist(gfp_mask, order,
+                                               ALLOC_NO_WATERMARKS, ac);
+               if (page)
                        goto got_pg;
-               }
        }
 
        /* Caller is not willing to reclaim, we can't balance anything */
        if (!can_direct_reclaim) {
                /*
-                * All existing users of the deprecated __GFP_NOFAIL are
-                * blockable, so warn of any new users that actually allow this
-                * type of allocation to fail.
+                * All existing users of the __GFP_NOFAIL are blockable, so warn
+                * of any new users that actually allow this type of allocation
+                * to fail.
                 */
                WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL);
                goto nopage;
        }
 
        /* Avoid recursion of direct reclaim */
-       if (current->flags & PF_MEMALLOC)
+       if (current->flags & PF_MEMALLOC) {
+               /*
+                * __GFP_NOFAIL request from this context is rather bizarre
+                * because we cannot reclaim anything and only can loop waiting
+                * for somebody to do a work for us.
+                */
+               if (WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) {
+                       cond_resched();
+                       goto retry;
+               }
                goto nopage;
+       }
 
        /* Avoid allocations with no watermarks from looping endlessly */
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
@@ -3402,7 +3420,8 @@ EXPORT_SYMBOL(__free_page_frag);
 
 /*
  * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
+ * of the current memory cgroup if __GFP_ACCOUNT is set, other than that it is
+ * equivalent to alloc_pages.
  *
  * It should be used when the caller would like to use kmalloc, but since the
  * allocation is large, it has to fall back to the page allocator.
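For context, a hedged kernel-side sketch (not part of the patch and not a standalone program) of the behaviour the reworded comment describes: only requests carrying __GFP_ACCOUNT are charged to the current memory cgroup's kmem counter.

/* Charged to the current memcg because of __GFP_ACCOUNT; without the
 * flag this call behaves like a plain alloc_pages(). */
struct page *page = alloc_kmem_pages(GFP_KERNEL | __GFP_ACCOUNT, 2);

if (page) {
	void *buf = page_address(page);	/* four-page buffer */

	/* ... use buf ... */
	__free_kmem_pages(page, 2);
}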
@@ -4147,8 +4166,7 @@ static void set_zonelist_order(void)
 
 static void build_zonelists(pg_data_t *pgdat)
 {
-       int j, node, load;
-       enum zone_type i;
+       int i, node, load;
        nodemask_t used_mask;
        int local_node, prev_node;
        struct zonelist *zonelist;
@@ -4168,7 +4186,7 @@ static void build_zonelists(pg_data_t *pgdat)
        nodes_clear(used_mask);
 
        memset(node_order, 0, sizeof(node_order));
-       j = 0;
+       i = 0;
 
        while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                /*
@@ -4185,12 +4203,12 @@ static void build_zonelists(pg_data_t *pgdat)
                if (order == ZONELIST_ORDER_NODE)
                        build_zonelists_in_node_order(pgdat, node);
                else
-                       node_order[j++] = node; /* remember order */
+                       node_order[i++] = node; /* remember order */
        }
 
        if (order == ZONELIST_ORDER_ZONE) {
                /* calculate node order -- i.e., DMA last! */
-               build_zonelists_in_zone_order(pgdat, j);
+               build_zonelists_in_zone_order(pgdat, i);
        }
 
        build_thisnode_zonelists(pgdat);
@@ -4468,16 +4486,22 @@ static inline unsigned long wait_table_bits(unsigned long size)
 void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                unsigned long start_pfn, enum memmap_context context)
 {
-       pg_data_t *pgdat = NODE_DATA(nid);
+       struct vmem_altmap *altmap = to_vmem_altmap(__pfn_to_phys(start_pfn));
        unsigned long end_pfn = start_pfn + size;
+       pg_data_t *pgdat = NODE_DATA(nid);
        unsigned long pfn;
-       struct zone *z;
        unsigned long nr_initialised = 0;
 
        if (highest_memmap_pfn < end_pfn - 1)
                highest_memmap_pfn = end_pfn - 1;
 
-       z = &pgdat->node_zones[zone];
+       /*
+        * Honor reservation requested by the driver for this ZONE_DEVICE
+        * memory
+        */
+       if (altmap && start_pfn == altmap->base_pfn)
+               start_pfn += altmap->reserve;
+
        for (pfn = start_pfn; pfn < end_pfn; pfn++) {
                /*
                 * There can be holes in boot-time mem_map[]s
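A small userspace sketch of the reservation arithmetic added above: when the device altmap begins at this range's first pfn, the first ->reserve pfns are skipped so their struct pages are left for the driver. The values are made up.

#include <stdio.h>

int main(void)
{
	unsigned long start_pfn = 0x100000, size = 0x8000;
	unsigned long end_pfn = start_pfn + size;
	unsigned long altmap_base = 0x100000, altmap_reserve = 128;

	if (altmap_base == start_pfn)
		start_pfn += altmap_reserve;	/* honor the driver's reservation */

	printf("initialise struct pages for pfns [%#lx, %#lx)\n",
	       start_pfn, end_pfn);
	return 0;
}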
@@ -5956,20 +5980,12 @@ static void calculate_totalreserve_pages(void)
 
                        if (max > zone->managed_pages)
                                max = zone->managed_pages;
+
+                       zone->totalreserve_pages = max;
+
                        reserve_pages += max;
-                       /*
-                        * Lowmem reserves are not available to
-                        * GFP_HIGHUSER page cache allocations and
-                        * kswapd tries to balance zones to their high
-                        * watermark.  As a result, neither should be
-                        * regarded as dirtyable memory, to prevent a
-                        * situation where reclaim has to clean pages
-                        * in order to balance the zones.
-                        */
-                       zone->dirty_balance_reserve = max;
                }
        }
-       dirty_balance_reserve = reserve_pages;
        totalreserve_pages = reserve_pages;
 }
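A small userspace sketch of how the per-zone value stored above feeds the global total: each zone's reserve (roughly its largest lowmem_reserve[] entry plus the high watermark) is capped at the zone's managed pages and then summed. The numbers are invented.

#include <stdio.h>

int main(void)
{
	unsigned long managed_pages[] = { 3840, 224000, 1000000 };
	unsigned long max[]           = { 5000,  12000,   30000 };
	unsigned long totalreserve = 0;

	for (int i = 0; i < 3; i++) {
		unsigned long reserve = max[i];

		if (reserve > managed_pages[i])
			reserve = managed_pages[i];	/* cap at zone size */
		totalreserve += reserve;	/* per-zone value is stored too */
	}
	printf("totalreserve_pages = %lu\n", totalreserve);	/* 45840 */
	return 0;
}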
 
@@ -6724,8 +6740,12 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        if (ret)
                return ret;
 
+       /*
+        * In case of -EBUSY, we'd like to know which page causes problem.
+        * So, just fall through. We will check it in test_pages_isolated().
+        */
        ret = __alloc_contig_migrate_range(&cc, start, end);
-       if (ret)
+       if (ret && ret != -EBUSY)
                goto done;
 
        /*
@@ -6752,12 +6772,25 @@ int alloc_contig_range(unsigned long start, unsigned long end,
        outer_start = start;
        while (!PageBuddy(pfn_to_page(outer_start))) {
                if (++order >= MAX_ORDER) {
-                       ret = -EBUSY;
-                       goto done;
+                       outer_start = start;
+                       break;
                }
                outer_start &= ~0UL << order;
        }
 
+       if (outer_start != start) {
+               order = page_order(pfn_to_page(outer_start));
+
+               /*
+                * outer_start page could be small order buddy page and
+                * it doesn't include start page. Adjust outer_start
+                * in this case to report failed page properly
+                * on tracepoint in test_pages_isolated()
+                */
+               if (outer_start + (1UL << order) <= start)
+                       outer_start = start;
+       }
+
        /* Make sure the range is really isolated. */
        if (test_pages_isolated(outer_start, end, false)) {
                pr_info("%s: [%lx, %lx) PFNs busy\n",