Merge branch 'irq-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel...

[linux-2.6-microblaze.git] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 9327a94..77e4d3c 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1190,7 +1190,7 @@ static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone,
  }
  
  #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
-static void init_reserved_page(unsigned long pfn)
+static void __meminit init_reserved_page(unsigned long pfn)
  {
         pg_data_t *pgdat;
         int nid, zid;
@@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
  static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
  {
  #ifdef CONFIG_NUMA
-       enum zone_stat_item local_stat = NUMA_LOCAL;
+       enum numa_stat_item local_stat = NUMA_LOCAL;
  
         if (z->node != numa_node_id())
                 local_stat = NUMA_OTHER;
  
         if (z->node == preferred_zone->node)
-               __inc_zone_state(z, NUMA_HIT);
+               __inc_numa_state(z, NUMA_HIT);
         else {
-               __inc_zone_state(z, NUMA_MISS);
-               __inc_zone_state(preferred_zone, NUMA_FOREIGN);
+               __inc_numa_state(z, NUMA_MISS);
+               __inc_numa_state(preferred_zone, NUMA_FOREIGN);
         }
-       __inc_zone_state(z, local_stat);
+       __inc_numa_state(z, local_stat);
  #endif
  }
  
@@ -2951,7 +2951,7 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
  {
         long min = mark;
         int o;
-       const bool alloc_harder = (alloc_flags & ALLOC_HARDER);
+       const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
  
         /* free_pages may go negative - that's OK */
         free_pages -= (1 << order) - 1;
@@ -2964,10 +2964,21 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
          * the high-atomic reserves. This will over-estimate the size of the
          * atomic reserve but it avoids a search.
          */
-       if (likely(!alloc_harder))
+       if (likely(!alloc_harder)) {
                 free_pages -= z->nr_reserved_highatomic;
-       else
-               min -= min / 4;
+       } else {
+               /*
+                * OOM victims can try even harder than normal ALLOC_HARDER
+                * users on the grounds that they are definitely going to be in
+                * the exit path shortly and free memory. Any allocation they
+                * make during the free path will be small and short-lived.
+                */
+               if (alloc_flags & ALLOC_OOM)
+                       min -= min / 2;
+               else
+                       min -= min / 4;
+       }
+
  
  #ifdef CONFIG_CMA
         /* If allocation can't use CMA areas don't use free CMA pages */
@@ -3205,7 +3216,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
          * of allowed nodes.
          */
         if (!(gfp_mask & __GFP_NOMEMALLOC))
-               if (test_thread_flag(TIF_MEMDIE) ||
+               if (tsk_is_oom_victim(current) ||
                     (current->flags & (PF_MEMALLOC | PF_EXITING)))
                         filter &= ~SHOW_MEM_FILTER_NODES;
         if (in_interrupt() || !(gfp_mask & __GFP_DIRECT_RECLAIM))
@@ -3668,21 +3679,46 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
         return alloc_flags;
  }
  
-bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+static bool oom_reserves_allowed(struct task_struct *tsk)
  {
-       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+       if (!tsk_is_oom_victim(tsk))
                 return false;
  
+       /*
+        * !MMU doesn't have an oom reaper, so give access to memory reserves
+        * only to the thread with TIF_MEMDIE set.
+        */
+       if (!IS_ENABLED(CONFIG_MMU) && !test_thread_flag(TIF_MEMDIE))
+               return false;
+
+       return true;
+}
+
+/*
+ * Distinguish requests which really need access to full memory
+ * reserves from oom victims which can live with a portion of it
+ */
+static inline int __gfp_pfmemalloc_flags(gfp_t gfp_mask)
+{
+       if (unlikely(gfp_mask & __GFP_NOMEMALLOC))
+               return 0;
         if (gfp_mask & __GFP_MEMALLOC)
-               return true;
+               return ALLOC_NO_WATERMARKS;
         if (in_serving_softirq() && (current->flags & PF_MEMALLOC))
-               return true;
-       if (!in_interrupt() &&
-                       ((current->flags & PF_MEMALLOC) ||
-                        unlikely(test_thread_flag(TIF_MEMDIE))))
-               return true;
+               return ALLOC_NO_WATERMARKS;
+       if (!in_interrupt()) {
+               if (current->flags & PF_MEMALLOC)
+                       return ALLOC_NO_WATERMARKS;
+               else if (oom_reserves_allowed(current))
+                       return ALLOC_OOM;
+       }
  
-       return false;
+       return 0;
+}
+
+bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
+{
+       return !!__gfp_pfmemalloc_flags(gfp_mask);
  }
  
  /*
@@ -3835,6 +3871,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
         unsigned long alloc_start = jiffies;
         unsigned int stall_timeout = 10 * HZ;
         unsigned int cpuset_mems_cookie;
+       int reserve_flags;
  
         /*
          * In the slowpath, we sanity check order to avoid ever trying to
@@ -3940,15 +3977,16 @@ retry:
         if (gfp_mask & __GFP_KSWAPD_RECLAIM)
                 wake_all_kswapds(order, ac);
  
-       if (gfp_pfmemalloc_allowed(gfp_mask))
-               alloc_flags = ALLOC_NO_WATERMARKS;
+       reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
+       if (reserve_flags)
+               alloc_flags = reserve_flags;
  
         /*
          * Reset the zonelist iterators if memory policies can be ignored.
          * These allocations are high priority and system rather than user
          * orientated.
          */
-       if (!(alloc_flags & ALLOC_CPUSET) || (alloc_flags & ALLOC_NO_WATERMARKS)) {
+       if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
                 ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
                 ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
                                         ac->high_zoneidx, ac->nodemask);
@@ -4025,8 +4063,8 @@ retry:
                 goto got_pg;
  
         /* Avoid allocations with no watermarks from looping endlessly */
-       if (test_thread_flag(TIF_MEMDIE) &&
-           (alloc_flags == ALLOC_NO_WATERMARKS ||
+       if (tsk_is_oom_victim(current) &&
+           (alloc_flags == ALLOC_OOM ||
              (gfp_mask & __GFP_NOMEMALLOC)))
                 goto nopage;
  
@@ -4145,10 +4183,11 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
  {
         struct page *page;
         unsigned int alloc_flags = ALLOC_WMARK_LOW;
-       gfp_t alloc_mask = gfp_mask; /* The gfp_t that was actually used for allocation */
+       gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
         struct alloc_context ac = { };
  
         gfp_mask &= gfp_allowed_mask;
+       alloc_mask = gfp_mask;
         if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags))
                 return NULL;
  
@@ -4509,7 +4548,7 @@ long si_mem_available(void)
          * Estimate the amount of memory available for userspace allocations,
          * without causing swapping.
          */
-       available = global_page_state(NR_FREE_PAGES) - totalreserve_pages;
+       available = global_zone_page_state(NR_FREE_PAGES) - totalreserve_pages;
  
         /*
          * Not all the page cache can be freed, otherwise the system will
@@ -4538,7 +4577,7 @@ void si_meminfo(struct sysinfo *val)
  {
         val->totalram = totalram_pages;
         val->sharedram = global_node_page_state(NR_SHMEM);
-       val->freeram = global_page_state(NR_FREE_PAGES);
+       val->freeram = global_zone_page_state(NR_FREE_PAGES);
         val->bufferram = nr_blockdev_pages();
         val->totalhigh = totalhigh_pages;
         val->freehigh = nr_free_highpages();
@@ -4673,11 +4712,11 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
                 global_node_page_state(NR_SLAB_UNRECLAIMABLE),
                 global_node_page_state(NR_FILE_MAPPED),
                 global_node_page_state(NR_SHMEM),
-               global_page_state(NR_PAGETABLE),
-               global_page_state(NR_BOUNCE),
-               global_page_state(NR_FREE_PAGES),
+               global_zone_page_state(NR_PAGETABLE),
+               global_zone_page_state(NR_BOUNCE),
+               global_zone_page_state(NR_FREE_PAGES),
                 free_pcp,
-               global_page_state(NR_FREE_CMA_PAGES));
+               global_zone_page_state(NR_FREE_CMA_PAGES));
  
         for_each_online_pgdat(pgdat) {
                 if (show_mem_node_skip(filter, pgdat->node_id, nodemask))
@@ -4839,18 +4878,17 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
   *
   * Add all populated zones of a node to the zonelist.
   */
-static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
-                               int nr_zones)
+static int build_zonerefs_node(pg_data_t *pgdat, struct zoneref *zonerefs)
  {
         struct zone *zone;
         enum zone_type zone_type = MAX_NR_ZONES;
+       int nr_zones = 0;
  
         do {
                 zone_type--;
                 zone = pgdat->node_zones + zone_type;
                 if (managed_zone(zone)) {
-                       zoneref_set_zone(zone,
-                               &zonelist->_zonerefs[nr_zones++]);
+                       zoneref_set_zone(zone, &zonerefs[nr_zones++]);
                         check_highest_zone(zone_type);
                 }
         } while (zone_type);
@@ -4858,52 +4896,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
         return nr_zones;
  }
  
-
-/*
- *  zonelist_order:
- *  0 = automatic detection of better ordering.
- *  1 = order by ([node] distance, -zonetype)
- *  2 = order by (-zonetype, [node] distance)
- *
- *  If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
- *  the same zonelist. So only NUMA can configure this param.
- */
-#define ZONELIST_ORDER_DEFAULT  0
-#define ZONELIST_ORDER_NODE     1
-#define ZONELIST_ORDER_ZONE     2
-
-/* zonelist order in the kernel.
- * set_zonelist_order() will set this to NODE or ZONE.
- */
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
-
-
  #ifdef CONFIG_NUMA
-/* The value user specified ....changed by config */
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-/* string for sysctl */
-#define NUMA_ZONELIST_ORDER_LEN        16
-char numa_zonelist_order[16] = "default";
-
-/*
- * interface for configure zonelist ordering.
- * command line option "numa_zonelist_order"
- *     = "[dD]efault   - default, automatic configuration.
- *     = "[nN]ode      - order by node locality, then by zone within node
- *     = "[zZ]one      - order by zone, then by locality within zone
- */
  
  static int __parse_numa_zonelist_order(char *s)
  {
-       if (*s == 'd' || *s == 'D') {
-               user_zonelist_order = ZONELIST_ORDER_DEFAULT;
-       } else if (*s == 'n' || *s == 'N') {
-               user_zonelist_order = ZONELIST_ORDER_NODE;
-       } else if (*s == 'z' || *s == 'Z') {
-               user_zonelist_order = ZONELIST_ORDER_ZONE;
-       } else {
-               pr_warn("Ignoring invalid numa_zonelist_order value:  %s\n", s);
+       /*
+        * We used to support different zonelist modes but they turned
+        * out to be just not useful. Let's keep the warning in place
+        * if somebody still uses the cmd line parameter so that we do
+        * not fail it silently.
+        */
+       if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
+               pr_warn("Ignoring unsupported numa_zonelist_order value:  %s\n", s);
                 return -EINVAL;
         }
         return 0;
@@ -4911,19 +4915,15 @@ static int __parse_numa_zonelist_order(char *s)
  
  static __init int setup_numa_zonelist_order(char *s)
  {
-       int ret;
-
         if (!s)
                 return 0;
  
-       ret = __parse_numa_zonelist_order(s);
-       if (ret == 0)
-               strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
-
-       return ret;
+       return __parse_numa_zonelist_order(s);
  }
  early_param("numa_zonelist_order", setup_numa_zonelist_order);
  
+char numa_zonelist_order[] = "Node";
+
  /*
   * sysctl handler for numa_zonelist_order
   */
@@ -4931,42 +4931,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *length,
                 loff_t *ppos)
  {
-       char saved_string[NUMA_ZONELIST_ORDER_LEN];
+       char *str;
         int ret;
-       static DEFINE_MUTEX(zl_order_mutex);
  
-       mutex_lock(&zl_order_mutex);
-       if (write) {
-               if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
-                       ret = -EINVAL;
-                       goto out;
-               }
-               strcpy(saved_string, (char *)table->data);
-       }
-       ret = proc_dostring(table, write, buffer, length, ppos);
-       if (ret)
-               goto out;
-       if (write) {
-               int oldval = user_zonelist_order;
+       if (!write)
+               return proc_dostring(table, write, buffer, length, ppos);
+       str = memdup_user_nul(buffer, 16);
+       if (IS_ERR(str))
+               return PTR_ERR(str);
  
-               ret = __parse_numa_zonelist_order((char *)table->data);
-               if (ret) {
-                       /*
-                        * bogus value.  restore saved string
-                        */
-                       strncpy((char *)table->data, saved_string,
-                               NUMA_ZONELIST_ORDER_LEN);
-                       user_zonelist_order = oldval;
-               } else if (oldval != user_zonelist_order) {
-                       mem_hotplug_begin();
-                       mutex_lock(&zonelists_mutex);
-                       build_all_zonelists(NULL, NULL);
-                       mutex_unlock(&zonelists_mutex);
-                       mem_hotplug_done();
-               }
-       }
-out:
-       mutex_unlock(&zl_order_mutex);
+       ret = __parse_numa_zonelist_order(str);
+       kfree(str);
         return ret;
  }
  
@@ -5040,17 +5015,24 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
   * This results in maximum locality--normal zone overflows into local
   * DMA zone, if any--but risks exhausting DMA zone.
   */
-static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
+static void build_zonelists_in_node_order(pg_data_t *pgdat, int *node_order,
+               unsigned nr_nodes)
  {
-       int j;
-       struct zonelist *zonelist;
+       struct zoneref *zonerefs;
+       int i;
+
+       zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+
+       for (i = 0; i < nr_nodes; i++) {
+               int nr_zones;
  
-       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-       for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
-               ;
-       j = build_zonelists_node(NODE_DATA(node), zonelist, j);
-       zonelist->_zonerefs[j].zone = NULL;
-       zonelist->_zonerefs[j].zone_idx = 0;
+               pg_data_t *node = NODE_DATA(node_order[i]);
+
+               nr_zones = build_zonerefs_node(node, zonerefs);
+               zonerefs += nr_zones;
+       }
+       zonerefs->zone = NULL;
+       zonerefs->zone_idx = 0;
  }
  
  /*
@@ -5058,13 +5040,14 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
   */
  static void build_thisnode_zonelists(pg_data_t *pgdat)
  {
-       int j;
-       struct zonelist *zonelist;
+       struct zoneref *zonerefs;
+       int nr_zones;
  
-       zonelist = &pgdat->node_zonelists[ZONELIST_NOFALLBACK];
-       j = build_zonelists_node(pgdat, zonelist, 0);
-       zonelist->_zonerefs[j].zone = NULL;
-       zonelist->_zonerefs[j].zone_idx = 0;
+       zonerefs = pgdat->node_zonelists[ZONELIST_NOFALLBACK]._zonerefs;
+       nr_zones = build_zonerefs_node(pgdat, zonerefs);
+       zonerefs += nr_zones;
+       zonerefs->zone = NULL;
+       zonerefs->zone_idx = 0;
  }
  
  /*
@@ -5073,79 +5056,13 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
   * exhausted, but results in overflowing to remote node while memory
   * may still exist in local DMA zone.
   */
-static int node_order[MAX_NUMNODES];
-
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
-{
-       int pos, j, node;
-       int zone_type;          /* needs to be signed */
-       struct zone *z;
-       struct zonelist *zonelist;
-
-       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-       pos = 0;
-       for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
-               for (j = 0; j < nr_nodes; j++) {
-                       node = node_order[j];
-                       z = &NODE_DATA(node)->node_zones[zone_type];
-                       if (managed_zone(z)) {
-                               zoneref_set_zone(z,
-                                       &zonelist->_zonerefs[pos++]);
-                               check_highest_zone(zone_type);
-                       }
-               }
-       }
-       zonelist->_zonerefs[pos].zone = NULL;
-       zonelist->_zonerefs[pos].zone_idx = 0;
-}
-
-#if defined(CONFIG_64BIT)
-/*
- * Devices that require DMA32/DMA are relatively rare and do not justify a
- * penalty to every machine in case the specialised case applies. Default
- * to Node-ordering on 64-bit NUMA machines
- */
-static int default_zonelist_order(void)
-{
-       return ZONELIST_ORDER_NODE;
-}
-#else
-/*
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
- * by the kernel. If processes running on node 0 deplete the low memory zone
- * then reclaim will occur more frequency increasing stalls and potentially
- * be easier to OOM if a large percentage of the zone is under writeback or
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
- * Hence, default to zone ordering on 32-bit.
- */
-static int default_zonelist_order(void)
-{
-       return ZONELIST_ORDER_ZONE;
-}
-#endif /* CONFIG_64BIT */
-
-static void set_zonelist_order(void)
-{
-       if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
-               current_zonelist_order = default_zonelist_order();
-       else
-               current_zonelist_order = user_zonelist_order;
-}
  
  static void build_zonelists(pg_data_t *pgdat)
  {
-       int i, node, load;
+       static int node_order[MAX_NUMNODES];
+       int node, load, nr_nodes = 0;
         nodemask_t used_mask;
         int local_node, prev_node;
-       struct zonelist *zonelist;
-       unsigned int order = current_zonelist_order;
-
-       /* initialize zonelists */
-       for (i = 0; i < MAX_ZONELISTS; i++) {
-               zonelist = pgdat->node_zonelists + i;
-               zonelist->_zonerefs[0].zone = NULL;
-               zonelist->_zonerefs[0].zone_idx = 0;
-       }
  
         /* NUMA-aware ordering of nodes */
         local_node = pgdat->node_id;
@@ -5154,8 +5071,6 @@ static void build_zonelists(pg_data_t *pgdat)
         nodes_clear(used_mask);
  
         memset(node_order, 0, sizeof(node_order));
-       i = 0;
-
         while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
                 /*
                  * We don't want to pressure a particular node.
@@ -5166,19 +5081,12 @@ static void build_zonelists(pg_data_t *pgdat)
                     node_distance(local_node, prev_node))
                         node_load[node] = load;
  
+               node_order[nr_nodes++] = node;
                 prev_node = node;
                 load--;
-               if (order == ZONELIST_ORDER_NODE)
-                       build_zonelists_in_node_order(pgdat, node);
-               else
-                       node_order[i++] = node; /* remember order */
-       }
-
-       if (order == ZONELIST_ORDER_ZONE) {
-               /* calculate node order -- i.e., DMA last! */
-               build_zonelists_in_zone_order(pgdat, i);
         }
  
+       build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
         build_thisnode_zonelists(pgdat);
  }
  
@@ -5204,21 +5112,17 @@ static void setup_min_unmapped_ratio(void);
  static void setup_min_slab_ratio(void);
  #else  /* CONFIG_NUMA */
  
-static void set_zonelist_order(void)
-{
-       current_zonelist_order = ZONELIST_ORDER_ZONE;
-}
-
  static void build_zonelists(pg_data_t *pgdat)
  {
         int node, local_node;
-       enum zone_type j;
-       struct zonelist *zonelist;
+       struct zoneref *zonerefs;
+       int nr_zones;
  
         local_node = pgdat->node_id;
  
-       zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
-       j = build_zonelists_node(pgdat, zonelist, 0);
+       zonerefs = pgdat->node_zonelists[ZONELIST_FALLBACK]._zonerefs;
+       nr_zones = build_zonerefs_node(pgdat, zonerefs);
+       zonerefs += nr_zones;
  
         /*
          * Now we build the zonelist so that it contains the zones
@@ -5231,16 +5135,18 @@ static void build_zonelists(pg_data_t *pgdat)
         for (node = local_node + 1; node < MAX_NUMNODES; node++) {
                 if (!node_online(node))
                         continue;
-               j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+               nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+               zonerefs += nr_zones;
         }
         for (node = 0; node < local_node; node++) {
                 if (!node_online(node))
                         continue;
-               j = build_zonelists_node(NODE_DATA(node), zonelist, j);
+               nr_zones = build_zonerefs_node(NODE_DATA(node), zonerefs);
+               zonerefs += nr_zones;
         }
  
-       zonelist->_zonerefs[j].zone = NULL;
-       zonelist->_zonerefs[j].zone_idx = 0;
+       zonerefs->zone = NULL;
+       zonerefs->zone_idx = 0;
  }
  
  #endif /* CONFIG_NUMA */
@@ -5263,50 +5169,32 @@ static void build_zonelists(pg_data_t *pgdat)
  static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
  static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
  static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
-static void setup_zone_pageset(struct zone *zone);
-
-/*
- * Global mutex to protect against size modification of zonelists
- * as well as to serialize pageset setup for the new populated zone.
- */
-DEFINE_MUTEX(zonelists_mutex);
  
-/* return values int ....just for stop_machine() */
-static int __build_all_zonelists(void *data)
+static void __build_all_zonelists(void *data)
  {
         int nid;
-       int cpu;
+       int __maybe_unused cpu;
         pg_data_t *self = data;
+       static DEFINE_SPINLOCK(lock);
+
+       spin_lock(&lock);
  
  #ifdef CONFIG_NUMA
         memset(node_load, 0, sizeof(node_load));
  #endif
  
+       /*
+        * This node is hotadded and no memory is yet present.   So just
+        * building zonelists is fine - no need to touch other nodes.
+        */
         if (self && !node_online(self->node_id)) {
                 build_zonelists(self);
-       }
-
-       for_each_online_node(nid) {
-               pg_data_t *pgdat = NODE_DATA(nid);
-
-               build_zonelists(pgdat);
-       }
+       } else {
+               for_each_online_node(nid) {
+                       pg_data_t *pgdat = NODE_DATA(nid);
  
-       /*
-        * Initialize the boot_pagesets that are going to be used
-        * for bootstrapping processors. The real pagesets for
-        * each zone will be allocated later when the per cpu
-        * allocator is available.
-        *
-        * boot_pagesets are used also for bootstrapping offline
-        * cpus if the system is already booted because the pagesets
-        * are needed to initialize allocators on a specific cpu too.
-        * F.e. the percpu allocator needs the page allocator which
-        * needs the percpu allocator in order to allocate its pagesets
-        * (a chicken-egg dilemma).
-        */
-       for_each_possible_cpu(cpu) {
-               setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+                       build_zonelists(pgdat);
+               }
  
  #ifdef CONFIG_HAVE_MEMORYLESS_NODES
                 /*
@@ -5317,45 +5205,53 @@ static int __build_all_zonelists(void *data)
                  * secondary cpus' numa_mem as they come on-line.  During
                  * node/memory hotplug, we'll fixup all on-line cpus.
                  */
-               if (cpu_online(cpu))
+               for_each_online_cpu(cpu)
                         set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
  #endif
         }
  
-       return 0;
+       spin_unlock(&lock);
  }
  
  static noinline void __init
  build_all_zonelists_init(void)
  {
+       int cpu;
+
         __build_all_zonelists(NULL);
+
+       /*
+        * Initialize the boot_pagesets that are going to be used
+        * for bootstrapping processors. The real pagesets for
+        * each zone will be allocated later when the per cpu
+        * allocator is available.
+        *
+        * boot_pagesets are used also for bootstrapping offline
+        * cpus if the system is already booted because the pagesets
+        * are needed to initialize allocators on a specific cpu too.
+        * F.e. the percpu allocator needs the page allocator which
+        * needs the percpu allocator in order to allocate its pagesets
+        * (a chicken-egg dilemma).
+        */
+       for_each_possible_cpu(cpu)
+               setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+
         mminit_verify_zonelist();
         cpuset_init_current_mems_allowed();
  }
  
  /*
- * Called with zonelists_mutex held always
   * unless system_state == SYSTEM_BOOTING.
   *
- * __ref due to (1) call of __meminit annotated setup_zone_pageset
- * [we're only called with non-NULL zone through __meminit paths] and
- * (2) call of __init annotated helper build_all_zonelists_init
+ * __ref due to call of __init annotated helper build_all_zonelists_init
   * [protected by SYSTEM_BOOTING].
   */
-void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
+void __ref build_all_zonelists(pg_data_t *pgdat)
  {
-       set_zonelist_order();
-
         if (system_state == SYSTEM_BOOTING) {
                 build_all_zonelists_init();
         } else {
-#ifdef CONFIG_MEMORY_HOTPLUG
-               if (zone)
-                       setup_zone_pageset(zone);
-#endif
-               /* we have to stop all cpus to guarantee there is no user
-                  of zonelist */
-               stop_machine_cpuslocked(__build_all_zonelists, pgdat, NULL);
+               __build_all_zonelists(pgdat);
                 /* cpuset refresh routine should be here */
         }
         vm_total_pages = nr_free_pagecache_pages();
@@ -5371,9 +5267,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
         else
                 page_group_by_mobility_disabled = 0;
  
-       pr_info("Built %i zonelists in %s order, mobility grouping %s.  Total pages: %ld\n",
+       pr_info("Built %i zonelists, mobility grouping %s.  Total pages: %ld\n",
                 nr_online_nodes,
-               zonelist_order_name[current_zonelist_order],
                 page_group_by_mobility_disabled ? "off" : "on",
                 vm_total_pages);
  #ifdef CONFIG_NUMA
@@ -5472,6 +5367,7 @@ not_early:
  
                         __init_single_page(page, pfn, zone, nid);
                         set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+                       cond_resched();
                 } else {
                         __init_single_pfn(pfn, zone, nid);
                 }
@@ -5627,7 +5523,7 @@ static void __meminit zone_pageset_init(struct zone *zone, int cpu)
         pageset_set_high_and_batch(zone, pcp);
  }
  
-static void __meminit setup_zone_pageset(struct zone *zone)
+void __meminit setup_zone_pageset(struct zone *zone)
  {
         int cpu;
         zone->pageset = alloc_percpu(struct per_cpu_pageset);
@@ -7081,9 +6977,11 @@ static void __setup_per_zone_wmarks(void)
   */
  void setup_per_zone_wmarks(void)
  {
-       mutex_lock(&zonelists_mutex);
+       static DEFINE_SPINLOCK(lock);
+
+       spin_lock(&lock);
         __setup_per_zone_wmarks();
-       mutex_unlock(&zonelists_mutex);
+       spin_unlock(&lock);
  }
  
  /*