mm: memcontrol: clean up memory.events counting function

[linux-2.6-microblaze.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 77832f0..fbec74a 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -97,8 +97,13 @@ struct scan_control {
         /* Can pages be swapped as part of reclaim? */
         unsigned int may_swap:1;
  
-       /* Can cgroups be reclaimed below their normal consumption range? */
-       unsigned int may_thrash:1;
+       /*
+        * Cgroups are not reclaimed below their configured memory.low,
+        * unless we threaten to OOM. If any cgroups are skipped due to
+        * memory.low and nothing was reclaimed, retry without skipping them.
+        */
+       unsigned int memcg_low_reclaim:1;
+       unsigned int memcg_low_skipped:1;
  
         unsigned int hibernation_mode:1;
  
@@ -906,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page,
          * Anonymous pages are not handled by flushers and must be written
          * from reclaim context. Do not stall reclaim based on them
          */
-       if (!page_is_file_cache(page)) {
+       if (!page_is_file_cache(page) ||
+           (PageAnon(page) && !PageSwapBacked(page))) {
                 *dirty = false;
                 *writeback = false;
                 return;
@@ -966,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 int may_enter_fs;
                 enum page_references references = PAGEREF_RECLAIM_CLEAN;
                 bool dirty, writeback;
-               bool lazyfree = false;
-               int ret = SWAP_SUCCESS;
  
                 cond_resched();
  
@@ -982,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 sc->nr_scanned++;
  
                 if (unlikely(!page_evictable(page)))
-                       goto cull_mlocked;
+                       goto activate_locked;
  
                 if (!sc->may_unmap && page_mapped(page))
                         goto keep_locked;
  
                 /* Double the slab pressure for mapped and swapcache pages */
-               if (page_mapped(page) || PageSwapCache(page))
+               if ((page_mapped(page) || PageSwapCache(page)) &&
+                   !(PageAnon(page) && !PageSwapBacked(page)))
                         sc->nr_scanned++;
  
                 may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
@@ -1114,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 /*
                  * Anonymous process memory has backing store?
                  * Try to allocate it some swap space here.
+                * Lazyfree pages can be freed directly, without swap space.
                  */
-               if (PageAnon(page) && !PageSwapCache(page)) {
+               if (PageAnon(page) && PageSwapBacked(page) &&
+                   !PageSwapCache(page)) {
                         if (!(sc->gfp_mask & __GFP_IO))
                                 goto keep_locked;
                         if (!add_to_swap(page, page_list))
                                 goto activate_locked;
-                       lazyfree = true;
                         may_enter_fs = 1;
  
                         /* Adding to swap updated mapping */
@@ -1137,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                  * The page is mapped into the page tables of one or more
                  * processes. Try to unmap it here.
                  */
-               if (page_mapped(page) && mapping) {
-                       switch (ret = try_to_unmap(page, lazyfree ?
-                               (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) :
-                               (ttu_flags | TTU_BATCH_FLUSH))) {
-                       case SWAP_FAIL:
+               if (page_mapped(page)) {
+                       if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) {
                                 nr_unmap_fail++;
                                 goto activate_locked;
-                       case SWAP_AGAIN:
-                               goto keep_locked;
-                       case SWAP_MLOCK:
-                               goto cull_mlocked;
-                       case SWAP_LZFREE:
-                               goto lazyfree;
-                       case SWAP_SUCCESS:
-                               ; /* try to free the page below */
                         }
                 }
  
@@ -1261,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         }
                 }
  
-lazyfree:
-               if (!mapping || !__remove_mapping(mapping, page, true))
-                       goto keep_locked;
+               if (PageAnon(page) && !PageSwapBacked(page)) {
+                       /* follow __remove_mapping for reference */
+                       if (!page_ref_freeze(page, 1))
+                               goto keep_locked;
+                       if (PageDirty(page)) {
+                               page_ref_unfreeze(page, 1);
+                               goto keep_locked;
+                       }
  
+                       count_vm_event(PGLAZYFREED);
+               } else if (!mapping || !__remove_mapping(mapping, page, true))
+                       goto keep_locked;
                 /*
                  * At this point, we have no other references and there is
                  * no way to pick any more up (removed from LRU, removed
@@ -1274,9 +1277,6 @@ lazyfree:
                  */
                 __ClearPageLocked(page);
  free_it:
-               if (ret == SWAP_LZFREE)
-                       count_vm_event(PGLAZYFREED);
-
                 nr_reclaimed++;
  
                 /*
@@ -1286,20 +1286,16 @@ free_it:
                 list_add(&page->lru, &free_pages);
                 continue;
  
-cull_mlocked:
-               if (PageSwapCache(page))
-                       try_to_free_swap(page);
-               unlock_page(page);
-               list_add(&page->lru, &ret_pages);
-               continue;
-
  activate_locked:
                 /* Not a candidate for swapping, so reclaim swap space. */
-               if (PageSwapCache(page) && mem_cgroup_swap_full(page))
+               if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
+                                               PageMlocked(page)))
                         try_to_free_swap(page);
                 VM_BUG_ON_PAGE(PageActive(page), page);
-               SetPageActive(page);
-               pgactivate++;
+               if (!PageMlocked(page)) {
+                       SetPageActive(page);
+                       pgactivate++;
+               }
  keep_locked:
                 unlock_page(page);
  keep:
@@ -1348,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
         }
  
         ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
-                       TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true);
+                       TTU_IGNORE_ACCESS, NULL, true);
         list_splice(&clean_pages, page_list);
         mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
         return ret;
@@ -1740,7 +1736,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (nr_taken == 0)
                 return 0;
  
-       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP,
+       nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
                                 &stat, false);
  
         spin_lock_irq(&pgdat->lru_lock);
@@ -2010,6 +2006,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
   * Both inactive lists should also be large enough that each inactive
   * page has a chance to be referenced again before it is reclaimed.
   *
+ * If that fails and refaulting is observed, the inactive list grows.
+ *
   * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages
   * on this LRU, maintained by the pageout code. A zone->inactive_ratio
   * of 3 means 3:1 or 25% of the pages are kept on the inactive list.
@@ -2026,12 +2024,15 @@ static void shrink_active_list(unsigned long nr_to_scan,
   *   10TB     320        32GB
   */
  static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                               struct scan_control *sc, bool trace)
+                                struct mem_cgroup *memcg,
+                                struct scan_control *sc, bool actual_reclaim)
  {
-       unsigned long inactive_ratio;
-       unsigned long inactive, active;
-       enum lru_list inactive_lru = file * LRU_FILE;
         enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+       enum lru_list inactive_lru = file * LRU_FILE;
+       unsigned long inactive, active;
+       unsigned long inactive_ratio;
+       unsigned long refaults;
         unsigned long gb;
  
         /*
@@ -2044,27 +2045,43 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
         inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
         active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
  
-       gb = (inactive + active) >> (30 - PAGE_SHIFT);
-       if (gb)
-               inactive_ratio = int_sqrt(10 * gb);
+       if (memcg)
+               refaults = mem_cgroup_read_stat(memcg,
+                                               MEMCG_WORKINGSET_ACTIVATE);
         else
-               inactive_ratio = 1;
+               refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
  
-       if (trace)
-               trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id,
-                               sc->reclaim_idx,
-                               lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-                               lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-                               inactive_ratio, file);
+       /*
+        * When refaults are being observed, it means a new workingset
+        * is being established. Disable active list protection to get
+        * rid of the stale workingset quickly.
+        */
+       if (file && actual_reclaim && lruvec->refaults != refaults) {
+               inactive_ratio = 0;
+       } else {
+               gb = (inactive + active) >> (30 - PAGE_SHIFT);
+               if (gb)
+                       inactive_ratio = int_sqrt(10 * gb);
+               else
+                       inactive_ratio = 1;
+       }
+
+       if (actual_reclaim)
+               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
+                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
+                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
+                       inactive_ratio, file);
  
         return inactive * inactive_ratio < active;
  }
  
  static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-                                struct lruvec *lruvec, struct scan_control *sc)
+                                struct lruvec *lruvec, struct mem_cgroup *memcg,
+                                struct scan_control *sc)
  {
         if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
+               if (inactive_list_is_low(lruvec, is_file_lru(lru),
+                                        memcg, sc, true))
                         shrink_active_list(nr_to_scan, lruvec, sc, lru);
                 return 0;
         }
@@ -2173,7 +2190,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
          * lruvec even if it has plenty of old anonymous pages unless the
          * system is under heavy pressure.
          */
-       if (!inactive_list_is_low(lruvec, true, sc, false) &&
+       if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
             lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
                 scan_balance = SCAN_FILE;
                 goto out;
@@ -2324,7 +2341,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
                                 nr[lru] -= nr_to_scan;
  
                                 nr_reclaimed += shrink_list(lru, nr_to_scan,
-                                                           lruvec, sc);
+                                                           lruvec, memcg, sc);
                         }
                 }
  
@@ -2391,7 +2408,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
          * Even if we did not try to evict anon pages at all, we want to
          * rebalance the anon lru active/inactive ratio.
          */
-       if (inactive_list_is_low(lruvec, false, sc, true))
+       if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                 shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                    sc, LRU_ACTIVE_ANON);
  }
@@ -2505,9 +2522,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                         unsigned long scanned;
  
                         if (mem_cgroup_low(root, memcg)) {
-                               if (!sc->may_thrash)
+                               if (!sc->memcg_low_reclaim) {
+                                       sc->memcg_low_skipped = 1;
                                         continue;
-                               mem_cgroup_events(memcg, MEMCG_LOW, 1);
+                               }
+                               mem_cgroup_event(memcg, MEMCG_LOW);
                         }
  
                         reclaimed = sc->nr_reclaimed;
@@ -2705,6 +2724,26 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
         sc->gfp_mask = orig_mask;
  }
  
+static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+{
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
+       do {
+               unsigned long refaults;
+               struct lruvec *lruvec;
+
+               if (memcg)
+                       refaults = mem_cgroup_read_stat(memcg,
+                                               MEMCG_WORKINGSET_ACTIVATE);
+               else
+                       refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
+
+               lruvec = mem_cgroup_lruvec(pgdat, memcg);
+               lruvec->refaults = refaults;
+       } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+}
+
  /*
   * This is the main entry point to direct page reclaim.
   *
@@ -2725,6 +2764,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
                                           struct scan_control *sc)
  {
         int initial_priority = sc->priority;
+       pg_data_t *last_pgdat;
+       struct zoneref *z;
+       struct zone *zone;
  retry:
         delayacct_freepages_start();
  
@@ -2751,6 +2793,15 @@ retry:
                         sc->may_writepage = 1;
         } while (--sc->priority >= 0);
  
+       last_pgdat = NULL;
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
+                                       sc->nodemask) {
+               if (zone->zone_pgdat == last_pgdat)
+                       continue;
+               last_pgdat = zone->zone_pgdat;
+               snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+       }
+
         delayacct_freepages_end();
  
         if (sc->nr_reclaimed)
@@ -2761,9 +2812,10 @@ retry:
                 return 1;
  
         /* Untapped cgroup reserves?  Don't OOM, retry. */
-       if (!sc->may_thrash) {
+       if (sc->memcg_low_skipped) {
                 sc->priority = initial_priority;
-               sc->may_thrash = 1;
+               sc->memcg_low_reclaim = 1;
+               sc->memcg_low_skipped = 0;
                 goto retry;
         }
  
@@ -2908,7 +2960,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
         unsigned long nr_reclaimed;
         struct scan_control sc = {
                 .nr_to_reclaim = SWAP_CLUSTER_MAX,
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
                 .reclaim_idx = gfp_zone(gfp_mask),
                 .order = order,
                 .nodemask = nodemask,
@@ -2988,7 +3040,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
         int nid;
         struct scan_control sc = {
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+               .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
                                 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
                 .reclaim_idx = MAX_NR_ZONES - 1,
                 .target_mem_cgroup = memcg,
@@ -3034,7 +3086,7 @@ static void age_active_anon(struct pglist_data *pgdat,
         do {
                 struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
  
-               if (inactive_list_is_low(lruvec, false, sc, true))
+               if (inactive_list_is_low(lruvec, false, memcg, sc, true))
                         shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                            sc, LRU_ACTIVE_ANON);
  
@@ -3042,22 +3094,44 @@ static void age_active_anon(struct pglist_data *pgdat,
         } while (memcg);
  }
  
-static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
+/*
+ * Returns true if at least one eligible zone is balanced for the requested
+ * order and classzone_idx
+ */
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
  {
-       unsigned long mark = high_wmark_pages(zone);
+       int i;
+       unsigned long mark = -1;
+       struct zone *zone;
  
-       if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx))
-               return false;
+       for (i = 0; i <= classzone_idx; i++) {
+               zone = pgdat->node_zones + i;
+
+               if (!managed_zone(zone))
+                       continue;
+
+               mark = high_wmark_pages(zone);
+               if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+                       return true;
+       }
  
         /*
-        * If any eligible zone is balanced then the node is not considered
-        * to be congested or dirty
+        * If a node has no populated zone within classzone_idx, it does not
+        * need balancing by definition. This can happen if a zone-restricted
+        * allocation tries to wake a remote kswapd.
          */
-       clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags);
-       clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags);
-       clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags);
+       if (mark == -1)
+               return true;
  
-       return true;
+       return false;
+}
+
+/* Clear pgdat state for congested, dirty or under writeback. */
+static void clear_pgdat_congested(pg_data_t *pgdat)
+{
+       clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+       clear_bit(PGDAT_DIRTY, &pgdat->flags);
+       clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
  }
  
  /*
@@ -3068,8 +3142,6 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx)
   */
  static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
  {
-       int i;
-
         /*
          * The throttled processes are normally woken up in balance_pgdat() as
          * soon as allow_direct_reclaim() is true. But there is a potential
@@ -3090,17 +3162,12 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                 return true;
  
-       for (i = 0; i <= classzone_idx; i++) {
-               struct zone *zone = pgdat->node_zones + i;
-
-               if (!managed_zone(zone))
-                       continue;
-
-               if (!zone_balanced(zone, order, classzone_idx))
-                       return false;
+       if (pgdat_balanced(pgdat, order, classzone_idx)) {
+               clear_pgdat_congested(pgdat);
+               return true;
         }
  
-       return true;
+       return false;
  }
  
  /*
@@ -3203,23 +3270,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 }
  
                 /*
-                * Only reclaim if there are no eligible zones. Check from
-                * high to low zone as allocations prefer higher zones.
-                * Scanning from low to high zone would allow congestion to be
-                * cleared during a very small window when a small low
-                * zone was balanced even under extreme pressure when the
-                * overall node may be congested. Note that sc.reclaim_idx
-                * is not used as buffer_heads_over_limit may have adjusted
-                * it.
+                * Only reclaim if there are no eligible zones. Note that
+                * sc.reclaim_idx is not used as buffer_heads_over_limit may
+                * have adjusted it.
                  */
-               for (i = classzone_idx; i >= 0; i--) {
-                       zone = pgdat->node_zones + i;
-                       if (!managed_zone(zone))
-                               continue;
-
-                       if (zone_balanced(zone, sc.order, classzone_idx))
-                               goto out;
-               }
+               if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+                       goto out;
  
                 /*
                  * Do some background aging of the anon list, to give
@@ -3277,6 +3333,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
                 pgdat->kswapd_failures++;
  
  out:
+       snapshot_refaults(NULL, pgdat);
         /*
          * Return the order kswapd stopped reclaiming at as
          * prepare_kswapd_sleep() takes it into account. If another caller
@@ -3286,6 +3343,22 @@ out:
         return sc.order;
  }
  
+/*
+ * pgdat->kswapd_classzone_idx is the highest zone index that a recent
+ * allocation request woke kswapd for. When kswapd has not woken recently,
+ * the value is MAX_NR_ZONES which is not a valid index. In that case the
+ * given classzone_idx is returned unchanged; otherwise the higher of the
+ * given index and the index kswapd was recently woken for is returned.
+ */
+static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
+                                          enum zone_type classzone_idx)
+{
+       if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+               return classzone_idx;
+
+       return max(pgdat->kswapd_classzone_idx, classzone_idx);
+}
+
  static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
                                 unsigned int classzone_idx)
  {
@@ -3297,7 +3370,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
  
         prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  
-       /* Try to sleep for a short interval */
+       /*
+        * Try to sleep for a short interval. Note that kcompactd will only be
+        * woken if it is possible to sleep for a short interval. This is
+        * deliberate on the assumption that if reclaim cannot keep an
+        * eligible zone balanced then it's also unlikely that compaction will
+        * succeed.
+        */
         if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
                 /*
                  * Compaction records what page blocks it recently failed to
@@ -3321,7 +3400,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
                  * the previous request that slept prematurely.
                  */
                 if (remaining) {
-                       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+                       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
                         pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
                 }
  
@@ -3375,7 +3454,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
   */
  static int kswapd(void *p)
  {
-       unsigned int alloc_order, reclaim_order, classzone_idx;
+       unsigned int alloc_order, reclaim_order;
+       unsigned int classzone_idx = MAX_NR_ZONES - 1;
         pg_data_t *pgdat = (pg_data_t*)p;
         struct task_struct *tsk = current;
  
@@ -3405,20 +3485,23 @@ static int kswapd(void *p)
         tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
         set_freezable();
  
-       pgdat->kswapd_order = alloc_order = reclaim_order = 0;
-       pgdat->kswapd_classzone_idx = classzone_idx = 0;
+       pgdat->kswapd_order = 0;
+       pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
         for ( ; ; ) {
                 bool ret;
  
+               alloc_order = reclaim_order = pgdat->kswapd_order;
+               classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+
  kswapd_try_sleep:
                 kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                                         classzone_idx);
  
                 /* Read the new order and classzone_idx */
                 alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = pgdat->kswapd_classzone_idx;
+               classzone_idx = kswapd_classzone_idx(pgdat, 0);
                 pgdat->kswapd_order = 0;
-               pgdat->kswapd_classzone_idx = 0;
+               pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
  
                 ret = try_to_freeze();
                 if (kthread_should_stop())
@@ -3444,9 +3527,6 @@ kswapd_try_sleep:
                 reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
                 if (reclaim_order < alloc_order)
                         goto kswapd_try_sleep;
-
-               alloc_order = reclaim_order = pgdat->kswapd_order;
-               classzone_idx = pgdat->kswapd_classzone_idx;
         }
  
         tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
@@ -3462,7 +3542,6 @@ kswapd_try_sleep:
  void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
  {
         pg_data_t *pgdat;
-       int z;
  
         if (!managed_zone(zone))
                 return;
@@ -3470,7 +3549,8 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
                 return;
         pgdat = zone->zone_pgdat;
-       pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx);
+       pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
+                                                          classzone_idx);
         pgdat->kswapd_order = max(pgdat->kswapd_order, order);
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
@@ -3479,17 +3559,10 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
                 return;
  
-       /* Only wake kswapd if all zones are unbalanced */
-       for (z = 0; z <= classzone_idx; z++) {
-               zone = pgdat->node_zones + z;
-               if (!managed_zone(zone))
-                       continue;
-
-               if (zone_balanced(zone, order, classzone_idx))
-                       return;
-       }
+       if (pgdat_balanced(pgdat, order, classzone_idx))
+               return;
  
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order);
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
         wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
@@ -3695,7 +3768,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
         int classzone_idx = gfp_zone(gfp_mask);
         struct scan_control sc = {
                 .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
-               .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)),
+               .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)),
                 .order = order,
                 .priority = NODE_RECLAIM_PRIORITY,
                 .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),