diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee4eecc..572fb17 100644
@@ -79,6 +79,13 @@ struct scan_control {
         */
        struct mem_cgroup *target_mem_cgroup;
 
+       /* Can active pages be deactivated as part of reclaim? */
+#define DEACTIVATE_ANON 1
+#define DEACTIVATE_FILE 2
+       unsigned int may_deactivate:2;
+       unsigned int force_deactivate:1;
+       unsigned int skipped_deactivate:1;
+
        /* Writepage batching in laptop mode; RECLAIM_WRITE */
        unsigned int may_writepage:1;
 
@@ -101,6 +108,12 @@ struct scan_control {
        /* One of the zones is ready for compaction */
        unsigned int compaction_ready:1;
 
+       /* There is easily reclaimable cold cache in the current node */
+       unsigned int cache_trim_mode:1;
+
+       /* The file pages on the current node are dangerously low */
+       unsigned int file_is_tiny:1;
+
        /* Allocation order */
        s8 order;
 
@@ -239,13 +252,13 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
        up_write(&shrinker_rwsem);
 }
 
-static bool global_reclaim(struct scan_control *sc)
+static bool cgroup_reclaim(struct scan_control *sc)
 {
-       return !sc->target_mem_cgroup;
+       return sc->target_mem_cgroup;
 }
 
 /**
- * sane_reclaim - is the usual dirty throttling mechanism operational?
+ * writeback_throttling_sane - is the usual dirty throttling mechanism available?
  * @sc: scan_control in question
  *
  * The normal page dirty throttling mechanism in balance_dirty_pages() is
@@ -257,11 +270,9 @@ static bool global_reclaim(struct scan_control *sc)
  * This function tests whether the vmscan currently in progress can assume
  * that the normal dirty throttling mechanism is operational.
  */
-static bool sane_reclaim(struct scan_control *sc)
+static bool writeback_throttling_sane(struct scan_control *sc)
 {
-       struct mem_cgroup *memcg = sc->target_mem_cgroup;
-
-       if (!memcg)
+       if (!cgroup_reclaim(sc))
                return true;
 #ifdef CONFIG_CGROUP_WRITEBACK
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
@@ -269,29 +280,6 @@ static bool sane_reclaim(struct scan_control *sc)
 #endif
        return false;
 }
-
-static void set_memcg_congestion(pg_data_t *pgdat,
-                               struct mem_cgroup *memcg,
-                               bool congested)
-{
-       struct mem_cgroup_per_node *mn;
-
-       if (!memcg)
-               return;
-
-       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
-       WRITE_ONCE(mn->congested, congested);
-}
-
-static bool memcg_congested(pg_data_t *pgdat,
-                       struct mem_cgroup *memcg)
-{
-       struct mem_cgroup_per_node *mn;
-
-       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
-       return READ_ONCE(mn->congested);
-
-}
 #else
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
@@ -302,27 +290,15 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 {
 }
 
-static bool global_reclaim(struct scan_control *sc)
+static bool cgroup_reclaim(struct scan_control *sc)
 {
-       return true;
+       return false;
 }
 
-static bool sane_reclaim(struct scan_control *sc)
+static bool writeback_throttling_sane(struct scan_control *sc)
 {
        return true;
 }
-
-static inline void set_memcg_congestion(struct pglist_data *pgdat,
-                               struct mem_cgroup *memcg, bool congested)
-{
-}
-
-static inline bool memcg_congested(struct pglist_data *pgdat,
-                       struct mem_cgroup *memcg)
-{
-       return false;
-
-}
 #endif
 
 /*
@@ -351,32 +327,21 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
  */
 unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
 {
-       unsigned long lru_size = 0;
+       unsigned long size = 0;
        int zid;
 
-       if (!mem_cgroup_disabled()) {
-               for (zid = 0; zid < MAX_NR_ZONES; zid++)
-                       lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
-       } else
-               lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
-
-       for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
+       for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) {
                struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];
-               unsigned long size;
 
                if (!managed_zone(zone))
                        continue;
 
                if (!mem_cgroup_disabled())
-                       size = mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+                       size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
                else
-                       size = zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zid],
-                                      NR_ZONE_LRU_BASE + lru);
-               lru_size -= min(size, lru_size);
+                       size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
        }
-
-       return lru_size;
-
+       return size;
 }
 
 /*
@@ -422,7 +387,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
        if (shrinker->flags & SHRINKER_MEMCG_AWARE)
                idr_replace(&shrinker_idr, shrinker, shrinker->id);
 #endif
@@ -775,7 +740,7 @@ static inline int is_page_cache_freeable(struct page *page)
        return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
 }
 
-static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
+static int may_write_to_inode(struct inode *inode)
 {
        if (current->flags & PF_SWAPWRITE)
                return 1;
@@ -823,8 +788,7 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping,
-                        struct scan_control *sc)
+static pageout_t pageout(struct page *page, struct address_space *mapping)
 {
        /*
         * If the page is dirty, only perform writeback if that write
@@ -860,7 +824,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
        }
        if (mapping->a_ops->writepage == NULL)
                return PAGE_ACTIVATE;
-       if (!may_write_to_inode(mapping->host, sc))
+       if (!may_write_to_inode(mapping->host))
                return PAGE_KEEP;
 
        if (clear_page_dirty_for_io(page)) {
@@ -899,7 +863,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
  * gets returned with a refcount of 0.
  */
 static int __remove_mapping(struct address_space *mapping, struct page *page,
-                           bool reclaimed)
+                           bool reclaimed, struct mem_cgroup *target_memcg)
 {
        unsigned long flags;
        int refcount;
@@ -971,7 +935,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 */
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
-                       shadow = workingset_eviction(page);
+                       shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
                xa_unlock_irqrestore(&mapping->i_pages, flags);
 
@@ -994,7 +958,7 @@ cannot_free:
  */
 int remove_mapping(struct address_space *mapping, struct page *page)
 {
-       if (__remove_mapping(mapping, page, false)) {
+       if (__remove_mapping(mapping, page, false, NULL)) {
                /*
                 * Unfreezing the refcount with 1 rather than 2 effectively
                 * drops the pagecache ref for us without requiring another
@@ -1239,7 +1203,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto activate_locked;
 
                        /* Case 2 above */
-                       } else if (sane_reclaim(sc) ||
+                       } else if (writeback_throttling_sane(sc) ||
                            !PageReclaim(page) || !may_enter_fs) {
                                /*
                                 * This is slightly racy - end_page_writeback()
@@ -1394,7 +1358,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         * starts and then write it out here.
                         */
                        try_to_unmap_flush_dirty();
-                       switch (pageout(page, mapping, sc)) {
+                       switch (pageout(page, mapping)) {
                        case PAGE_KEEP:
                                goto keep_locked;
                        case PAGE_ACTIVATE:
@@ -1472,7 +1436,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
-               } else if (!mapping || !__remove_mapping(mapping, page, true))
+               } else if (!mapping || !__remove_mapping(mapping, page, true,
+                                                        sc->target_mem_cgroup))
                        goto keep_locked;
 
                unlock_page(page);
@@ -1820,7 +1785,7 @@ int isolate_lru_page(struct page *page)
 
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
- * then get resheduled. When there are massive number of tasks doing page
+ * then get rescheduled. When a massive number of tasks are doing page
  * allocation, such sleeping direct reclaimers may keep piling up on each CPU,
  * the LRU list will go small and be scanned faster than necessary, leading to
  * unnecessary swapping, thrashing and OOM.
@@ -1833,7 +1798,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
        if (current_is_kswapd())
                return 0;
 
-       if (!sane_reclaim(sc))
+       if (!writeback_throttling_sane(sc))
                return 0;
 
        if (file) {
@@ -1983,7 +1948,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        reclaim_stat->recent_scanned[file] += nr_taken;
 
        item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
-       if (global_reclaim(sc))
+       if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_scanned);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
        spin_unlock_irq(&pgdat->lru_lock);
@@ -1997,7 +1962,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        spin_lock_irq(&pgdat->lru_lock);
 
        item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
-       if (global_reclaim(sc))
+       if (!cgroup_reclaim(sc))
                __count_vm_events(item, nr_reclaimed);
        __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
        reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
@@ -2199,6 +2164,20 @@ unsigned long reclaim_pages(struct list_head *page_list)
        return nr_reclaimed;
 }
 
+static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
+                                struct lruvec *lruvec, struct scan_control *sc)
+{
+       if (is_active_lru(lru)) {
+               if (sc->may_deactivate & (1 << is_file_lru(lru)))
+                       shrink_active_list(nr_to_scan, lruvec, sc, lru);
+               else
+                       sc->skipped_deactivate = 1;
+               return 0;
+       }
+
+       return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
+}
+
 /*
  * The inactive anon list should be small enough that the VM never has
  * to do too much work.
@@ -2227,64 +2206,25 @@ unsigned long reclaim_pages(struct list_head *page_list)
  *    1TB     101        10GB
  *   10TB     320        32GB
  */
-static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
-                                struct scan_control *sc, bool trace)
+static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
 {
-       enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
-       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
-       enum lru_list inactive_lru = file * LRU_FILE;
+       enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
        unsigned long inactive, active;
        unsigned long inactive_ratio;
-       unsigned long refaults;
        unsigned long gb;
 
-       /*
-        * If we don't have swap space, anonymous page deactivation
-        * is pointless.
-        */
-       if (!file && !total_swap_pages)
-               return false;
-
-       inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
-       active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
-
-       /*
-        * When refaults are being observed, it means a new workingset
-        * is being established. Disable active list protection to get
-        * rid of the stale workingset quickly.
-        */
-       refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-       if (file && lruvec->refaults != refaults) {
-               inactive_ratio = 0;
-       } else {
-               gb = (inactive + active) >> (30 - PAGE_SHIFT);
-               if (gb)
-                       inactive_ratio = int_sqrt(10 * gb);
-               else
-                       inactive_ratio = 1;
-       }
+       inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
+       active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
 
-       if (trace)
-               trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
-                       lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
-                       lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
-                       inactive_ratio, file);
+       gb = (inactive + active) >> (30 - PAGE_SHIFT);
+       if (gb)
+               inactive_ratio = int_sqrt(10 * gb);
+       else
+               inactive_ratio = 1;
 
        return inactive * inactive_ratio < active;
 }
 
-static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
-                                struct lruvec *lruvec, struct scan_control *sc)
-{
-       if (is_active_lru(lru)) {
-               if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
-                       shrink_active_list(nr_to_scan, lruvec, sc, lru);
-               return 0;
-       }
-
-       return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
-}
-
 enum scan_balance {
        SCAN_EQUAL,
        SCAN_FRACT,
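
The ratio computed by the new inactive_is_low() above reproduces the table
in the comment block: a quick sanity check in standalone userspace C
(illustrative only, not part of the patch; int_sqrt here just mimics the
floor-of-square-root contract of the kernel helper):

#include <stdio.h>

/* floor(sqrt(x)), same contract as the kernel's int_sqrt() */
static unsigned long int_sqrt(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* total (inactive + active) LRU size in GB */
	unsigned long gbs[] = { 10, 100, 1024, 10240 };
	int i;

	for (i = 0; i < 4; i++) {
		unsigned long gb = gbs[i];
		unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

		/* inactive_is_low() fires when inactive * ratio < active */
		printf("%6luGB -> inactive_ratio %lu\n", gb, ratio);
	}
	return 0;	/* prints 10, 31, 101, 320; the last two match the 1TB/10TB rows above */
}
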
@@ -2301,10 +2241,10 @@ enum scan_balance {
  * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan
  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
-static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
-                          struct scan_control *sc, unsigned long *nr,
-                          unsigned long *lru_pages)
+static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
+                          unsigned long *nr)
 {
+       struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        int swappiness = mem_cgroup_swappiness(memcg);
        struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
        u64 fraction[2];
@@ -2329,7 +2269,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
         * using the memory controller's swap limit feature would be
         * too expensive.
         */
-       if (!global_reclaim(sc) && !swappiness) {
+       if (cgroup_reclaim(sc) && !swappiness) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2345,58 +2285,18 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        }
 
        /*
-        * Prevent the reclaimer from falling into the cache trap: as
-        * cache pages start out inactive, every cache fault will tip
-        * the scan balance towards the file LRU.  And as the file LRU
-        * shrinks, so does the window for rotation from references.
-        * This means we have a runaway feedback loop where a tiny
-        * thrashing file LRU becomes infinitely more attractive than
-        * anon pages.  Try to detect this based on file LRU size.
+        * If the system is almost out of file pages, force-scan anon.
         */
-       if (global_reclaim(sc)) {
-               unsigned long pgdatfile;
-               unsigned long pgdatfree;
-               int z;
-               unsigned long total_high_wmark = 0;
-
-               pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
-               pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) +
-                          node_page_state(pgdat, NR_INACTIVE_FILE);
-
-               for (z = 0; z < MAX_NR_ZONES; z++) {
-                       struct zone *zone = &pgdat->node_zones[z];
-                       if (!managed_zone(zone))
-                               continue;
-
-                       total_high_wmark += high_wmark_pages(zone);
-               }
-
-               if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) {
-                       /*
-                        * Force SCAN_ANON if there are enough inactive
-                        * anonymous pages on the LRU in eligible zones.
-                        * Otherwise, the small LRU gets thrashed.
-                        */
-                       if (!inactive_list_is_low(lruvec, false, sc, false) &&
-                           lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
-                                       >> sc->priority) {
-                               scan_balance = SCAN_ANON;
-                               goto out;
-                       }
-               }
+       if (sc->file_is_tiny) {
+               scan_balance = SCAN_ANON;
+               goto out;
        }
 
        /*
-        * If there is enough inactive page cache, i.e. if the size of the
-        * inactive list is greater than that of the active list *and* the
-        * inactive list actually has some pages to scan on this priority, we
-        * do not reclaim anything from the anonymous working set right now.
-        * Without the second condition we could end up never scanning an
-        * lruvec even if it has plenty of old anonymous pages unless the
-        * system is under heavy pressure.
+        * If there is enough inactive page cache, we do not reclaim
+        * anything from the anonymous working set right now.
         */
-       if (!inactive_list_is_low(lruvec, true, sc, false) &&
-           lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
+       if (sc->cache_trim_mode) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2454,7 +2354,6 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg,
        fraction[1] = fp;
        denominator = ap + fp + 1;
 out:
-       *lru_pages = 0;
        for_each_evictable_lru(lru) {
                int file = is_file_lru(lru);
                unsigned long lruvec_size;
@@ -2549,18 +2448,12 @@ out:
                        BUG();
                }
 
-               *lru_pages += lruvec_size;
                nr[lru] = scan;
        }
 }
 
-/*
- * This is a basic per-node page freer.  Used by both kswapd and direct reclaim.
- */
-static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg,
-                             struct scan_control *sc, unsigned long *lru_pages)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
-       struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
        unsigned long nr[NR_LRU_LISTS];
        unsigned long targets[NR_LRU_LISTS];
        unsigned long nr_to_scan;
@@ -2570,7 +2463,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
        struct blk_plug plug;
        bool scan_adjusted;
 
-       get_scan_count(lruvec, memcg, sc, nr, lru_pages);
+       get_scan_count(lruvec, sc, nr);
 
        /* Record the original scan target for proportional adjustments later */
        memcpy(targets, nr, sizeof(nr));
@@ -2586,7 +2479,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * abort proportional reclaim if either the file or anon lru has already
         * dropped to zero at the first pass.
         */
-       scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() &&
+       scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
                         sc->priority == DEF_PRIORITY);
 
        blk_start_plug(&plug);
@@ -2668,7 +2561,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (inactive_list_is_low(lruvec, false, sc, true))
+       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2744,156 +2637,234 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
        return inactive_lru_pages > pages_for_compaction;
 }
 
-static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
 {
-       return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
-               (memcg && memcg_congested(pgdat, memcg));
+       struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
+       struct mem_cgroup *memcg;
+
+       memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+       do {
+               struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
+               unsigned long reclaimed;
+               unsigned long scanned;
+
+               switch (mem_cgroup_protected(target_memcg, memcg)) {
+               case MEMCG_PROT_MIN:
+                       /*
+                        * Hard protection.
+                        * If there is no reclaimable memory, OOM.
+                        */
+                       continue;
+               case MEMCG_PROT_LOW:
+                       /*
+                        * Soft protection.
+                        * Respect the protection only as long as
+                        * there is an unprotected supply
+                        * of reclaimable memory from other cgroups.
+                        */
+                       if (!sc->memcg_low_reclaim) {
+                               sc->memcg_low_skipped = 1;
+                               continue;
+                       }
+                       memcg_memory_event(memcg, MEMCG_LOW);
+                       break;
+               case MEMCG_PROT_NONE:
+                       /*
+                        * All protection thresholds breached. We may
+                        * still choose to vary the scan pressure
+                        * applied based on by how much the cgroup in
+                        * question has exceeded its protection
+                        * thresholds (see get_scan_count).
+                        */
+                       break;
+               }
+
+               reclaimed = sc->nr_reclaimed;
+               scanned = sc->nr_scanned;
+
+               shrink_lruvec(lruvec, sc);
+
+               shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+                           sc->priority);
+
+               /* Record the group's reclaim efficiency */
+               vmpressure(sc->gfp_mask, memcg, false,
+                          sc->nr_scanned - scanned,
+                          sc->nr_reclaimed - reclaimed);
+
+       } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
 }
 
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long nr_reclaimed, nr_scanned;
+       struct lruvec *target_lruvec;
        bool reclaimable = false;
+       unsigned long file;
 
-       do {
-               struct mem_cgroup *root = sc->target_mem_cgroup;
-               unsigned long node_lru_pages = 0;
-               struct mem_cgroup *memcg;
+       target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
-               memset(&sc->nr, 0, sizeof(sc->nr));
+again:
+       memset(&sc->nr, 0, sizeof(sc->nr));
 
-               nr_reclaimed = sc->nr_reclaimed;
-               nr_scanned = sc->nr_scanned;
+       nr_reclaimed = sc->nr_reclaimed;
+       nr_scanned = sc->nr_scanned;
 
-               memcg = mem_cgroup_iter(root, NULL, NULL);
-               do {
-                       unsigned long lru_pages;
-                       unsigned long reclaimed;
-                       unsigned long scanned;
+       /*
+        * Target desirable inactive:active list ratios for the anon
+        * and file LRU lists.
+        */
+       if (!sc->force_deactivate) {
+               unsigned long refaults;
 
-                       switch (mem_cgroup_protected(root, memcg)) {
-                       case MEMCG_PROT_MIN:
-                               /*
-                                * Hard protection.
-                                * If there is no reclaimable memory, OOM.
-                                */
-                               continue;
-                       case MEMCG_PROT_LOW:
-                               /*
-                                * Soft protection.
-                                * Respect the protection only as long as
-                                * there is an unprotected supply
-                                * of reclaimable memory from other cgroups.
-                                */
-                               if (!sc->memcg_low_reclaim) {
-                                       sc->memcg_low_skipped = 1;
-                                       continue;
-                               }
-                               memcg_memory_event(memcg, MEMCG_LOW);
-                               break;
-                       case MEMCG_PROT_NONE:
-                               /*
-                                * All protection thresholds breached. We may
-                                * still choose to vary the scan pressure
-                                * applied based on by how much the cgroup in
-                                * question has exceeded its protection
-                                * thresholds (see get_scan_count).
-                                */
-                               break;
-                       }
+               if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+                       sc->may_deactivate |= DEACTIVATE_ANON;
+               else
+                       sc->may_deactivate &= ~DEACTIVATE_ANON;
 
-                       reclaimed = sc->nr_reclaimed;
-                       scanned = sc->nr_scanned;
-                       shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
-                       node_lru_pages += lru_pages;
+               /*
+                * When refaults are being observed, it means a new
+                * workingset is being established. Deactivate to get
+                * rid of any stale active pages quickly.
+                */
+               refaults = lruvec_page_state(target_lruvec,
+                                            WORKINGSET_ACTIVATE);
+               if (refaults != target_lruvec->refaults ||
+                   inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
+                       sc->may_deactivate |= DEACTIVATE_FILE;
+               else
+                       sc->may_deactivate &= ~DEACTIVATE_FILE;
+       } else
+               sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
 
-                       shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
-                                       sc->priority);
+       /*
+        * If we have plenty of inactive file pages that aren't
+        * thrashing, try to reclaim those first before touching
+        * anonymous pages.
+        */
+       file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
+       if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
+               sc->cache_trim_mode = 1;
+       else
+               sc->cache_trim_mode = 0;
+
+       /*
+        * Prevent the reclaimer from falling into the cache trap: as
+        * cache pages start out inactive, every cache fault will tip
+        * the scan balance towards the file LRU.  And as the file LRU
+        * shrinks, so does the window for rotation from references.
+        * This means we have a runaway feedback loop where a tiny
+        * thrashing file LRU becomes infinitely more attractive than
+        * anon pages.  Try to detect this based on file LRU size.
+        */
+       if (!cgroup_reclaim(sc)) {
+               unsigned long total_high_wmark = 0;
+               unsigned long free, anon;
+               int z;
 
-                       /* Record the group's reclaim efficiency */
-                       vmpressure(sc->gfp_mask, memcg, false,
-                                  sc->nr_scanned - scanned,
-                                  sc->nr_reclaimed - reclaimed);
+               free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
+               file = node_page_state(pgdat, NR_ACTIVE_FILE) +
+                          node_page_state(pgdat, NR_INACTIVE_FILE);
 
-               } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
+               for (z = 0; z < MAX_NR_ZONES; z++) {
+                       struct zone *zone = &pgdat->node_zones[z];
+                       if (!managed_zone(zone))
+                               continue;
 
-               if (reclaim_state) {
-                       sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-                       reclaim_state->reclaimed_slab = 0;
+                       total_high_wmark += high_wmark_pages(zone);
                }
 
-               /* Record the subtree's reclaim efficiency */
-               vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
-                          sc->nr_scanned - nr_scanned,
-                          sc->nr_reclaimed - nr_reclaimed);
+               /*
+                * Consider anon: if that's low too, this isn't a
+                * runaway file reclaim problem, but rather just
+                * extreme pressure. Reclaim as per usual then.
+                */
+               anon = node_page_state(pgdat, NR_INACTIVE_ANON);
 
-               if (sc->nr_reclaimed - nr_reclaimed)
-                       reclaimable = true;
+               sc->file_is_tiny =
+                       file + free <= total_high_wmark &&
+                       !(sc->may_deactivate & DEACTIVATE_ANON) &&
+                       anon >> sc->priority;
+       }
 
-               if (current_is_kswapd()) {
-                       /*
-                        * If reclaim is isolating dirty pages under writeback,
-                        * it implies that the long-lived page allocation rate
-                        * is exceeding the page laundering rate. Either the
-                        * global limits are not being effective at throttling
-                        * processes due to the page distribution throughout
-                        * zones or there is heavy usage of a slow backing
-                        * device. The only option is to throttle from reclaim
-                        * context which is not ideal as there is no guarantee
-                        * the dirtying process is throttled in the same way
-                        * balance_dirty_pages() manages.
-                        *
-                        * Once a node is flagged PGDAT_WRITEBACK, kswapd will
-                        * count the number of pages under pages flagged for
-                        * immediate reclaim and stall if any are encountered
-                        * in the nr_immediate check below.
-                        */
-                       if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
-                               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+       shrink_node_memcgs(pgdat, sc);
 
-                       /*
-                        * Tag a node as congested if all the dirty pages
-                        * scanned were backed by a congested BDI and
-                        * wait_iff_congested will stall.
-                        */
-                       if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
-                               set_bit(PGDAT_CONGESTED, &pgdat->flags);
+       if (reclaim_state) {
+               sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+               reclaim_state->reclaimed_slab = 0;
+       }
 
-                       /* Allow kswapd to start writing pages during reclaim.*/
-                       if (sc->nr.unqueued_dirty == sc->nr.file_taken)
-                               set_bit(PGDAT_DIRTY, &pgdat->flags);
+       /* Record the subtree's reclaim efficiency */
+       vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
+                  sc->nr_scanned - nr_scanned,
+                  sc->nr_reclaimed - nr_reclaimed);
 
-                       /*
-                        * If kswapd scans pages marked marked for immediate
-                        * reclaim and under writeback (nr_immediate), it
-                        * implies that pages are cycling through the LRU
-                        * faster than they are written so also forcibly stall.
-                        */
-                       if (sc->nr.immediate)
-                               congestion_wait(BLK_RW_ASYNC, HZ/10);
-               }
+       if (sc->nr_reclaimed - nr_reclaimed)
+               reclaimable = true;
 
+       if (current_is_kswapd()) {
                /*
-                * Legacy memcg will stall in page writeback so avoid forcibly
-                * stalling in wait_iff_congested().
+                * If reclaim is isolating dirty pages under writeback,
+                * it implies that the long-lived page allocation rate
+                * is exceeding the page laundering rate. Either the
+                * global limits are not being effective at throttling
+                * processes due to the page distribution throughout
+                * zones or there is heavy usage of a slow backing
+                * device. The only option is to throttle from reclaim
+                * context which is not ideal as there is no guarantee
+                * the dirtying process is throttled in the same way
+                * balance_dirty_pages() manages.
+                *
+                * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+                * count the number of pages under writeback that are flagged
+                * for immediate reclaim and stall if any are encountered
+                * in the nr_immediate check below.
                 */
-               if (!global_reclaim(sc) && sane_reclaim(sc) &&
-                   sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
-                       set_memcg_congestion(pgdat, root, true);
+               if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+                       set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+               /* Allow kswapd to start writing pages during reclaim.*/
+               if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+                       set_bit(PGDAT_DIRTY, &pgdat->flags);
 
                /*
-                * Stall direct reclaim for IO completions if underlying BDIs
-                * and node is congested. Allow kswapd to continue until it
-                * starts encountering unqueued dirty pages or cycling through
-                * the LRU too quickly.
+                * If kswapd scans pages marked for immediate
+                * reclaim and under writeback (nr_immediate), it
+                * implies that pages are cycling through the LRU
+                * faster than they are written so also forcibly stall.
                 */
-               if (!sc->hibernation_mode && !current_is_kswapd() &&
-                  current_may_throttle() && pgdat_memcg_congested(pgdat, root))
-                       wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+               if (sc->nr.immediate)
+                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+       }
 
-       } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
-                                        sc));
+       /*
+        * Tag a node/memcg as congested if all the dirty pages
+        * scanned were backed by a congested BDI and
+        * wait_iff_congested will stall.
+        *
+        * Legacy memcg will stall in page writeback so avoid forcibly
+        * stalling in wait_iff_congested().
+        */
+       if ((current_is_kswapd() ||
+            (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
+           sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+               set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
+
+       /*
+        * Stall direct reclaim for IO completions if underlying BDIs
+        * and node is congested. Allow kswapd to continue until it
+        * starts encountering unqueued dirty pages or cycling through
+        * the LRU too quickly.
+        */
+       if (!current_is_kswapd() && current_may_throttle() &&
+           !sc->hibernation_mode &&
+           test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
+               wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
+       if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
+                                   sc))
+               goto again;
 
        /*
         * Kswapd gives up on balancing particular nodes after too
@@ -2973,7 +2944,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
                 * Take care memory controller reclaiming has small influence
                 * to global LRU.
                 */
-               if (global_reclaim(sc)) {
+               if (!cgroup_reclaim(sc)) {
                        if (!cpuset_zone_allowed(zone,
                                                 GFP_KERNEL | __GFP_HARDWALL))
                                continue;
@@ -3032,19 +3003,14 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
        sc->gfp_mask = orig_mask;
 }
 
-static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat)
+static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
 {
-       struct mem_cgroup *memcg;
-
-       memcg = mem_cgroup_iter(root_memcg, NULL, NULL);
-       do {
-               unsigned long refaults;
-               struct lruvec *lruvec;
+       struct lruvec *target_lruvec;
+       unsigned long refaults;
 
-               lruvec = mem_cgroup_lruvec(pgdat, memcg);
-               refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
-               lruvec->refaults = refaults;
-       } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
+       target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+       refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
+       target_lruvec->refaults = refaults;
 }
 
 /*
@@ -3073,7 +3039,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 retry:
        delayacct_freepages_start();
 
-       if (global_reclaim(sc))
+       if (!cgroup_reclaim(sc))
                __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
 
        do {
@@ -3102,8 +3068,16 @@ retry:
                if (zone->zone_pgdat == last_pgdat)
                        continue;
                last_pgdat = zone->zone_pgdat;
+
                snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
-               set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
+
+               if (cgroup_reclaim(sc)) {
+                       struct lruvec *lruvec;
+
+                       lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
+                                                  zone->zone_pgdat);
+                       clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
+               }
        }
 
        delayacct_freepages_end();
@@ -3115,9 +3089,27 @@ retry:
        if (sc->compaction_ready)
                return 1;
 
+       /*
+        * We make inactive:active ratio decisions based on the node's
+        * composition of memory, but a restrictive reclaim_idx or a
+        * memory.low cgroup setting can exempt large amounts of
+        * memory from reclaim. Neither of which are very common, so
+        * instead of doing costly eligibility calculations of the
+        * entire cgroup subtree up front, we assume the estimates are
+        * good, and retry with forcible deactivation if that fails.
+        */
+       if (sc->skipped_deactivate) {
+               sc->priority = initial_priority;
+               sc->force_deactivate = 1;
+               sc->skipped_deactivate = 0;
+               goto retry;
+       }
+
        /* Untapped cgroup reserves?  Don't OOM, retry. */
        if (sc->memcg_low_skipped) {
                sc->priority = initial_priority;
+               sc->force_deactivate = 0;
+               sc->skipped_deactivate = 0;
                sc->memcg_low_reclaim = 1;
                sc->memcg_low_skipped = 0;
                goto retry;
@@ -3309,6 +3301,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                                                pg_data_t *pgdat,
                                                unsigned long *nr_scanned)
 {
+       struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
        struct scan_control sc = {
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .target_mem_cgroup = memcg,
@@ -3317,7 +3310,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
                .reclaim_idx = MAX_NR_ZONES - 1,
                .may_swap = !noswap,
        };
-       unsigned long lru_pages;
 
        WARN_ON_ONCE(!current->reclaim_state);
 
@@ -3334,7 +3326,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
         * will pick up pages from other mem cgroup's as well. We hack
         * the priority and make it zero.
         */
-       shrink_node_memcg(pgdat, memcg, &sc, &lru_pages);
+       shrink_lruvec(lruvec, &sc);
 
        trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -3348,10 +3340,8 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                                           gfp_t gfp_mask,
                                           bool may_swap)
 {
-       struct zonelist *zonelist;
        unsigned long nr_reclaimed;
        unsigned long pflags;
-       int nid;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3364,16 +3354,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
                .may_unmap = 1,
                .may_swap = may_swap,
        };
-
-       set_task_reclaim_state(current, &sc.reclaim_state);
        /*
-        * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
-        * take care of from where we get pages. So the node where we start the
-        * scan does not need to be the current node.
+        * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
+        * equal pressure on all the nodes. This is based on the assumption that
+        * the reclaim does not bail out early.
         */
-       nid = mem_cgroup_select_victim_node(memcg);
+       struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 
-       zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
+       set_task_reclaim_state(current, &sc.reclaim_state);
 
        trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
 
@@ -3396,18 +3384,20 @@ static void age_active_anon(struct pglist_data *pgdat,
                                struct scan_control *sc)
 {
        struct mem_cgroup *memcg;
+       struct lruvec *lruvec;
 
        if (!total_swap_pages)
                return;
 
+       lruvec = mem_cgroup_lruvec(NULL, pgdat);
+       if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+               return;
+
        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
-               struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
-
-               if (inactive_list_is_low(lruvec, false, sc, true))
-                       shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
-                                          sc, LRU_ACTIVE_ANON);
-
+               lruvec = mem_cgroup_lruvec(memcg, pgdat);
+               shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
+                                  sc, LRU_ACTIVE_ANON);
                memcg = mem_cgroup_iter(NULL, memcg, NULL);
        } while (memcg);
 }
@@ -3475,7 +3465,9 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 /* Clear pgdat state for congested, dirty or under writeback. */
 static void clear_pgdat_congested(pg_data_t *pgdat)
 {
-       clear_bit(PGDAT_CONGESTED, &pgdat->flags);
+       struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
+
+       clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
        clear_bit(PGDAT_DIRTY, &pgdat->flags);
        clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
 }