Merge tag 'mips_fixes_4.17_1' of git://git.kernel.org/pub/scm/linux/kernel/git/jhogan...
mm/vmscan.c
index cd5dc3f..8b920ce 100644
@@ -116,6 +116,16 @@ struct scan_control {
 
        /* Number of pages freed so far during a call to shrink_zones() */
        unsigned long nr_reclaimed;
+
+       struct {
+               unsigned int dirty;
+               unsigned int unqueued_dirty;
+               unsigned int congested;
+               unsigned int writeback;
+               unsigned int immediate;
+               unsigned int file_taken;
+               unsigned int taken;
+       } nr;
 };
 
 #ifdef ARCH_HAS_PREFETCH
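The new nr sub-struct turns the per-LRU-list reclaim counters into accumulators that live for a whole shrink_node() pass, so the throttling decisions further down can look at node-wide totals. A minimal, compile-and-run sketch of that accumulate-then-decide pattern (struct layout and names here are illustrative, not the kernel's):

#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins for the kernel's reclaim_stat / scan_control. */
struct reclaim_stat { unsigned dirty, congested, writeback; };
struct scan_control { struct { unsigned dirty, congested, writeback, taken; } nr; };

/* Each LRU pass produces a reclaim_stat; the caller folds it into sc->nr. */
static void fold_stats(struct scan_control *sc,
		       const struct reclaim_stat *stat, unsigned taken)
{
	sc->nr.dirty     += stat->dirty;
	sc->nr.congested += stat->congested;
	sc->nr.writeback += stat->writeback;
	sc->nr.taken     += taken;
}

int main(void)
{
	struct scan_control sc;
	struct reclaim_stat pass1 = { .dirty = 4, .congested = 4 };
	struct reclaim_stat pass2 = { .dirty = 2, .congested = 2, .writeback = 1 };

	memset(&sc.nr, 0, sizeof(sc.nr));	/* as shrink_node() does per loop */
	fold_stats(&sc, &pass1, 16);
	fold_stats(&sc, &pass2, 16);

	/* Decide once over the whole node pass, not per LRU list. */
	if (sc.nr.dirty && sc.nr.dirty == sc.nr.congested)
		printf("node looks congested (%u dirty pages)\n", sc.nr.dirty);
	return 0;
}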
@@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc)
 #endif
        return false;
 }
+
+static void set_memcg_congestion(pg_data_t *pgdat,
+                               struct mem_cgroup *memcg,
+                               bool congested)
+{
+       struct mem_cgroup_per_node *mn;
+
+       if (!memcg)
+               return;
+
+       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+       WRITE_ONCE(mn->congested, congested);
+}
+
+static bool memcg_congested(pg_data_t *pgdat,
+                       struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *mn;
+
+       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+       return READ_ONCE(mn->congested);
+}
 #else
 static bool global_reclaim(struct scan_control *sc)
 {
@@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc)
 {
        return true;
 }
+
+static inline void set_memcg_congestion(struct pglist_data *pgdat,
+                               struct mem_cgroup *memcg, bool congested)
+{
+}
+
+static inline bool memcg_congested(struct pglist_data *pgdat,
+                       struct mem_cgroup *memcg)
+{
+       return false;
+}
 #endif
 
 /*
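set_memcg_congestion()/memcg_congested() publish a per-memcg, per-node congestion bit through WRITE_ONCE()/READ_ONCE(), so the writer and concurrent readers need no lock. A userspace sketch of the same publish/read idea, using C11 relaxed atomics as a rough stand-in for the kernel macros (the memcg_node_info type here is hypothetical):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical analogue of mem_cgroup_per_node's congested field. */
struct memcg_node_info {
	atomic_bool congested;	/* the kernel uses a plain bool + ONCE macros */
};

static void set_congestion(struct memcg_node_info *mn, bool congested)
{
	/* WRITE_ONCE(): a store the compiler may not tear, fuse or elide. */
	atomic_store_explicit(&mn->congested, congested, memory_order_relaxed);
}

static bool is_congested(struct memcg_node_info *mn)
{
	/* READ_ONCE(): a load that is actually performed on every call. */
	return atomic_load_explicit(&mn->congested, memory_order_relaxed);
}

int main(void)
{
	struct memcg_node_info mn = { .congested = false };

	set_congestion(&mn, true);	/* reclaim marks the memcg/node pair */
	printf("congested: %d\n", is_congested(&mn));
	set_congestion(&mn, false);	/* cleared once reclaim completes */
	printf("congested: %d\n", is_congested(&mn));
	return 0;
}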
@@ -442,16 +487,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
        if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
                return 0;
 
-       if (!down_read_trylock(&shrinker_rwsem)) {
-               /*
-                * If we would return 0, our callers would understand that we
-                * have nothing else to shrink and give up trying. By returning
-                * 1 we keep it going and assume we'll be able to shrink next
-                * time.
-                */
-               freed = 1;
+       if (!down_read_trylock(&shrinker_rwsem))
                goto out;
-       }
 
        list_for_each_entry(shrinker, &shrinker_list, list) {
                struct shrink_control sc = {
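The comment removed here documented the convention that shrink_slab() must report nonzero progress when it cannot take shrinker_rwsem, otherwise callers would conclude there is nothing left to shrink; where the new code preserves that fallback (presumably near the out: label) is not visible in this hunk. A hedged userspace sketch of the trylock-or-report-progress idiom, with illustrative names and values:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t shrinker_lock = PTHREAD_RWLOCK_INITIALIZER;

/*
 * If the read lock cannot be taken, claim progress so callers retry later
 * instead of assuming the caches are empty.  Illustrative only; the real
 * shrink_slab() walks the registered shrinkers under the lock.
 */
static unsigned long shrink_caches(void)
{
	unsigned long freed = 0;

	if (pthread_rwlock_tryrdlock(&shrinker_lock) != 0) {
		/* Lock contended: pretend one object was freed. */
		freed = 1;
		goto out;
	}

	freed += 42;			/* pretend the shrinkers freed pages */
	pthread_rwlock_unlock(&shrinker_lock);
out:
	return freed;
}

int main(void)
{
	printf("freed: %lu\n", shrink_caches());
	return 0;
}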
@@ -656,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       spin_lock_irqsave(&mapping->tree_lock, flags);
+       xa_lock_irqsave(&mapping->i_pages, flags);
        /*
         * The non racy check for a busy page.
         *
@@ -680,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         * load is not satisfied before that of page->_refcount.
         *
         * Note that if SetPageDirty is always performed via set_page_dirty,
-        * and thus under tree_lock, then this ordering is not required.
+        * and thus under the i_pages lock, then this ordering is not required.
         */
        if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
                refcount = 1 + HPAGE_PMD_NR;
@@ -698,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                swp_entry_t swap = { .val = page_private(page) };
                mem_cgroup_swapout(page, swap);
                __delete_from_swap_cache(page);
-               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               xa_unlock_irqrestore(&mapping->i_pages, flags);
                put_swap_page(page, swap);
        } else {
                void (*freepage)(struct page *);
@@ -719,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 * only page cache pages found in these are zero pages
                 * covering holes, and because we don't want to mix DAX
                 * exceptional entries and shadow exceptional entries in the
-                * same page_tree.
+                * same address_space.
                 */
                if (reclaimed && page_is_file_cache(page) &&
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(mapping, page);
                __delete_from_page_cache(page, shadow);
-               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               xa_unlock_irqrestore(&mapping->i_pages, flags);
 
                if (freepage != NULL)
                        freepage(page);
@@ -734,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        return 1;
 
 cannot_free:
-       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       xa_unlock_irqrestore(&mapping->i_pages, flags);
        return 0;
 }
 
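These hunks are the mechanical side of the radix-tree-to-XArray conversion: mapping->tree_lock goes away and the lock embedded in mapping->i_pages is taken through xa_lock_irqsave()/xa_unlock_irqrestore() instead. A small userspace model of the "lock travels with the container" idea, built on a plain mutex rather than the kernel's XArray (all names here are stand-ins):

#include <pthread.h>
#include <stdio.h>
#include <string.h>

/* Userspace model: the lock lives inside the container it protects. */
struct xarray_like {
	pthread_mutex_t lock;
	void *slots[64];
};

struct address_space_like {
	struct xarray_like i_pages;	/* kernel: struct xarray i_pages */
};

/* Rough stand-ins for xa_lock_irqsave()/xa_unlock_irqrestore(). */
#define xa_like_lock(xa)	pthread_mutex_lock(&(xa)->lock)
#define xa_like_unlock(xa)	pthread_mutex_unlock(&(xa)->lock)

static void remove_slot(struct address_space_like *mapping, int index)
{
	xa_like_lock(&mapping->i_pages);
	mapping->i_pages.slots[index] = NULL;	/* cf. __delete_from_page_cache() */
	xa_like_unlock(&mapping->i_pages);
}

int main(void)
{
	struct address_space_like m;

	pthread_mutex_init(&m.i_pages.lock, NULL);
	memset(m.i_pages.slots, 0, sizeof(m.i_pages.slots));

	m.i_pages.slots[0] = &m;	/* "insert" a page */
	remove_slot(&m, 0);		/* "evict" it under the embedded lock */
	printf("slot 0 after removal: %p\n", m.i_pages.slots[0]);
	return 0;
}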
@@ -865,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page,
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
-struct reclaim_stat {
-       unsigned nr_dirty;
-       unsigned nr_unqueued_dirty;
-       unsigned nr_congested;
-       unsigned nr_writeback;
-       unsigned nr_immediate;
-       unsigned nr_activate;
-       unsigned nr_ref_keep;
-       unsigned nr_unmap_fail;
-};
-
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -934,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                /*
-                * The number of dirty pages determines if a zone is marked
+                * The number of dirty pages determines if a node is marked
                 * reclaim_congested which affects wait_iff_congested. kswapd
                 * will stall and start writing pages if the tail of the LRU
                 * is all dirty unqueued pages.
@@ -1762,23 +1788,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        mem_cgroup_uncharge_list(&page_list);
        free_unref_page_list(&page_list);
 
-       /*
-        * If reclaim is isolating dirty pages under writeback, it implies
-        * that the long-lived page allocation rate is exceeding the page
-        * laundering rate. Either the global limits are not being effective
-        * at throttling processes due to the page distribution throughout
-        * zones or there is heavy usage of a slow backing device. The
-        * only option is to throttle from reclaim context which is not ideal
-        * as there is no guarantee the dirtying process is throttled in the
-        * same way balance_dirty_pages() manages.
-        *
-        * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
-        * of pages under pages flagged for immediate reclaim and stall if any
-        * are encountered in the nr_immediate check below.
-        */
-       if (stat.nr_writeback && stat.nr_writeback == nr_taken)
-               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
-
        /*
         * If dirty pages are scanned that are not queued for IO, it
         * implies that flushers are not doing their job. This can
@@ -1793,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
        if (stat.nr_unqueued_dirty == nr_taken)
                wakeup_flusher_threads(WB_REASON_VMSCAN);
 
-       /*
-        * Legacy memcg will stall in page writeback so avoid forcibly
-        * stalling here.
-        */
-       if (sane_reclaim(sc)) {
-               /*
-                * Tag a zone as congested if all the dirty pages scanned were
-                * backed by a congested BDI and wait_iff_congested will stall.
-                */
-               if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
-                       set_bit(PGDAT_CONGESTED, &pgdat->flags);
-
-               /* Allow kswapd to start writing pages during reclaim. */
-               if (stat.nr_unqueued_dirty == nr_taken)
-                       set_bit(PGDAT_DIRTY, &pgdat->flags);
-
-               /*
-                * If kswapd scans pages marked marked for immediate
-                * reclaim and under writeback (nr_immediate), it implies
-                * that pages are cycling through the LRU faster than
-                * they are written so also forcibly stall.
-                */
-               if (stat.nr_immediate && current_may_throttle())
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
-       }
-
-       /*
-        * Stall direct reclaim for IO completions if underlying BDIs or zone
-        * is congested. Allow kswapd to continue until it starts encountering
-        * unqueued dirty pages or cycling through the LRU too quickly.
-        */
-       if (!sc->hibernation_mode && !current_is_kswapd() &&
-           current_may_throttle())
-               wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+       sc->nr.dirty += stat.nr_dirty;
+       sc->nr.congested += stat.nr_congested;
+       sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+       sc->nr.writeback += stat.nr_writeback;
+       sc->nr.immediate += stat.nr_immediate;
+       sc->nr.taken += nr_taken;
+       if (file)
+               sc->nr.file_taken += nr_taken;
 
        trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-                       nr_scanned, nr_reclaimed,
-                       stat.nr_dirty,  stat.nr_writeback,
-                       stat.nr_congested, stat.nr_immediate,
-                       stat.nr_activate, stat.nr_ref_keep,
-                       stat.nr_unmap_fail,
-                       sc->priority, file);
+                       nr_scanned, nr_reclaimed, &stat, sc->priority, file);
        return nr_reclaimed;
 }
 
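With struct reclaim_stat removed from this file (presumably shared from a header elsewhere in the series, which this diff does not show), the tracepoint now takes a pointer to the whole struct instead of seven scalar arguments, so new counters can be added without touching every call site. An illustrative, compile-and-run stand-in for that calling convention (not the kernel's trace API):

#include <stdio.h>

/* Mirror of the reclaim_stat fields dropped above (layout is illustrative). */
struct reclaim_stat {
	unsigned nr_dirty, nr_unqueued_dirty, nr_congested, nr_writeback;
	unsigned nr_immediate, nr_activate, nr_ref_keep, nr_unmap_fail;
};

/*
 * One struct pointer instead of a long scalar argument list; stand-in for
 * trace_mm_vmscan_lru_shrink_inactive().
 */
static void trace_lru_shrink_inactive(int nid, unsigned long nr_scanned,
				      unsigned long nr_reclaimed,
				      const struct reclaim_stat *stat,
				      int priority, int file)
{
	printf("nid=%d scanned=%lu reclaimed=%lu dirty=%u writeback=%u prio=%d file=%d\n",
	       nid, nr_scanned, nr_reclaimed, stat->nr_dirty,
	       stat->nr_writeback, priority, file);
}

int main(void)
{
	struct reclaim_stat stat = { .nr_dirty = 3, .nr_writeback = 1 };

	trace_lru_shrink_inactive(0, 32, 12, &stat, 12, 1);
	return 0;
}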
@@ -2515,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
        return true;
 }
 
+static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+{
+       return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
+               (memcg && memcg_congested(pgdat, memcg));
+}
+
 static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
 {
        struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2530,6 +2514,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                unsigned long node_lru_pages = 0;
                struct mem_cgroup *memcg;
 
+               memset(&sc->nr, 0, sizeof(sc->nr));
+
                nr_reclaimed = sc->nr_reclaimed;
                nr_scanned = sc->nr_scanned;
 
@@ -2544,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                                        sc->memcg_low_skipped = 1;
                                        continue;
                                }
-                               mem_cgroup_event(memcg, MEMCG_LOW);
+                               memcg_memory_event(memcg, MEMCG_LOW);
                        }
 
                        reclaimed = sc->nr_reclaimed;
@@ -2595,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                if (sc->nr_reclaimed - nr_reclaimed)
                        reclaimable = true;
 
+               if (current_is_kswapd()) {
+                       /*
+                        * If reclaim is isolating dirty pages under writeback,
+                        * it implies that the long-lived page allocation rate
+                        * is exceeding the page laundering rate. Either the
+                        * global limits are not being effective at throttling
+                        * processes due to the page distribution throughout
+                        * zones or there is heavy usage of a slow backing
+                        * device. The only option is to throttle from reclaim
+                        * context which is not ideal as there is no guarantee
+                        * the dirtying process is throttled in the same way
+                        * balance_dirty_pages() manages.
+                        *
+                        * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+                        * count the number of pages under writeback that are
+                        * flagged for immediate reclaim and stall if any are
+                        * encountered in the nr_immediate check below.
+                        */
+                       if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+                               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+                       /*
+                        * Tag a node as congested if all the dirty pages
+                        * scanned were backed by a congested BDI and
+                        * wait_iff_congested will stall.
+                        */
+                       if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+                               set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+                       /* Allow kswapd to start writing pages during reclaim. */
+                       if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+                               set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+                       /*
+                        * If kswapd scans pages marked for immediate
+                        * reclaim and under writeback (nr_immediate), it
+                        * implies that pages are cycling through the LRU
+                        * faster than they are written so also forcibly stall.
+                        */
+                       if (sc->nr.immediate)
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+               }
+
+               /*
+                * Legacy memcg will stall in page writeback so avoid forcibly
+                * stalling in wait_iff_congested().
+                */
+               if (!global_reclaim(sc) && sane_reclaim(sc) &&
+                   sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+                       set_memcg_congestion(pgdat, root, true);
+
+               /*
+                * Stall direct reclaim for IO completions if underlying BDIs
+                * and the node are congested. Allow kswapd to continue until it
+                * starts encountering unqueued dirty pages or cycling through
+                * the LRU too quickly.
+                */
+               if (!sc->hibernation_mode && !current_is_kswapd() &&
+                  current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+                       wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
        } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                         sc->nr_scanned - nr_scanned, sc));
 
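For quick reference, the kswapd checks added above reduce to three node-wide predicates over sc->nr. A compile-and-run restatement of just that logic (types and function names are illustrative, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative copy of the sc->nr totals used by the checks above. */
struct nr_stats {
	unsigned dirty, congested, unqueued_dirty, writeback;
	unsigned immediate, taken, file_taken;
};

/* Everything kswapd isolated was already under writeback. */
static bool node_writeback_stall(const struct nr_stats *nr)
{
	return nr->writeback && nr->writeback == nr->taken;
}

/* Every dirty page scanned was backed by a congested BDI. */
static bool node_congested(const struct nr_stats *nr)
{
	return nr->dirty && nr->dirty == nr->congested;
}

/* All isolated file pages were dirty but not yet queued for IO. */
static bool flushers_behind(const struct nr_stats *nr)
{
	return nr->unqueued_dirty == nr->file_taken;
}

int main(void)
{
	struct nr_stats nr = {
		.dirty = 8, .congested = 8, .unqueued_dirty = 0,
		.writeback = 0, .taken = 32, .file_taken = 32,
	};

	printf("writeback stall: %d, congested: %d, flushers behind: %d\n",
	       node_writeback_stall(&nr), node_congested(&nr),
	       flushers_behind(&nr));
	return 0;
}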
@@ -2810,6 +2857,7 @@ retry:
                        continue;
                last_pgdat = zone->zone_pgdat;
                snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+               set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
        }
 
        delayacct_freepages_end();
@@ -3547,16 +3595,21 @@ kswapd_try_sleep:
 }
 
 /*
- * A zone is low on free memory, so wake its kswapd task to service it.
+ * A zone is low on free memory or too fragmented for high-order memory.  If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
  */
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+                  enum zone_type classzone_idx)
 {
        pg_data_t *pgdat;
 
        if (!managed_zone(zone))
                return;
 
-       if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+       if (!cpuset_zone_allowed(zone, gfp_flags))
                return;
        pgdat = zone->zone_pgdat;
        pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3565,14 +3618,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;
 
-       /* Hopeless node, leave it to direct reclaim */
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return;
-
-       if (pgdat_balanced(pgdat, order, classzone_idx))
+       /* Hopeless node, leave it to direct reclaim if possible */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+           pgdat_balanced(pgdat, order, classzone_idx)) {
+               /*
+                * There may be plenty of free memory available, but it's too
+                * fragmented for high-order allocations.  Wake up kcompactd
+                * and rely on compaction_suitable() to determine if it's
+                * needed.  If it fails, it will defer subsequent attempts to
+                * ratelimit its work.
+                */
+               if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+                       wakeup_kcompactd(pgdat, order, classzone_idx);
                return;
+       }
 
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+                                     gfp_flags);
        wake_up_interruptible(&pgdat->kswapd_wait);
 }
 
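wakeup_kswapd() now receives the allocation's gfp mask, both for the cpuset check and to decide whether a caller that cannot reclaim directly should at least get kcompactd woken on a hopeless or already balanced node. A toy userspace model of that decision flow (flag bit values and helper names are made up for illustration):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative flag bits only; the real gfp bits live in gfp.h. */
#define GFP_DIRECT_RECLAIM	(1u << 0)
#define GFP_KSWAPD_RECLAIM	(1u << 1)

/* Toy model of the decision flow in wakeup_kswapd() above. */
static void wakeup_kswapd_model(unsigned gfp_flags, bool node_hopeless,
				bool node_balanced)
{
	if (node_hopeless || node_balanced) {
		/*
		 * Nothing useful for kswapd to reclaim.  Callers that cannot
		 * reclaim directly still get kcompactd poked, since the
		 * shortage may be fragmentation rather than lack of memory.
		 */
		if (!(gfp_flags & GFP_DIRECT_RECLAIM))
			printf("wake kcompactd only\n");
		return;
	}
	printf("wake kswapd\n");
}

int main(void)
{
	wakeup_kswapd_model(GFP_KSWAPD_RECLAIM, true, false);	/* atomic alloc, hopeless node */
	wakeup_kswapd_model(GFP_DIRECT_RECLAIM, false, false);	/* normal alloc, unbalanced node */
	return 0;
}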
@@ -3802,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
 
        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                /*
-                * Free memory by calling shrink zone with increasing
+                * Free memory by calling shrink node with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
@@ -3877,7 +3939,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
  */
 int page_evictable(struct page *page)
 {
-       return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       int ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       rcu_read_unlock();
+       return ret;
 }
 
 #ifdef CONFIG_SHMEM
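page_evictable() now wraps the mapping lookup and the test in rcu_read_lock()/rcu_read_unlock(), so the address_space backing an inode or the swap cache cannot be freed between page_mapping() and mapping_unevictable(). A userspace sketch of that lifetime guarantee, using a rwlock purely as a stand-in for the RCU read side (real RCU readers are lockless):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/*
 * A rwlock stands in for the RCU read side: the lookup and the check happen
 * in one read-side critical section, so a concurrent "free" (the write side)
 * cannot pull the mapping away mid-check.  Only the lifetime guarantee is
 * modelled here, not RCU's performance characteristics.
 */
static pthread_rwlock_t lifetime_lock = PTHREAD_RWLOCK_INITIALIZER;

struct mapping { bool unevictable; };
static struct mapping *the_mapping;	/* may be torn down by another thread */

static bool object_evictable(void)
{
	bool ret;

	pthread_rwlock_rdlock(&lifetime_lock);		/* rcu_read_lock() */
	ret = the_mapping && !the_mapping->unevictable;
	pthread_rwlock_unlock(&lifetime_lock);		/* rcu_read_unlock() */
	return ret;
}

int main(void)
{
	struct mapping m = { .unevictable = false };

	the_mapping = &m;
	printf("evictable: %d\n", object_evictable());
	return 0;
}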