Merge tag 'mips_fixes_4.17_1' of git://git.kernel.org/pub/scm/linux/kernel/git/jhogan...

[linux-2.6-microblaze.git] / mm / vmscan.c
diff --git a/mm/vmscan.c b/mm/vmscan.c

index cd5dc3f..8b920ce 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -116,6 +116,16 @@ struct scan_control {
  
         /* Number of pages freed so far during a call to shrink_zones() */
         unsigned long nr_reclaimed;
+
+       struct {
+               unsigned int dirty;
+               unsigned int unqueued_dirty;
+               unsigned int congested;
+               unsigned int writeback;
+               unsigned int immediate;
+               unsigned int file_taken;
+               unsigned int taken;
+       } nr;
  };
  
  #ifdef ARCH_HAS_PREFETCH
@@ -190,6 +200,29 @@ static bool sane_reclaim(struct scan_control *sc)
  #endif
         return false;
  }
+
+static void set_memcg_congestion(pg_data_t *pgdat,
+                               struct mem_cgroup *memcg,
+                               bool congested)
+{
+       struct mem_cgroup_per_node *mn;
+
+       if (!memcg)
+               return;
+
+       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+       WRITE_ONCE(mn->congested, congested);
+}
+
+static bool memcg_congested(pg_data_t *pgdat,
+                       struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *mn;
+
+       mn = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
+       return READ_ONCE(mn->congested);
+
+}
  #else
  static bool global_reclaim(struct scan_control *sc)
  {
@@ -200,6 +233,18 @@ static bool sane_reclaim(struct scan_control *sc)
  {
         return true;
  }
+
+static inline void set_memcg_congestion(struct pglist_data *pgdat,
+                               struct mem_cgroup *memcg, bool congested)
+{
+}
+
+static inline bool memcg_congested(struct pglist_data *pgdat,
+                       struct mem_cgroup *memcg)
+{
+       return false;
+
+}
  #endif
  
  /*
@@ -442,16 +487,8 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
         if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg)))
                 return 0;
  
-       if (!down_read_trylock(&shrinker_rwsem)) {
-               /*
-                * If we would return 0, our callers would understand that we
-                * have nothing else to shrink and give up trying. By returning
-                * 1 we keep it going and assume we'll be able to shrink next
-                * time.
-                */
-               freed = 1;
+       if (!down_read_trylock(&shrinker_rwsem))
                 goto out;
-       }
  
         list_for_each_entry(shrinker, &shrinker_list, list) {
                 struct shrink_control sc = {
@@ -656,7 +693,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         BUG_ON(!PageLocked(page));
         BUG_ON(mapping != page_mapping(page));
  
-       spin_lock_irqsave(&mapping->tree_lock, flags);
+       xa_lock_irqsave(&mapping->i_pages, flags);
         /*
          * The non racy check for a busy page.
          *
@@ -680,7 +717,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
          * load is not satisfied before that of page->_refcount.
          *
          * Note that if SetPageDirty is always performed via set_page_dirty,
-        * and thus under tree_lock, then this ordering is not required.
+        * and thus under the i_pages lock, then this ordering is not required.
          */
         if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
                 refcount = 1 + HPAGE_PMD_NR;
@@ -698,7 +735,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                 swp_entry_t swap = { .val = page_private(page) };
                 mem_cgroup_swapout(page, swap);
                 __delete_from_swap_cache(page);
-               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               xa_unlock_irqrestore(&mapping->i_pages, flags);
                 put_swap_page(page, swap);
         } else {
                 void (*freepage)(struct page *);
@@ -719,13 +756,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                  * only page cache pages found in these are zero pages
                  * covering holes, and because we don't want to mix DAX
                  * exceptional entries and shadow exceptional entries in the
-                * same page_tree.
+                * same address_space.
                  */
                 if (reclaimed && page_is_file_cache(page) &&
                     !mapping_exiting(mapping) && !dax_mapping(mapping))
                         shadow = workingset_eviction(mapping, page);
                 __delete_from_page_cache(page, shadow);
-               spin_unlock_irqrestore(&mapping->tree_lock, flags);
+               xa_unlock_irqrestore(&mapping->i_pages, flags);
  
                 if (freepage != NULL)
                         freepage(page);
@@ -734,7 +771,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
         return 1;
  
  cannot_free:
-       spin_unlock_irqrestore(&mapping->tree_lock, flags);
+       xa_unlock_irqrestore(&mapping->i_pages, flags);
         return 0;
  }
  
@@ -865,17 +902,6 @@ static void page_check_dirty_writeback(struct page *page,
                 mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
  }
  
-struct reclaim_stat {
-       unsigned nr_dirty;
-       unsigned nr_unqueued_dirty;
-       unsigned nr_congested;
-       unsigned nr_writeback;
-       unsigned nr_immediate;
-       unsigned nr_activate;
-       unsigned nr_ref_keep;
-       unsigned nr_unmap_fail;
-};
-
  /*
   * shrink_page_list() returns the number of reclaimed pages
   */
@@ -934,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
  
                 /*
-                * The number of dirty pages determines if a zone is marked
+                * The number of dirty pages determines if a node is marked
                  * reclaim_congested which affects wait_iff_congested. kswapd
                  * will stall and start writing pages if the tail of the LRU
                  * is all dirty unqueued pages.
@@ -1762,23 +1788,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         mem_cgroup_uncharge_list(&page_list);
         free_unref_page_list(&page_list);
  
-       /*
-        * If reclaim is isolating dirty pages under writeback, it implies
-        * that the long-lived page allocation rate is exceeding the page
-        * laundering rate. Either the global limits are not being effective
-        * at throttling processes due to the page distribution throughout
-        * zones or there is heavy usage of a slow backing device. The
-        * only option is to throttle from reclaim context which is not ideal
-        * as there is no guarantee the dirtying process is throttled in the
-        * same way balance_dirty_pages() manages.
-        *
-        * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
-        * of pages under pages flagged for immediate reclaim and stall if any
-        * are encountered in the nr_immediate check below.
-        */
-       if (stat.nr_writeback && stat.nr_writeback == nr_taken)
-               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
-
         /*
          * If dirty pages are scanned that are not queued for IO, it
          * implies that flushers are not doing their job. This can
@@ -1793,48 +1802,17 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
         if (stat.nr_unqueued_dirty == nr_taken)
                 wakeup_flusher_threads(WB_REASON_VMSCAN);
  
-       /*
-        * Legacy memcg will stall in page writeback so avoid forcibly
-        * stalling here.
-        */
-       if (sane_reclaim(sc)) {
-               /*
-                * Tag a zone as congested if all the dirty pages scanned were
-                * backed by a congested BDI and wait_iff_congested will stall.
-                */
-               if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested)
-                       set_bit(PGDAT_CONGESTED, &pgdat->flags);
-
-               /* Allow kswapd to start writing pages during reclaim. */
-               if (stat.nr_unqueued_dirty == nr_taken)
-                       set_bit(PGDAT_DIRTY, &pgdat->flags);
-
-               /*
-                * If kswapd scans pages marked marked for immediate
-                * reclaim and under writeback (nr_immediate), it implies
-                * that pages are cycling through the LRU faster than
-                * they are written so also forcibly stall.
-                */
-               if (stat.nr_immediate && current_may_throttle())
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
-       }
-
-       /*
-        * Stall direct reclaim for IO completions if underlying BDIs or zone
-        * is congested. Allow kswapd to continue until it starts encountering
-        * unqueued dirty pages or cycling through the LRU too quickly.
-        */
-       if (!sc->hibernation_mode && !current_is_kswapd() &&
-           current_may_throttle())
-               wait_iff_congested(pgdat, BLK_RW_ASYNC, HZ/10);
+       sc->nr.dirty += stat.nr_dirty;
+       sc->nr.congested += stat.nr_congested;
+       sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
+       sc->nr.writeback += stat.nr_writeback;
+       sc->nr.immediate += stat.nr_immediate;
+       sc->nr.taken += nr_taken;
+       if (file)
+               sc->nr.file_taken += nr_taken;
  
         trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
-                       nr_scanned, nr_reclaimed,
-                       stat.nr_dirty,  stat.nr_writeback,
-                       stat.nr_congested, stat.nr_immediate,
-                       stat.nr_activate, stat.nr_ref_keep,
-                       stat.nr_unmap_fail,
-                       sc->priority, file);
+                       nr_scanned, nr_reclaimed, &stat, sc->priority, file);
         return nr_reclaimed;
  }
  
@@ -2515,6 +2493,12 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         return true;
  }
  
+static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
+{
+       return test_bit(PGDAT_CONGESTED, &pgdat->flags) ||
+               (memcg && memcg_congested(pgdat, memcg));
+}
+
  static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
  {
         struct reclaim_state *reclaim_state = current->reclaim_state;
@@ -2530,6 +2514,8 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                 unsigned long node_lru_pages = 0;
                 struct mem_cgroup *memcg;
  
+               memset(&sc->nr, 0, sizeof(sc->nr));
+
                 nr_reclaimed = sc->nr_reclaimed;
                 nr_scanned = sc->nr_scanned;
  
@@ -2544,7 +2530,7 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                                         sc->memcg_low_skipped = 1;
                                         continue;
                                 }
-                               mem_cgroup_event(memcg, MEMCG_LOW);
+                               memcg_memory_event(memcg, MEMCG_LOW);
                         }
  
                         reclaimed = sc->nr_reclaimed;
@@ -2595,6 +2581,67 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
                 if (sc->nr_reclaimed - nr_reclaimed)
                         reclaimable = true;
  
+               if (current_is_kswapd()) {
+                       /*
+                        * If reclaim is isolating dirty pages under writeback,
+                        * it implies that the long-lived page allocation rate
+                        * is exceeding the page laundering rate. Either the
+                        * global limits are not being effective at throttling
+                        * processes due to the page distribution throughout
+                        * zones or there is heavy usage of a slow backing
+                        * device. The only option is to throttle from reclaim
+                        * context which is not ideal as there is no guarantee
+                        * the dirtying process is throttled in the same way
+                        * balance_dirty_pages() manages.
+                        *
+                        * Once a node is flagged PGDAT_WRITEBACK, kswapd will
+                        * count the number of pages under writeback that are
+                        * flagged for immediate reclaim and stall if any are
+                        * encountered in the nr_immediate check below.
+                        */
+                       if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
+                               set_bit(PGDAT_WRITEBACK, &pgdat->flags);
+
+                       /*
+                        * Tag a node as congested if all the dirty pages
+                        * scanned were backed by a congested BDI and
+                        * wait_iff_congested will stall.
+                        */
+                       if (sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+                               set_bit(PGDAT_CONGESTED, &pgdat->flags);
+
+                       /* Allow kswapd to start writing pages during reclaim.*/
+                       if (sc->nr.unqueued_dirty == sc->nr.file_taken)
+                               set_bit(PGDAT_DIRTY, &pgdat->flags);
+
+                       /*
+                        * If kswapd scans pages marked for immediate
+                        * reclaim and under writeback (nr_immediate), it
+                        * implies that pages are cycling through the LRU
+                        * faster than they are written so also forcibly stall.
+                        */
+                       if (sc->nr.immediate)
+                               congestion_wait(BLK_RW_ASYNC, HZ/10);
+               }
+
+               /*
+                * Legacy memcg will stall in page writeback so avoid forcibly
+                * stalling in wait_iff_congested().
+                */
+               if (!global_reclaim(sc) && sane_reclaim(sc) &&
+                   sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
+                       set_memcg_congestion(pgdat, root, true);
+
+               /*
+                * Stall direct reclaim for IO completions if underlying BDIs
+                * and node are congested. Allow kswapd to continue until it
+                * starts encountering unqueued dirty pages or cycling through
+                * the LRU too quickly.
+                */
+               if (!sc->hibernation_mode && !current_is_kswapd() &&
+                  current_may_throttle() && pgdat_memcg_congested(pgdat, root))
+                       wait_iff_congested(BLK_RW_ASYNC, HZ/10);
+
         } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
                                          sc->nr_scanned - nr_scanned, sc));
  
@@ -2810,6 +2857,7 @@ retry:
                         continue;
                 last_pgdat = zone->zone_pgdat;
                 snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
+               set_memcg_congestion(last_pgdat, sc->target_mem_cgroup, false);
         }
  
         delayacct_freepages_end();
@@ -3547,16 +3595,21 @@ kswapd_try_sleep:
  }
  
  /*
- * A zone is low on free memory, so wake its kswapd task to service it.
+ * A zone is low on free memory or too fragmented for high-order memory.  If
+ * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
+ * pgdat.  It will wake up kcompactd after reclaiming memory.  If kswapd reclaim
+ * has failed or is not needed, still wake up kcompactd if only compaction is
+ * needed.
   */
-void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
+void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
+                  enum zone_type classzone_idx)
  {
         pg_data_t *pgdat;
  
         if (!managed_zone(zone))
                 return;
  
-       if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
+       if (!cpuset_zone_allowed(zone, gfp_flags))
                 return;
         pgdat = zone->zone_pgdat;
         pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
@@ -3565,14 +3618,23 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
  
-       /* Hopeless node, leave it to direct reclaim */
-       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
-               return;
-
-       if (pgdat_balanced(pgdat, order, classzone_idx))
+       /* Hopeless node, leave it to direct reclaim if possible */
+       if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
+           pgdat_balanced(pgdat, order, classzone_idx)) {
+               /*
+                * There may be plenty of free memory available, but it's too
+                * fragmented for high-order allocations.  Wake up kcompactd
+                * and rely on compaction_suitable() to determine if it's
+                * needed.  If it fails, it will defer subsequent attempts to
+                * ratelimit its work.
+                */
+               if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
+                       wakeup_kcompactd(pgdat, order, classzone_idx);
                 return;
+       }
  
-       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order);
+       trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+                                     gfp_flags);
         wake_up_interruptible(&pgdat->kswapd_wait);
  }
  
@@ -3802,7 +3864,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in
  
         if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
                 /*
-                * Free memory by calling shrink zone with increasing
+                * Free memory by calling shrink node with increasing
                  * priorities until we have enough memory freed.
                  */
                 do {
@@ -3877,7 +3939,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
   */
  int page_evictable(struct page *page)
  {
-       return !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       int ret;
+
+       /* Prevent address_space of inode and swap cache from being freed */
+       rcu_read_lock();
+       ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
+       rcu_read_unlock();
+       return ret;
  }
  
  #ifdef CONFIG_SHMEM