Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17c4b3f..74296c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
 #include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
        /* The file pages on the current node are dangerously low */
        unsigned int file_is_tiny:1;
 
+       /* Always discard instead of demoting to lower tier memory */
+       unsigned int no_demotion:1;
+
        /* Allocation order */
        s8 order;
 
@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
        return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 }
 
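+/*
+ * Can pages from this node be demoted to the next lower memory tier
+ * instead of being discarded during reclaim?
+ */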
+static bool can_demote(int nid, struct scan_control *sc)
+{
+       if (!numa_demotion_enabled)
+               return false;
+       if (sc) {
+               if (sc->no_demotion)
+                       return false;
+               /* It is pointless to do demotion in memcg reclaim */
+               if (cgroup_reclaim(sc))
+                       return false;
+       }
+       if (next_demotion_node(nid) == NUMA_NO_NODE)
+               return false;
+
+       return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+                                         int nid,
+                                         struct scan_control *sc)
+{
+       if (memcg == NULL) {
+               /*
+                * For non-memcg reclaim, is there
+                * space in any swap device?
+                */
+               if (get_nr_swap_pages() > 0)
+                       return true;
+       } else {
+               /* Is the memcg below its swap limit? */
+               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+                       return true;
+       }
+
+       /*
+        * The page cannot be swapped.
+        *
+        * Can it be reclaimed from this node via demotion?
+        */
+       return can_demote(nid, sc);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 
        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 
@@ -893,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
        unsigned long freed;
+       int shift = 0;
 
        do {
                struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
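+
+               /*
+                * Keep iterating only while each pass frees enough slab
+                * objects; the bar doubles every pass, so the loop always
+                * terminates.
+                */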
-       } while (freed > 10);
+       } while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed, struct mem_cgroup *target_memcg)
 {
-       unsigned long flags;
        int refcount;
        void *shadow = NULL;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        /*
         * The non racy check for a busy page.
         *
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_swap_cache(page, swap, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
                put_swap_page(page, swap);
        } else {
                void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
 
                if (freepage != NULL)
                        freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        return 1;
 
 cannot_free:
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
        return 0;
 }
 
@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+       struct migration_target_control mtc = {
+               /*
+                * Allocate from 'node', or fail quickly and quietly.
+                * When this happens, 'page' will likely just be discarded
+                * instead of migrated.
+                */
+               .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+                           __GFP_THISNODE  | __GFP_NOWARN |
+                           __GFP_NOMEMALLOC | GFP_NOWAIT,
+               .nid = node
+       };
+
+       return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_pages and attempt to demote them to
+ * another node.  Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+                                    struct pglist_data *pgdat)
+{
+       int target_nid = next_demotion_node(pgdat->node_id);
+       unsigned int nr_succeeded;
+       int err;
+
+       if (list_empty(demote_pages))
+               return 0;
+
+       if (target_nid == NUMA_NO_NODE)
+               return 0;
+
+       /* Demotion ignores all cpuset and mempolicy settings */
+       err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+                           target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+                           &nr_succeeded);
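+       /*
+        * Pages that could not be migrated remain on @demote_pages;
+        * the caller decides how to handle them.
+        */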
+
+       if (current_is_kswapd())
+               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+       else
+               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+       return nr_succeeded;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
+       LIST_HEAD(demote_pages);
        unsigned int nr_reclaimed = 0;
        unsigned int pgactivate = 0;
+       bool do_demote_pass;
 
        memset(stat, 0, sizeof(*stat));
        cond_resched();
+       do_demote_pass = can_demote(pgdat->node_id, sc);
 
+retry:
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
@@ -1429,6 +1527,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        ; /* try to reclaim the page below */
                }
 
+               /*
+                * Before reclaiming the page, try to relocate
+                * its contents to another node.
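+                * THPs are deferred only when the architecture supports THP
+                * migration; otherwise they fall through to the normal
+                * reclaim path below.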
+                */
+               if (do_demote_pass &&
+                   (thp_migration_supported() || !PageTransHuge(page))) {
+                       list_add(&page->lru, &demote_pages);
+                       unlock_page(page);
+                       continue;
+               }
+
                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
-                       if (PageDirty(page)) {
-                               page_ref_unfreeze(page, 1);
-                               goto keep_locked;
-                       }
-
+                       /*
+                        * The page has only one reference left, which is
+                        * from the isolation. After the caller puts the
+                        * page back on lru and drops the reference, the
+                        * page will be freed anyway. It doesn't matter
+                        * which lru it goes to. So we don't bother checking
+                        * PageDirty here.
+                        */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
+       /* 'page_list' is always empty here */
+
+       /* Migrate pages selected for demotion */
+       nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+       /* Pages that could not be demoted are still in @demote_pages */
+       if (!list_empty(&demote_pages)) {
+               /* Pages which weren't demoted go back on @page_list for retry: */
+               list_splice_init(&demote_pages, page_list);
+               do_demote_pass = false;
+               goto retry;
+       }
 
        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
 
@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
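+               /* Discard pages instead of demoting them to lower tier memory: */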
+               .no_demotion = 1,
        };
 
        noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+       if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2592,7 +2715,7 @@ out:
                        cgroup_size = max(cgroup_size, protection);
 
                        scan = lruvec_size - lruvec_size * protection /
-                               cgroup_size;
+                               (cgroup_size + 1);
 
                        /*
                         * Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2645,6 +2768,21 @@ out:
        }
 }
 
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+                              struct scan_control *sc)
+{
+       /* Aging the anon LRU is valuable if swap is present: */
+       if (total_swap_pages > 0)
+               return true;
+
+       /* Also valuable if anon pages can be demoted: */
+       return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+       if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+           inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
 
        return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 again:
+       /*
+        * Flush the memory cgroup stats, so that we read accurate per-memcg
+        * lruvec stats for heuristics.
+        */
+       mem_cgroup_flush_stats();
+
        memset(&sc->nr, 0, sizeof(sc->nr));
 
        nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * blocked waiting on the same lock. Instead, throttle for up to a
         * second before continuing.
         */
-       if (!(gfp_mask & __GFP_FS)) {
+       if (!(gfp_mask & __GFP_FS))
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
+       else
+               /* Throttle until kswapd wakes the process */
+               wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                       allow_direct_reclaim(pgdat));
 
-               goto check_pending;
-       }
-
-       /* Throttle until kswapd wakes the process */
-       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
-
-check_pending:
        if (fatal_signal_pending(current))
                return true;
 
@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       if (!total_swap_pages)
+       if (!can_age_anon_pages(pgdat, sc))
                return;
 
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
        pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
 
        if (pgdat->kswapd)
-               return 0;
+               return;
 
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
-       return ret;
 }
 
 /*