Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 17c4b3f..74296c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -41,6 +41,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/memcontrol.h>
+#include <linux/migrate.h>
 #include <linux/delayacct.h>
 #include <linux/sysctl.h>
 #include <linux/oom.h>
@@ -121,6 +122,9 @@ struct scan_control {
        /* The file pages on the current node are dangerously low */
        unsigned int file_is_tiny:1;
 
+       /* Always discard instead of demoting to lower tier memory */
+       unsigned int no_demotion:1;
+
        /* Allocation order */
        s8 order;
 
@@ -518,6 +522,48 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
        return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
 }
 
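+/*
+ * Can pages from this node be demoted to the next lower memory tier
+ * instead of being discarded during reclaim?
+ */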
+static bool can_demote(int nid, struct scan_control *sc)
+{
+       if (!numa_demotion_enabled)
+               return false;
+       if (sc) {
+               if (sc->no_demotion)
+                       return false;
+               /* It is pointless to do demotion in memcg reclaim */
+               if (cgroup_reclaim(sc))
+                       return false;
+       }
+       if (next_demotion_node(nid) == NUMA_NO_NODE)
+               return false;
+
+       return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+                                         int nid,
+                                         struct scan_control *sc)
+{
+       if (memcg == NULL) {
+               /*
+                * For non-memcg reclaim, is there
+                * space in any swap device?
+                */
+               if (get_nr_swap_pages() > 0)
+                       return true;
+       } else {
+               /* Is the memcg below its swap limit? */
+               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+                       return true;
+       }
+
+       /*
+        * The page cannot be swapped.
+        *
+        * Can it be reclaimed from this node via demotion?
+        */
+       return can_demote(nid, sc);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -529,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 
        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 
@@ -893,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
        unsigned long freed;
+       int shift = 0;
 
        do {
                struct mem_cgroup *memcg = NULL;
@@ -905,7 +952,7 @@ void drop_slab_node(int nid)
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
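+
+               /*
+                * Keep iterating only while each pass frees enough slab
+                * objects; the bar doubles every pass, so the loop always
+                * terminates.
+                */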
-       } while (freed > 10);
+       } while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
@@ -1052,14 +1099,13 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 static int __remove_mapping(struct address_space *mapping, struct page *page,
                            bool reclaimed, struct mem_cgroup *target_memcg)
 {
-       unsigned long flags;
        int refcount;
        void *shadow = NULL;
 
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       xa_lock_irqsave(&mapping->i_pages, flags);
+       xa_lock_irq(&mapping->i_pages);
        /*
         * The non racy check for a busy page.
         *
@@ -1100,7 +1146,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                if (reclaimed && !mapping_exiting(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_swap_cache(page, swap, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
                put_swap_page(page, swap);
        } else {
                void (*freepage)(struct page *);
@@ -1126,7 +1172,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
                    !mapping_exiting(mapping) && !dax_mapping(mapping))
                        shadow = workingset_eviction(page, target_memcg);
                __delete_from_page_cache(page, shadow);
-               xa_unlock_irqrestore(&mapping->i_pages, flags);
+               xa_unlock_irq(&mapping->i_pages);
 
                if (freepage != NULL)
                        freepage(page);
@@ -1135,7 +1181,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
        return 1;
 
 cannot_free:
-       xa_unlock_irqrestore(&mapping->i_pages, flags);
+       xa_unlock_irq(&mapping->i_pages);
        return 0;
 }
 
@@ -1264,6 +1310,54 @@ static void page_check_dirty_writeback(struct page *page,
                mapping->a_ops->is_dirty_writeback(page, dirty, writeback);
 }
 
+static struct page *alloc_demote_page(struct page *page, unsigned long node)
+{
+       struct migration_target_control mtc = {
+               /*
+                * Allocate from 'node', or fail quickly and quietly.
+                * When this happens, 'page' will likely just be discarded
+                * instead of migrated.
+                */
+               .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) |
+                           __GFP_THISNODE  | __GFP_NOWARN |
+                           __GFP_NOMEMALLOC | GFP_NOWAIT,
+               .nid = node
+       };
+
+       return alloc_migration_target(page, (unsigned long)&mtc);
+}
+
+/*
+ * Take pages on @demote_pages and attempt to demote them to
+ * another node.  Pages which are not demoted are left on
+ * @demote_pages.
+ */
+static unsigned int demote_page_list(struct list_head *demote_pages,
+                                    struct pglist_data *pgdat)
+{
+       int target_nid = next_demotion_node(pgdat->node_id);
+       unsigned int nr_succeeded;
+       int err;
+
+       if (list_empty(demote_pages))
+               return 0;
+
+       if (target_nid == NUMA_NO_NODE)
+               return 0;
+
+       /* Demotion ignores all cpuset and mempolicy settings */
+       err = migrate_pages(demote_pages, alloc_demote_page, NULL,
+                           target_nid, MIGRATE_ASYNC, MR_DEMOTION,
+                           &nr_succeeded);
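+       /*
+        * Pages that could not be migrated remain on @demote_pages;
+        * the caller decides how to handle them.
+        */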
+
+       if (current_is_kswapd())
+               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+       else
+               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
+       return nr_succeeded;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -1275,12 +1369,16 @@ static unsigned int shrink_page_list(struct list_head *page_list,
 {
        LIST_HEAD(ret_pages);
        LIST_HEAD(free_pages);
+       LIST_HEAD(demote_pages);
        unsigned int nr_reclaimed = 0;
        unsigned int pgactivate = 0;
+       bool do_demote_pass;
 
        memset(stat, 0, sizeof(*stat));
        cond_resched();
+       do_demote_pass = can_demote(pgdat->node_id, sc);
 
+retry:
        while (!list_empty(page_list)) {
                struct address_space *mapping;
                struct page *page;
@@ -1429,6 +1527,17 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        ; /* try to reclaim the page below */
                }
 
+               /*
+                * Before reclaiming the page, try to relocate
+                * its contents to another node.
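+                * THPs are deferred only when the architecture supports THP
+                * migration; otherwise they fall through to the normal
+                * reclaim path below.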
+                */
+               if (do_demote_pass &&
+                   (thp_migration_supported() || !PageTransHuge(page))) {
+                       list_add(&page->lru, &demote_pages);
+                       unlock_page(page);
+                       continue;
+               }
+
                /*
                 * Anonymous process memory has backing store?
                 * Try to allocate it some swap space here.
@@ -1624,11 +1733,14 @@ static unsigned int shrink_page_list(struct list_head *page_list,
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
-                       if (PageDirty(page)) {
-                               page_ref_unfreeze(page, 1);
-                               goto keep_locked;
-                       }
-
+                       /*
+                        * The page has only one reference left, which is
+                        * from the isolation. After the caller puts the
+                        * page back on lru and drops the reference, the
+                        * page will be freed anyway. It doesn't matter
+                        * which lru it goes to. So we don't bother checking
+                        * PageDirty here.
+                        */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1680,6 +1792,17 @@ keep:
                list_add(&page->lru, &ret_pages);
                VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
        }
+       /* 'page_list' is always empty here */
+
+       /* Migrate pages selected for demotion */
+       nr_reclaimed += demote_page_list(&demote_pages, pgdat);
+       /* Pages that could not be demoted are still in @demote_pages */
+       if (!list_empty(&demote_pages)) {
+               /* Pages which weren't demoted go back on @page_list for retry: */
+               list_splice_init(&demote_pages, page_list);
+               do_demote_pass = false;
+               goto retry;
+       }
 
        pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
 
@@ -1698,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
@@ -2323,10 +2445,10 @@ unsigned long reclaim_pages(struct list_head *page_list)
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
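+               /* Discard pages instead of demoting them to lower tier memory: */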
+               .no_demotion = 1,
        };
 
        noreclaim_flag = memalloc_noreclaim_save();
@@ -2452,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
@@ -2462,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+       if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2592,7 +2715,7 @@ out:
                        cgroup_size = max(cgroup_size, protection);
 
                        scan = lruvec_size - lruvec_size * protection /
-                               cgroup_size;
+                               (cgroup_size + 1);
 
                        /*
                         * Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2645,6 +2768,21 @@ out:
        }
 }
 
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+                              struct scan_control *sc)
+{
+       /* Aging the anon LRU is valuable if swap is present: */
+       if (total_swap_pages > 0)
+               return true;
+
+       /* Also valuable if anon pages can be demoted: */
+       return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
@@ -2754,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+       if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+           inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2824,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
 
        return inactive_lru_pages > pages_for_compaction;
@@ -2898,6 +3037,12 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
        target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
 
 again:
+       /*
+        * Flush the memory cgroup stats, so that we read accurate per-memcg
+        * lruvec stats for heuristics.
+        */
+       mem_cgroup_flush_stats();
+
        memset(&sc->nr, 0, sizeof(sc->nr));
 
        nr_reclaimed = sc->nr_reclaimed;
@@ -3434,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * blocked waiting on the same lock. Instead, throttle for up to a
         * second before continuing.
         */
-       if (!(gfp_mask & __GFP_FS)) {
+       if (!(gfp_mask & __GFP_FS))
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
+       else
+               /* Throttle until kswapd wakes the process */
+               wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                       allow_direct_reclaim(pgdat));
 
-               goto check_pending;
-       }
-
-       /* Throttle until kswapd wakes the process */
-       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
-
-check_pending:
        if (fatal_signal_pending(current))
                return true;
 
@@ -3583,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       if (!total_swap_pages)
+       if (!can_age_anon_pages(pgdat, sc))
                return;
 
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -4290,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will be moved to proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
        pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
 
        if (pgdat->kswapd)
-               return 0;
+               return;
 
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
-       return ret;
 }
 
 /*