Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f26b247..74296c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -524,13 +524,44 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
 
 static bool can_demote(int nid, struct scan_control *sc)
 {
-       if (sc->no_demotion)
+       if (!numa_demotion_enabled)
                return false;
+       if (sc) {
+               if (sc->no_demotion)
+                       return false;
+               /* It is pointless to do demotion in memcg reclaim */
+               if (cgroup_reclaim(sc))
+                       return false;
+       }
        if (next_demotion_node(nid) == NUMA_NO_NODE)
                return false;
 
-       // FIXME: actually enable this later in the series
-       return false;
+       return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+                                         int nid,
+                                         struct scan_control *sc)
+{
+       if (memcg == NULL) {
+               /*
+                * For non-memcg reclaim, is there
+                * space in any swap device?
+                */
+               if (get_nr_swap_pages() > 0)
+                       return true;
+       } else {
+               /* Is the memcg below its swap limit? */
+               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+                       return true;
+       }
+
+       /*
+        * The page cannot be swapped.
+        *
+        * Can it be reclaimed from this node via demotion?
+        */
+       return can_demote(nid, sc);
 }
 
 /*
@@ -544,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 
        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 
@@ -908,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
        unsigned long freed;
+       int shift = 0;
 
        do {
                struct mem_cgroup *memcg = NULL;
@@ -920,7 +952,7 @@ void drop_slab_node(int nid)
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-       } while (freed > 10);
+       } while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
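
The reworked loop condition in drop_slab_node() trades the fixed "freed > 10" cutoff for a threshold that doubles each pass: the loop continues only if at least 2 objects were freed on the first pass, at least 4 on the second, 8 on the third, and so on, so a shrinker that keeps returning a small constant amount can no longer keep drop_caches spinning indefinitely. A standalone sketch of just the termination behavior, with shrink_slab() replaced by a made-up stub:

#include <stdio.h>

/* Made-up stub standing in for shrink_slab(): always "frees" 100 objects. */
static unsigned long fake_shrink_slab(void)
{
	return 100;
}

int main(void)
{
	unsigned long freed;
	int shift = 0, pass = 0;

	do {
		freed = fake_shrink_slab();
		printf("pass %d: freed=%lu, freed >> %d = %lu\n",
		       ++pass, freed, shift, freed >> shift);
	} while ((freed >> shift++) > 1);

	/* With a constant 100 freed per pass, 100 >> 6 == 1 ends the loop,
	 * so it stops after 7 passes instead of running forever. */
	printf("stopped after %d passes\n", pass);
	return 0;
}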
@@ -1318,6 +1350,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
                            target_nid, MIGRATE_ASYNC, MR_DEMOTION,
                            &nr_succeeded);
 
+       if (current_is_kswapd())
+               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+       else
+               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
        return nr_succeeded;
 }
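
With the hunk above, successful demotions are counted separately for kswapd and direct reclaim. Assuming the usual convention that vm_event_item entries show up in /proc/vmstat under their lower-cased names (pgdemote_kswapd and pgdemote_direct here), demotion activity can be watched from userspace with something as small as:

#include <stdio.h>
#include <string.h>

/*
 * Dump the demotion counters from /proc/vmstat.  The "pgdemote_*" field
 * names are an assumption based on the PGDEMOTE_KSWAPD/PGDEMOTE_DIRECT
 * event names; adjust if the running kernel spells them differently.
 */
int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/vmstat", "r");

	if (!fp) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), fp))
		if (!strncmp(line, "pgdemote_", 9))
			fputs(line, stdout);
	fclose(fp);
	return 0;
}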
 
@@ -1696,11 +1733,14 @@ retry:
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
-                       if (PageDirty(page)) {
-                               page_ref_unfreeze(page, 1);
-                               goto keep_locked;
-                       }
-
+                       /*
+                        * The page has only one reference left, which is
+                        * from the isolation. After the caller puts the
+                        * page back on lru and drops the reference, the
+                        * page will be freed anyway. It doesn't matter
+                        * which lru it goes on. So we don't bother checking
+                        * PageDirty here.
+                        */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1781,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
@@ -2406,7 +2445,6 @@ unsigned long reclaim_pages(struct list_head *page_list)
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
@@ -2536,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
@@ -2546,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+       if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2676,7 +2715,7 @@ out:
                        cgroup_size = max(cgroup_size, protection);
 
                        scan = lruvec_size - lruvec_size * protection /
-                               cgroup_size;
+                               (cgroup_size + 1);
 
                        /*
                         * Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2729,6 +2768,21 @@ out:
        }
 }
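
The divisor change to "cgroup_size + 1" a few lines up is a divide-by-zero guard: cgroup_size = max(cgroup_size, protection) still leaves both values at zero when an empty cgroup's protection is being overridden, and lruvec_size * protection / cgroup_size would then divide by zero. Adding 1 makes the degenerate case fall through to scanning everything while changing real results by at most about a page. A worked example of the proportional scan target, using made-up page counts in plain userspace arithmetic:

#include <stdio.h>

/*
 * Worked example of the proportional scan target from get_scan_count():
 * the closer usage (cgroup_size) is to the protected amount, the less
 * of the lruvec gets scanned.
 */
static unsigned long scan_target(unsigned long lruvec_size,
				 unsigned long protection,
				 unsigned long cgroup_size)
{
	if (cgroup_size < protection)
		cgroup_size = protection;       /* cgroup_size = max(...) */

	return lruvec_size - lruvec_size * protection / (cgroup_size + 1);
}

int main(void)
{
	/* 10000-page lruvec, 8000 pages protected, 16000 pages in use:
	 * about half of the lruvec stays scannable (prints 5001). */
	printf("%lu\n", scan_target(10000, 8000, 16000));

	/* Degenerate case: empty cgroup, protection overridden to 0.
	 * With a bare cgroup_size divisor this would be 0 / 0; with the
	 * +1 it simply means "scan the whole lruvec" (prints 10000). */
	printf("%lu\n", scan_target(10000, 0, 0));
	return 0;
}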
 
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+                              struct scan_control *sc)
+{
+       /* Aging the anon LRU is valuable if swap is present: */
+       if (total_swap_pages > 0)
+               return true;
+
+       /* Also valuable if anon pages can be demoted: */
+       return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
@@ -2838,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+       if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+           inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2908,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
 
        return inactive_lru_pages > pages_for_compaction;
@@ -3524,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * blocked waiting on the same lock. Instead, throttle for up to a
         * second before continuing.
         */
-       if (!(gfp_mask & __GFP_FS)) {
+       if (!(gfp_mask & __GFP_FS))
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
+       else
+               /* Throttle until kswapd wakes the process */
+               wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                       allow_direct_reclaim(pgdat));
 
-               goto check_pending;
-       }
-
-       /* Throttle until kswapd wakes the process */
-       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
-
-check_pending:
        if (fatal_signal_pending(current))
                return true;
 
@@ -3673,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       if (!total_swap_pages)
+       if (!can_age_anon_pages(pgdat, sc))
                return;
 
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -4380,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
        pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
 
        if (pgdat->kswapd)
-               return 0;
+               return;
 
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
-       return ret;
 }
 
 /*