Merge tag 'io_uring-5.15-2021-09-11' of git://git.kernel.dk/linux-block
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f26b247..74296c2 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -524,13 +524,44 @@ static long add_nr_deferred(long nr, struct shrinker *shrinker,
 
 static bool can_demote(int nid, struct scan_control *sc)
 {
-       if (sc->no_demotion)
+       if (!numa_demotion_enabled)
                return false;
+       if (sc) {
+               if (sc->no_demotion)
+                       return false;
+               /* It is pointless to do demotion in memcg reclaim */
+               if (cgroup_reclaim(sc))
+                       return false;
+       }
        if (next_demotion_node(nid) == NUMA_NO_NODE)
                return false;
 
-       // FIXME: actually enable this later in the series
-       return false;
+       return true;
+}
+
+static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
+                                         int nid,
+                                         struct scan_control *sc)
+{
+       if (memcg == NULL) {
+               /*
+                * For non-memcg reclaim, is there
+                * space in any swap device?
+                */
+               if (get_nr_swap_pages() > 0)
+                       return true;
+       } else {
+               /* Is the memcg below its swap limit? */
+               if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
+                       return true;
+       }
+
+       /*
+        * The page cannot be swapped.
+        *
+        * Can it be reclaimed from this node via demotion?
+        */
+       return can_demote(nid, sc);
 }
 
 /*
@@ -544,7 +575,7 @@ unsigned long zone_reclaimable_pages(struct zone *zone)
 
        nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
                zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
                nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
                        zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
 
@@ -908,6 +939,7 @@ out:
 void drop_slab_node(int nid)
 {
        unsigned long freed;
+       int shift = 0;
 
        do {
                struct mem_cgroup *memcg = NULL;
@@ -920,7 +952,7 @@ void drop_slab_node(int nid)
                do {
                        freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
                } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
-       } while (freed > 10);
+       } while ((freed >> shift++) > 1);
 }
 
 void drop_slab(void)
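
The reworked loop condition in drop_slab_node() trades the fixed "freed > 10" cutoff for a threshold that doubles each pass: the loop continues only if at least 2 objects were freed on the first pass, at least 4 on the second, 8 on the third, and so on, so a shrinker that keeps returning a small constant amount can no longer keep drop_caches spinning indefinitely. A standalone sketch of just the termination behavior, with shrink_slab() replaced by a made-up stub:

#include <stdio.h>

/* Made-up stub standing in for shrink_slab(): always "frees" 100 objects. */
static unsigned long fake_shrink_slab(void)
{
	return 100;
}

int main(void)
{
	unsigned long freed;
	int shift = 0, pass = 0;

	do {
		freed = fake_shrink_slab();
		printf("pass %d: freed=%lu, freed >> %d = %lu\n",
		       ++pass, freed, shift, freed >> shift);
	} while ((freed >> shift++) > 1);

	/* With a constant 100 freed per pass, 100 >> 6 == 1 ends the loop,
	 * so it stops after 7 passes instead of running forever. */
	printf("stopped after %d passes\n", pass);
	return 0;
}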
@@ -1318,6 +1350,11 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
                            target_nid, MIGRATE_ASYNC, MR_DEMOTION,
                            &nr_succeeded);
 
+       if (current_is_kswapd())
+               __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
+       else
+               __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+
        return nr_succeeded;
 }
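
With the hunk above, successful demotions are counted separately for kswapd and direct reclaim. Assuming the usual convention that vm_event_item entries show up in /proc/vmstat under their lower-cased names (pgdemote_kswapd and pgdemote_direct here), demotion activity can be watched from userspace with something as small as:

#include <stdio.h>
#include <string.h>

/*
 * Dump the demotion counters from /proc/vmstat.  The "pgdemote_*" field
 * names are an assumption based on the PGDEMOTE_KSWAPD/PGDEMOTE_DIRECT
 * event names; adjust if the running kernel spells them differently.
 */
int main(void)
{
	char line[256];
	FILE *fp = fopen("/proc/vmstat", "r");

	if (!fp) {
		perror("/proc/vmstat");
		return 1;
	}
	while (fgets(line, sizeof(line), fp))
		if (!strncmp(line, "pgdemote_", 9))
			fputs(line, stdout);
	fclose(fp);
	return 0;
}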
 
@@ -1696,11 +1733,14 @@ retry:
                        /* follow __remove_mapping for reference */
                        if (!page_ref_freeze(page, 1))
                                goto keep_locked;
-                       if (PageDirty(page)) {
-                               page_ref_unfreeze(page, 1);
-                               goto keep_locked;
-                       }
-
+                       /*
+                        * The page has only one reference left, which is
+                        * from the isolation. After the caller puts the
+                        * page back on lru and drops the reference, the
+                        * page will be freed anyway. It doesn't matter
+                        * which lru it goes on. So we don't bother checking
+                        * PageDirty here.
+                        */
                        count_vm_event(PGLAZYFREED);
                        count_memcg_page_event(page, PGLAZYFREED);
                } else if (!mapping || !__remove_mapping(mapping, page, true,
@@ -1781,7 +1821,6 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
 {
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_unmap = 1,
        };
        struct reclaim_stat stat;
@@ -2406,7 +2445,6 @@ unsigned long reclaim_pages(struct list_head *page_list)
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .gfp_mask = GFP_KERNEL,
-               .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
@@ -2536,6 +2574,7 @@ enum scan_balance {
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
                           unsigned long *nr)
 {
+       struct pglist_data *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup *memcg = lruvec_memcg(lruvec);
        unsigned long anon_cost, file_cost, total_cost;
        int swappiness = mem_cgroup_swappiness(memcg);
@@ -2546,7 +2585,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
        enum lru_list lru;
 
        /* If we have no swap space, do not bother scanning anon pages. */
-       if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) {
+       if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
                scan_balance = SCAN_FILE;
                goto out;
        }
@@ -2676,7 +2715,7 @@ out:
                        cgroup_size = max(cgroup_size, protection);
 
                        scan = lruvec_size - lruvec_size * protection /
-                               cgroup_size;
+                               (cgroup_size + 1);
 
                        /*
                         * Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2729,6 +2768,21 @@ out:
        }
 }
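
The divisor change to "cgroup_size + 1" a few lines up is a divide-by-zero guard: cgroup_size = max(cgroup_size, protection) still leaves both values at zero when an empty cgroup's protection is being overridden, and lruvec_size * protection / cgroup_size would then divide by zero. Adding 1 makes the degenerate case fall through to scanning everything while changing real results by at most about a page. A worked example of the proportional scan target, using made-up page counts in plain userspace arithmetic:

#include <stdio.h>

/*
 * Worked example of the proportional scan target from get_scan_count():
 * the closer usage (cgroup_size) is to the protected amount, the less
 * of the lruvec gets scanned.
 */
static unsigned long scan_target(unsigned long lruvec_size,
				 unsigned long protection,
				 unsigned long cgroup_size)
{
	if (cgroup_size < protection)
		cgroup_size = protection;       /* cgroup_size = max(...) */

	return lruvec_size - lruvec_size * protection / (cgroup_size + 1);
}

int main(void)
{
	/* 10000-page lruvec, 8000 pages protected, 16000 pages in use:
	 * about half of the lruvec stays scannable (prints 5001). */
	printf("%lu\n", scan_target(10000, 8000, 16000));

	/* Degenerate case: empty cgroup, protection overridden to 0.
	 * With a bare cgroup_size divisor this would be 0 / 0; with the
	 * +1 it simply means "scan the whole lruvec" (prints 10000). */
	printf("%lu\n", scan_target(10000, 0, 0));
	return 0;
}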
 
+/*
+ * Anonymous LRU management is a waste if there is
+ * ultimately no way to reclaim the memory.
+ */
+static bool can_age_anon_pages(struct pglist_data *pgdat,
+                              struct scan_control *sc)
+{
+       /* Aging the anon LRU is valuable if swap is present: */
+       if (total_swap_pages > 0)
+               return true;
+
+       /* Also valuable if anon pages can be demoted: */
+       return can_demote(pgdat->node_id, sc);
+}
+
 static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 {
        unsigned long nr[NR_LRU_LISTS];
@@ -2838,7 +2892,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
         * Even if we did not try to evict anon pages at all, we want to
         * rebalance the anon lru active/inactive ratio.
         */
-       if (total_swap_pages && inactive_is_low(lruvec, LRU_INACTIVE_ANON))
+       if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
+           inactive_is_low(lruvec, LRU_INACTIVE_ANON))
                shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
                                   sc, LRU_ACTIVE_ANON);
 }
@@ -2908,7 +2963,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat,
         */
        pages_for_compaction = compact_gap(sc->order);
        inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
-       if (get_nr_swap_pages() > 0)
+       if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
                inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
 
        return inactive_lru_pages > pages_for_compaction;
@@ -3524,18 +3579,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
         * blocked waiting on the same lock. Instead, throttle for up to a
         * second before continuing.
         */
-       if (!(gfp_mask & __GFP_FS)) {
+       if (!(gfp_mask & __GFP_FS))
                wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
                        allow_direct_reclaim(pgdat), HZ);
+       else
+               /* Throttle until kswapd wakes the process */
+               wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
+                       allow_direct_reclaim(pgdat));
 
-               goto check_pending;
-       }
-
-       /* Throttle until kswapd wakes the process */
-       wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
-               allow_direct_reclaim(pgdat));
-
-check_pending:
        if (fatal_signal_pending(current))
                return true;
 
@@ -3673,7 +3724,7 @@ static void age_active_anon(struct pglist_data *pgdat,
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
-       if (!total_swap_pages)
+       if (!can_age_anon_pages(pgdat, sc))
                return;
 
        lruvec = mem_cgroup_lruvec(NULL, pgdat);
@@ -4380,23 +4431,20 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
  * This kswapd start function will be called by init and node-hot-add.
  * On node-hot-add, kswapd will be moved to the proper cpus if cpus are hot-added.
  */
-int kswapd_run(int nid)
+void kswapd_run(int nid)
 {
        pg_data_t *pgdat = NODE_DATA(nid);
-       int ret = 0;
 
        if (pgdat->kswapd)
-               return 0;
+               return;
 
        pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
        if (IS_ERR(pgdat->kswapd)) {
                /* failure at boot is fatal */
                BUG_ON(system_state < SYSTEM_RUNNING);
                pr_err("Failed to start kswapd on node %d\n", nid);
-               ret = PTR_ERR(pgdat->kswapd);
                pgdat->kswapd = NULL;
        }
-       return ret;
 }
 
 /*