Merge tag 'block-5.13-2021-05-14' of git://git.kernel.dk/linux-block
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 562e87c..5199b96 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -185,39 +185,181 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_MEMCG
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
+static int shrinker_nr_max;
+
+/* The shrinker_info is expanded in batches of BITS_PER_LONG */
+static inline int shrinker_map_size(int nr_items)
+{
+       return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
+}
+
+static inline int shrinker_defer_size(int nr_items)
+{
+       return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
+}
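
Both helpers size the flexible area that trails a struct shrinker_info, so one per-node allocation covers the header, the deferred counters and the bitmap. For reference, the structure they describe (as introduced in include/linux/memcontrol.h by this series) is roughly:

    struct shrinker_info {
            struct rcu_head rcu;            /* for kvfree_rcu() on expand/free */
            atomic_long_t *nr_deferred;     /* points just past the struct */
            unsigned long *map;             /* points defer_size bytes further in */
    };

Worked sizes on a 64-bit machine with a single shrinker id: shrinker_map_size(1) = DIV_ROUND_UP(1, 64) * 8 = 8 bytes and shrinker_defer_size(1) = round_up(1, 64) * 8 = 512 bytes; both areas grow in batches of BITS_PER_LONG ids.
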
+
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+                                                    int nid)
+{
+       return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+                                        lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+                                   int map_size, int defer_size,
+                                   int old_map_size, int old_defer_size)
+{
+       struct shrinker_info *new, *old;
+       struct mem_cgroup_per_node *pn;
+       int nid;
+       int size = map_size + defer_size;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               old = shrinker_info_protected(memcg, nid);
+               /* Not yet online memcg */
+               if (!old)
+                       return 0;
+
+               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
+               if (!new)
+                       return -ENOMEM;
+
+               new->nr_deferred = (atomic_long_t *)(new + 1);
+               new->map = (void *)new->nr_deferred + defer_size;
+
+               /* map: set all old bits, clear all new bits */
+               memset(new->map, (int)0xff, old_map_size);
+               memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+               /* nr_deferred: copy old values, clear all new values */
+               memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+               memset((void *)new->nr_deferred + old_defer_size, 0,
+                      defer_size - old_defer_size);
+
+               rcu_assign_pointer(pn->shrinker_info, new);
+               kvfree_rcu(old, rcu);
+       }
+
+       return 0;
+}
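
Note the asymmetry above: nr_deferred values are copied verbatim, but the old part of the map is not copied, it is set to all-ones. A spuriously set bit is harmless (one extra do_shrink_slab() call that returns SHRINK_EMPTY and clears the bit again), while a lost bit could strand objects, so re-setting every old bit is the conservative choice. For the first expansion past 64 ids, old_map_size = 8 and map_size = 16, so the first 8 bytes of the new map become 0xff and the next 8 are zeroed.
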
+
+void free_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct mem_cgroup_per_node *pn;
+       struct shrinker_info *info;
+       int nid;
+
+       for_each_node(nid) {
+               pn = memcg->nodeinfo[nid];
+               info = rcu_dereference_protected(pn->shrinker_info, true);
+               kvfree(info);
+               rcu_assign_pointer(pn->shrinker_info, NULL);
+       }
+}
+
+int alloc_shrinker_info(struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+       int nid, size, ret = 0;
+       int map_size, defer_size = 0;
+
+       down_write(&shrinker_rwsem);
+       map_size = shrinker_map_size(shrinker_nr_max);
+       defer_size = shrinker_defer_size(shrinker_nr_max);
+       size = map_size + defer_size;
+       for_each_node(nid) {
+               info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+               if (!info) {
+                       free_shrinker_info(memcg);
+                       ret = -ENOMEM;
+                       break;
+               }
+               info->nr_deferred = (atomic_long_t *)(info + 1);
+               info->map = (void *)info->nr_deferred + defer_size;
+               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
+       }
+       up_write(&shrinker_rwsem);
+
+       return ret;
+}
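
This runs when a memcg comes online; in this series the caller is mem_cgroup_css_online(). A minimal sketch of that call site (details elided):

    /* sketch: mm/memcontrol.c */
    static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
    {
            struct mem_cgroup *memcg = mem_cgroup_from_css(css);

            if (alloc_shrinker_info(memcg))
                    return -ENOMEM;
            /* ... */
            return 0;
    }

The down_write() in alloc_shrinker_info() keeps shrinker_nr_max, and hence both sizes, stable against a concurrent expand_shrinker_info().
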
+
+static inline bool need_expand(int nr_max)
+{
+       return round_up(nr_max, BITS_PER_LONG) >
+              round_up(shrinker_nr_max, BITS_PER_LONG);
+}
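
The rounding makes expansion a batched operation. Example: with shrinker_nr_max = 64, id 64 triggers an expand because round_up(65, 64) = 128 > round_up(64, 64) = 64; the new map and nr_deferred area then cover ids up to 127, so subsequent ids register without another reallocation.
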
+
+static int expand_shrinker_info(int new_id)
+{
+       int ret = 0;
+       int new_nr_max = new_id + 1;
+       int map_size, defer_size = 0;
+       int old_map_size, old_defer_size = 0;
+       struct mem_cgroup *memcg;
+
+       if (!need_expand(new_nr_max))
+               goto out;
+
+       if (!root_mem_cgroup)
+               goto out;
+
+       lockdep_assert_held(&shrinker_rwsem);
+
+       map_size = shrinker_map_size(new_nr_max);
+       defer_size = shrinker_defer_size(new_nr_max);
+       old_map_size = shrinker_map_size(shrinker_nr_max);
+       old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
+       memcg = mem_cgroup_iter(NULL, NULL, NULL);
+       do {
+               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+                                              old_map_size, old_defer_size);
+               if (ret) {
+                       mem_cgroup_iter_break(NULL, memcg);
+                       goto out;
+               }
+       } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+out:
+       if (!ret)
+               shrinker_nr_max = new_nr_max;
+
+       return ret;
+}
+
+void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
+{
+       if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
+               struct shrinker_info *info;
+
+               rcu_read_lock();
+               info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
+               /* Pairs with smp_mb__after_atomic() in shrink_slab_memcg() */
+               smp_mb__before_atomic();
+               set_bit(shrinker_id, info->map);
+               rcu_read_unlock();
+       }
+}
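
The main writer of these bits is the list_lru code: as soon as the first object lands on a memcg's LRU the shrinker id is flagged, so shrink_slab_memcg() knows to visit it. A simplified sketch of that caller (the shape of list_lru_add() in this kernel, locking elided):

    if (list_empty(item)) {
            l = list_lru_from_kmem(nlru, item, &memcg);
            list_add_tail(item, &l->list);
            /* set the bit when the first element is added */
            if (!l->nr_items++)
                    set_shrinker_bit(memcg, nid, lru_shrinker_id(lru));
    }

The smp_mb__before_atomic() orders the caller's list_add_tail() store before the set_bit(), pairing with the barrier on the reader side in shrink_slab_memcg().
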
 
 static DEFINE_IDR(shrinker_idr);
-static int shrinker_nr_max;
 
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
        int id, ret = -ENOMEM;
 
+       if (mem_cgroup_disabled())
+               return -ENOSYS;
+
        down_write(&shrinker_rwsem);
        /* This may call a shrinker, so shrink_slab() must use down_read_trylock() */
-       id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+       id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto unlock;
 
        if (id >= shrinker_nr_max) {
-               if (memcg_expand_shrinker_maps(id)) {
+               if (expand_shrinker_info(id)) {
                        idr_remove(&shrinker_idr, id);
                        goto unlock;
                }
-
-               shrinker_nr_max = id + 1;
        }
        shrinker->id = id;
        ret = 0;
@@ -232,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 
        BUG_ON(id < 0);
 
-       down_write(&shrinker_rwsem);
+       lockdep_assert_held(&shrinker_rwsem);
+
        idr_remove(&shrinker_idr, id);
-       up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+       int i, nid;
+       long nr;
+       struct mem_cgroup *parent;
+       struct shrinker_info *child_info, *parent_info;
+
+       parent = parent_mem_cgroup(memcg);
+       if (!parent)
+               parent = root_mem_cgroup;
+
+       /* Prevent concurrent shrinker_info expansion */
+       down_read(&shrinker_rwsem);
+       for_each_node(nid) {
+               child_info = shrinker_info_protected(memcg, nid);
+               parent_info = shrinker_info_protected(parent, nid);
+               for (i = 0; i < shrinker_nr_max; i++) {
+                       nr = atomic_long_read(&child_info->nr_deferred[i]);
+                       atomic_long_add(nr, &parent_info->nr_deferred[i]);
+               }
+       }
+       up_read(&shrinker_rwsem);
 }
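
In this series the caller is the memcg offline path (mem_cgroup_css_offline()), so a dying child's backlog moves to its parent instead of vanishing with the child's shrinker_info: if the child had nr_deferred[i] = 300 on some node, the parent's counter grows by 300 and a later shrink of the parent works it off. Only the read side of shrinker_rwsem is needed because the counters are updated atomically; the lock merely prevents the two shrinker_info allocations from being reallocated under the loop.
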
 
 static bool cgroup_reclaim(struct scan_control *sc)
@@ -268,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #else
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
-       return 0;
+       return -ENOSYS;
 }
 
 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 {
 }
 
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
        return false;
@@ -286,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static long xchg_nr_deferred(struct shrinker *shrinker,
+                            struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return xchg_nr_deferred_memcg(nid, shrinker,
+                                             sc->memcg);
+
+       return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+                           struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return add_nr_deferred_memcg(nr, nid, shrinker,
+                                            sc->memcg);
+
+       return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
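
These two wrappers are the single dispatch point for the per-memcg deferral. The routing, restated (nid = sc->nid for SHRINKER_NUMA_AWARE shrinkers, else 0):

    sc->memcg && SHRINKER_MEMCG_AWARE  ->  memcg's per-node shrinker_info->nr_deferred[shrinker->id]
    otherwise                          ->  shrinker->nr_deferred[nid]

So a memcg-aware shrinker never touches shrinker->nr_deferred (it is not even allocated any more, see prealloc_shrinker() below); the per-shrinker array remains only for shrinkers that are not memcg aware.
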
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -335,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
  */
 int prealloc_shrinker(struct shrinker *shrinker)
 {
-       unsigned int size = sizeof(*shrinker->nr_deferred);
+       unsigned int size;
+       int err;
+
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               err = prealloc_memcg_shrinker(shrinker);
+               if (err != -ENOSYS)
+                       return err;
+
+               shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+       }
 
+       size = sizeof(*shrinker->nr_deferred);
        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;
 
@@ -344,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
        if (!shrinker->nr_deferred)
                return -ENOMEM;
 
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-               if (prealloc_memcg_shrinker(shrinker))
-                       goto free_deferred;
-       }
-
        return 0;
-
-free_deferred:
-       kfree(shrinker->nr_deferred);
-       shrinker->nr_deferred = NULL;
-       return -ENOMEM;
 }
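
The flow is now: try the memcg path first; -ENOSYS means the memcg infrastructure is unavailable (CONFIG_MEMCG off, or mem_cgroup_disabled()), in which case the SHRINKER_MEMCG_AWARE flag is quietly dropped and the shrinker degrades to a plain per-node one. In the memcg-aware case no nr_deferred array is allocated at all, since deferral lives in shrinker_info. A hedged usage sketch (example_count/example_scan are placeholders, not kernel symbols):

    static struct shrinker example_shrinker = {
            .count_objects = example_count,
            .scan_objects  = example_scan,
            .seeks         = DEFAULT_SEEKS,
            .flags         = SHRINKER_MEMCG_AWARE | SHRINKER_NUMA_AWARE,
    };

    if (!prealloc_shrinker(&example_shrinker))
            register_shrinker_prepared(&example_shrinker);
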
 
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
-               return;
-
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               down_write(&shrinker_rwsem);
                unregister_memcg_shrinker(shrinker);
+               up_write(&shrinker_rwsem);
+               return;
+       }
 
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
@@ -373,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+       shrinker->flags |= SHRINKER_REGISTERED;
        up_write(&shrinker_rwsem);
 }
 
@@ -396,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
+       if (!(shrinker->flags & SHRINKER_REGISTERED))
                return;
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               unregister_memcg_shrinker(shrinker);
+
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
+       shrinker->flags &= ~SHRINKER_REGISTERED;
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+               unregister_memcg_shrinker(shrinker);
        up_write(&shrinker_rwsem);
+
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
 }
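
The ordering is the point of the new flag: SHRINKER_REGISTERED is cleared and the idr entry removed with shrinker_rwsem held for write, while shrink_slab_memcg() checks the flag with the same semaphore held for read before calling into the shrinker:

    unregister_shrinker()               shrink_slab_memcg()
      down_write(&shrinker_rwsem)         down_read_trylock(&shrinker_rwsem)
      clear SHRINKER_REGISTERED           shrinker = idr_find(&shrinker_idr, i)
      idr_remove(...)                     if (!(flags & SHRINKER_REGISTERED))
      up_write(&shrinker_rwsem)                   continue;
      kfree(nr_deferred)

Whichever side takes the semaphore second sees a consistent view, so once up_write() returns no reclaim path can still reach the shrinker and the kfree() needs no further synchronization. This is what lets the old SHRINKER_REGISTERING idr placeholder go away.
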
@@ -419,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        long freeable;
        long nr;
        long new_nr;
-       int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;
 
-       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-               nid = 0;
-
        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;
@@ -436,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
-       nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+       nr = xchg_nr_deferred(shrinker, shrinkctl);
 
-       total_scan = nr;
        if (shrinker->seeks) {
                delta = freeable >> priority;
                delta *= 4;
@@ -452,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                delta = freeable / 2;
        }
 
+       total_scan = nr >> priority;
        total_scan += delta;
-       if (total_scan < 0) {
-               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
-                      shrinker->scan_objects, total_scan);
-               total_scan = freeable;
-               next_deferred = nr;
-       } else
-               next_deferred = total_scan;
-
-       /*
-        * We need to avoid excessive windup on filesystem shrinkers
-        * due to large numbers of GFP_NOFS allocations causing the
-        * shrinkers to return -1 all the time. This results in a large
-        * nr being built up so when a shrink that can do some work
-        * comes along it empties the entire cache due to nr >>>
-        * freeable. This is bad for sustaining a working set in
-        * memory.
-        *
-        * Hence only allow the shrinker to scan the entire cache when
-        * a large delta change is calculated directly.
-        */
-       if (delta < freeable / 4)
-               total_scan = min(total_scan, freeable / 2);
-
-       /*
-        * Avoid risking looping forever due to too large nr value:
-        * never try to free more than twice the estimate number of
-        * freeable entries.
-        */
-       if (total_scan > freeable * 2)
-               total_scan = freeable * 2;
+       total_scan = min(total_scan, (2 * freeable));
 
        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);
@@ -521,22 +718,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                cond_resched();
        }
 
-       if (next_deferred >= scanned)
-               next_deferred -= scanned;
-       else
-               next_deferred = 0;
+       /*
+        * The deferred work is increased by any new work (delta) that wasn't
+        * done, and decreased by the old deferred work that was done now.
+        *
+        * It is capped to two times the number of freeable items.
+        */
+       next_deferred = max_t(long, (nr + delta - scanned), 0);
+       next_deferred = min(next_deferred, (2 * freeable));
+
        /*
         * move the unused scan count back into the shrinker in a
-        * manner that handles concurrent updates. If we exhausted the
-        * scan, there is no need to do an update.
+        * manner that handles concurrent updates.
         */
-       if (next_deferred > 0)
-               new_nr = atomic_long_add_return(next_deferred,
-                                               &shrinker->nr_deferred[nid]);
-       else
-               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+       new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 
-       trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan);
+       trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr,
+                                total_scan);
        return freed;
 }
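
Worked numbers for the new deferral math (illustrative values): freeable = 10000, seeks = DEFAULT_SEEKS (so delta = 4 * (freeable >> priority) / 2), previously deferred nr = 8192, default batch size 128:

    priority = 12:  delta = 4 * (10000 >> 12) / 2 = 4
                    total_scan = (8192 >> 12) + 4 = 6      (below batch size, nothing scanned)
                    next_deferred = 8192 + 4 - 0 = 8196
    priority = 1:   delta = 4 * (10000 >> 1) / 2 = 10000
                    total_scan = (8192 >> 1) + 10000 = 14096  (under the 2 * freeable = 20000 cap)

Under light pressure the backlog is barely touched and keeps accumulating; as priority rises it is drained proportionally. This replaces the deleted windup heuristics above, which either scanned the whole backlog at once or clamped the scan to freeable / 2.
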
 
@@ -544,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
 {
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int i;
 
@@ -554,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
        if (!down_read_trylock(&shrinker_rwsem))
                return 0;
 
-       map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
-                                       true);
-       if (unlikely(!map))
+       info = shrinker_info_protected(memcg, nid);
+       if (unlikely(!info))
                goto unlock;
 
-       for_each_set_bit(i, map->map, shrinker_nr_max) {
+       for_each_set_bit(i, info->map, shrinker_nr_max) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -568,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                struct shrinker *shrinker;
 
                shrinker = idr_find(&shrinker_idr, i);
-               if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+               if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
                        if (!shrinker)
-                               clear_bit(i, map->map);
+                               clear_bit(i, info->map);
                        continue;
                }
 
@@ -581,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 
                ret = do_shrink_slab(&sc, shrinker, priority);
                if (ret == SHRINK_EMPTY) {
-                       clear_bit(i, map->map);
+                       clear_bit(i, info->map);
                        /*
                         * After the shrinker reported that it had no objects to
                         * free, but before we cleared the corresponding bit in
@@ -590,7 +786,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                         * case, we invoke the shrinker one more time and reset
                         * the bit if it reports that it is not empty anymore.
                         * The memory barrier here pairs with the barrier in
-                        * memcg_set_shrinker_bit():
+                        * set_shrinker_bit():
                         *
                         * list_lru_add()     shrink_slab_memcg()
                         *   list_add_tail()    clear_bit()
@@ -602,7 +798,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        if (ret == SHRINK_EMPTY)
                                ret = 0;
                        else
-                               memcg_set_shrinker_bit(memcg, nid, i);
+                               set_shrinker_bit(memcg, nid, i);
                }
                freed += ret;
 
@@ -1507,8 +1703,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
        LIST_HEAD(clean_pages);
 
        list_for_each_entry_safe(page, next, page_list, lru) {
-               if (page_is_file_lru(page) && !PageDirty(page) &&
-                   !__PageMovable(page) && !PageUnevictable(page)) {
+               if (!PageHuge(page) && page_is_file_lru(page) &&
+                   !PageDirty(page) && !__PageMovable(page) &&
+                   !PageUnevictable(page)) {
                        ClearPageActive(page);
                        list_move(&page->lru, &clean_pages);
                }
@@ -3862,7 +4059,7 @@ static int kswapd(void *p)
 {
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
 
@@ -4085,14 +4282,6 @@ module_init(kswapd_init)
  */
 int node_reclaim_mode __read_mostly;
 
-/*
- * These bit locations are exposed in the vm.zone_reclaim_mode sysctl
- * ABI.  New bits are OK, but existing bits can never change.
- */
-#define RECLAIM_ZONE  (1<<0)   /* Run shrink_inactive_list on the zone */
-#define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
-#define RECLAIM_UNMAP (1<<2)   /* Unmap pages during reclaim */
-
 /*
  * Priority for NODE_RECLAIM. This determines the fraction of pages
  * of a node considered for each zone_reclaim. 4 scans 1/16th of