Merge branch 'akpm' (patches from Andrew)
[linux-2.6-microblaze.git] mm/vmscan.c
index dff5112..5199b96 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -187,26 +187,36 @@ static DECLARE_RWSEM(shrinker_rwsem);
 #ifdef CONFIG_MEMCG
 static int shrinker_nr_max;
 
+/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
 static inline int shrinker_map_size(int nr_items)
 {
        return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
 }
 
-static void free_shrinker_map_rcu(struct rcu_head *head)
+static inline int shrinker_defer_size(int nr_items)
 {
-       kvfree(container_of(head, struct memcg_shrinker_map, rcu));
+       return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
 }
 
-static int expand_one_shrinker_map(struct mem_cgroup *memcg,
-                                  int size, int old_size)
+static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
+                                                    int nid)
 {
-       struct memcg_shrinker_map *new, *old;
+       return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
+                                        lockdep_is_held(&shrinker_rwsem));
+}
+
+static int expand_one_shrinker_info(struct mem_cgroup *memcg,
+                                   int map_size, int defer_size,
+                                   int old_map_size, int old_defer_size)
+{
+       struct shrinker_info *new, *old;
        struct mem_cgroup_per_node *pn;
        int nid;
+       int size = map_size + defer_size;
 
        for_each_node(nid) {
                pn = memcg->nodeinfo[nid];
-               old = rcu_dereference_protected(pn->shrinker_map, true);
+               old = shrinker_info_protected(memcg, nid);
                /* Not yet online memcg */
                if (!old)
                        return 0;
@@ -215,67 +225,79 @@ static int expand_one_shrinker_map(struct mem_cgroup *memcg,
                if (!new)
                        return -ENOMEM;
 
-               /* Set all old bits, clear all new bits */
-               memset(new->map, (int)0xff, old_size);
-               memset((void *)new->map + old_size, 0, size - old_size);
+               new->nr_deferred = (atomic_long_t *)(new + 1);
+               new->map = (void *)new->nr_deferred + defer_size;
+
+               /* map: set all old bits, clear all new bits */
+               memset(new->map, (int)0xff, old_map_size);
+               memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
+               /* nr_deferred: copy old values, clear all new values */
+               memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
+               memset((void *)new->nr_deferred + old_defer_size, 0,
+                      defer_size - old_defer_size);
 
-               rcu_assign_pointer(pn->shrinker_map, new);
-               call_rcu(&old->rcu, free_shrinker_map_rcu);
+               rcu_assign_pointer(pn->shrinker_info, new);
+               kvfree_rcu(old, rcu);
        }
 
        return 0;
 }
 
-void free_shrinker_maps(struct mem_cgroup *memcg)
+void free_shrinker_info(struct mem_cgroup *memcg)
 {
        struct mem_cgroup_per_node *pn;
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        int nid;
 
-       if (mem_cgroup_is_root(memcg))
-               return;
-
        for_each_node(nid) {
                pn = memcg->nodeinfo[nid];
-               map = rcu_dereference_protected(pn->shrinker_map, true);
-               kvfree(map);
-               rcu_assign_pointer(pn->shrinker_map, NULL);
+               info = rcu_dereference_protected(pn->shrinker_info, true);
+               kvfree(info);
+               rcu_assign_pointer(pn->shrinker_info, NULL);
        }
 }
 
-int alloc_shrinker_maps(struct mem_cgroup *memcg)
+int alloc_shrinker_info(struct mem_cgroup *memcg)
 {
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        int nid, size, ret = 0;
-
-       if (mem_cgroup_is_root(memcg))
-               return 0;
+       int map_size, defer_size = 0;
 
        down_write(&shrinker_rwsem);
-       size = shrinker_map_size(shrinker_nr_max);
+       map_size = shrinker_map_size(shrinker_nr_max);
+       defer_size = shrinker_defer_size(shrinker_nr_max);
+       size = map_size + defer_size;
        for_each_node(nid) {
-               map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
-               if (!map) {
-                       free_shrinker_maps(memcg);
+               info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
+               if (!info) {
+                       free_shrinker_info(memcg);
                        ret = -ENOMEM;
                        break;
                }
-               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
+               info->nr_deferred = (atomic_long_t *)(info + 1);
+               info->map = (void *)info->nr_deferred + defer_size;
+               rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
        }
        up_write(&shrinker_rwsem);
 
        return ret;
 }
 
-static int expand_shrinker_maps(int new_id)
+static inline bool need_expand(int nr_max)
+{
+       return round_up(nr_max, BITS_PER_LONG) >
+              round_up(shrinker_nr_max, BITS_PER_LONG);
+}
+
+static int expand_shrinker_info(int new_id)
 {
-       int size, old_size, ret = 0;
+       int ret = 0;
        int new_nr_max = new_id + 1;
+       int map_size, defer_size = 0;
+       int old_map_size, old_defer_size = 0;
        struct mem_cgroup *memcg;
 
-       size = shrinker_map_size(new_nr_max);
-       old_size = shrinker_map_size(shrinker_nr_max);
-       if (size <= old_size)
+       if (!need_expand(new_nr_max))
                goto out;
 
        if (!root_mem_cgroup)
@@ -283,11 +305,15 @@ static int expand_shrinker_maps(int new_id)
 
        lockdep_assert_held(&shrinker_rwsem);
 
+       map_size = shrinker_map_size(new_nr_max);
+       defer_size = shrinker_defer_size(new_nr_max);
+       old_map_size = shrinker_map_size(shrinker_nr_max);
+       old_defer_size = shrinker_defer_size(shrinker_nr_max);
+
        memcg = mem_cgroup_iter(NULL, NULL, NULL);
        do {
-               if (mem_cgroup_is_root(memcg))
-                       continue;
-               ret = expand_one_shrinker_map(memcg, size, old_size);
+               ret = expand_one_shrinker_info(memcg, map_size, defer_size,
+                                              old_map_size, old_defer_size);
                if (ret) {
                        mem_cgroup_iter_break(NULL, memcg);
                        goto out;
@@ -303,44 +329,34 @@ out:
 void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
 {
        if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
-               struct memcg_shrinker_map *map;
+               struct shrinker_info *info;
 
                rcu_read_lock();
-               map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
+               info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
                /* Pairs with smp mb in shrink_slab() */
                smp_mb__before_atomic();
-               set_bit(shrinker_id, map->map);
+               set_bit(shrinker_id, info->map);
                rcu_read_unlock();
        }
 }
 
-/*
- * We allow subsystems to populate their shrinker-related
- * LRU lists before register_shrinker_prepared() is called
- * for the shrinker, since we don't want to impose
- * restrictions on their internal registration order.
- * In this case shrink_slab_memcg() may find corresponding
- * bit is set in the shrinkers map.
- *
- * This value is used by the function to detect registering
- * shrinkers and to skip do_shrink_slab() calls for them.
- */
-#define SHRINKER_REGISTERING ((struct shrinker *)~0UL)
-
 static DEFINE_IDR(shrinker_idr);
 
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
        int id, ret = -ENOMEM;
 
+       if (mem_cgroup_disabled())
+               return -ENOSYS;
+
        down_write(&shrinker_rwsem);
        /* This may call shrinker, so it must use down_read_trylock() */
-       id = idr_alloc(&shrinker_idr, SHRINKER_REGISTERING, 0, 0, GFP_KERNEL);
+       id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
        if (id < 0)
                goto unlock;
 
        if (id >= shrinker_nr_max) {
-               if (expand_shrinker_maps(id)) {
+               if (expand_shrinker_info(id)) {
                        idr_remove(&shrinker_idr, id);
                        goto unlock;
                }
@@ -358,9 +374,51 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker)
 
        BUG_ON(id < 0);
 
-       down_write(&shrinker_rwsem);
+       lockdep_assert_held(&shrinker_rwsem);
+
        idr_remove(&shrinker_idr, id);
-       up_write(&shrinker_rwsem);
+}
+
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       struct shrinker_info *info;
+
+       info = shrinker_info_protected(memcg, nid);
+       return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
+}
+
+void reparent_shrinker_deferred(struct mem_cgroup *memcg)
+{
+       int i, nid;
+       long nr;
+       struct mem_cgroup *parent;
+       struct shrinker_info *child_info, *parent_info;
+
+       parent = parent_mem_cgroup(memcg);
+       if (!parent)
+               parent = root_mem_cgroup;
+
+       /* Prevent from concurrent shrinker_info expand */
+       down_read(&shrinker_rwsem);
+       for_each_node(nid) {
+               child_info = shrinker_info_protected(memcg, nid);
+               parent_info = shrinker_info_protected(parent, nid);
+               for (i = 0; i < shrinker_nr_max; i++) {
+                       nr = atomic_long_read(&child_info->nr_deferred[i]);
+                       atomic_long_add(nr, &parent_info->nr_deferred[i]);
+               }
+       }
+       up_read(&shrinker_rwsem);
 }
 
 static bool cgroup_reclaim(struct scan_control *sc)
@@ -394,13 +452,25 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 #else
 static int prealloc_memcg_shrinker(struct shrinker *shrinker)
 {
-       return 0;
+       return -ENOSYS;
 }
 
 static void unregister_memcg_shrinker(struct shrinker *shrinker)
 {
 }
 
+static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
+                                  struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
+static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
+                                 struct mem_cgroup *memcg)
+{
+       return 0;
+}
+
 static bool cgroup_reclaim(struct scan_control *sc)
 {
        return false;
@@ -412,6 +482,39 @@ static bool writeback_throttling_sane(struct scan_control *sc)
 }
 #endif
 
+static long xchg_nr_deferred(struct shrinker *shrinker,
+                            struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return xchg_nr_deferred_memcg(nid, shrinker,
+                                             sc->memcg);
+
+       return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+}
+
+
+static long add_nr_deferred(long nr, struct shrinker *shrinker,
+                           struct shrink_control *sc)
+{
+       int nid = sc->nid;
+
+       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+               nid = 0;
+
+       if (sc->memcg &&
+           (shrinker->flags & SHRINKER_MEMCG_AWARE))
+               return add_nr_deferred_memcg(nr, nid, shrinker,
+                                            sc->memcg);
+
+       return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
+}
+
 /*
  * This misses isolated pages which are not accounted for to save counters.
  * As the data only determines if reclaim or compaction continues, it is
@@ -461,8 +564,18 @@ static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
  */
 int prealloc_shrinker(struct shrinker *shrinker)
 {
-       unsigned int size = sizeof(*shrinker->nr_deferred);
+       unsigned int size;
+       int err;
+
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               err = prealloc_memcg_shrinker(shrinker);
+               if (err != -ENOSYS)
+                       return err;
+
+               shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
+       }
 
+       size = sizeof(*shrinker->nr_deferred);
        if (shrinker->flags & SHRINKER_NUMA_AWARE)
                size *= nr_node_ids;
 
@@ -470,26 +583,17 @@ int prealloc_shrinker(struct shrinker *shrinker)
        if (!shrinker->nr_deferred)
                return -ENOMEM;
 
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
-               if (prealloc_memcg_shrinker(shrinker))
-                       goto free_deferred;
-       }
-
        return 0;
-
-free_deferred:
-       kfree(shrinker->nr_deferred);
-       shrinker->nr_deferred = NULL;
-       return -ENOMEM;
 }
 
 void free_prealloced_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
-               return;
-
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
+               down_write(&shrinker_rwsem);
                unregister_memcg_shrinker(shrinker);
+               up_write(&shrinker_rwsem);
+               return;
+       }
 
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
@@ -499,10 +603,7 @@ void register_shrinker_prepared(struct shrinker *shrinker)
 {
        down_write(&shrinker_rwsem);
        list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               idr_replace(&shrinker_idr, shrinker, shrinker->id);
-#endif
+       shrinker->flags |= SHRINKER_REGISTERED;
        up_write(&shrinker_rwsem);
 }
 
@@ -522,13 +623,16 @@ EXPORT_SYMBOL(register_shrinker);
  */
 void unregister_shrinker(struct shrinker *shrinker)
 {
-       if (!shrinker->nr_deferred)
+       if (!(shrinker->flags & SHRINKER_REGISTERED))
                return;
-       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
-               unregister_memcg_shrinker(shrinker);
+
        down_write(&shrinker_rwsem);
        list_del(&shrinker->list);
+       shrinker->flags &= ~SHRINKER_REGISTERED;
+       if (shrinker->flags & SHRINKER_MEMCG_AWARE)
+               unregister_memcg_shrinker(shrinker);
        up_write(&shrinker_rwsem);
+
        kfree(shrinker->nr_deferred);
        shrinker->nr_deferred = NULL;
 }
@@ -545,14 +649,10 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
        long freeable;
        long nr;
        long new_nr;
-       int nid = shrinkctl->nid;
        long batch_size = shrinker->batch ? shrinker->batch
                                          : SHRINK_BATCH;
        long scanned = 0, next_deferred;
 
-       if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
-               nid = 0;
-
        freeable = shrinker->count_objects(shrinker, shrinkctl);
        if (freeable == 0 || freeable == SHRINK_EMPTY)
                return freeable;
@@ -562,9 +662,8 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
         * and zero it so that other concurrent shrinker invocations
         * don't also do this scanning work.
         */
-       nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+       nr = xchg_nr_deferred(shrinker, shrinkctl);
 
-       total_scan = nr;
        if (shrinker->seeks) {
                delta = freeable >> priority;
                delta *= 4;
@@ -578,37 +677,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                delta = freeable / 2;
        }
 
+       total_scan = nr >> priority;
        total_scan += delta;
-       if (total_scan < 0) {
-               pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
-                      shrinker->scan_objects, total_scan);
-               total_scan = freeable;
-               next_deferred = nr;
-       } else
-               next_deferred = total_scan;
-
-       /*
-        * We need to avoid excessive windup on filesystem shrinkers
-        * due to large numbers of GFP_NOFS allocations causing the
-        * shrinkers to return -1 all the time. This results in a large
-        * nr being built up so when a shrink that can do some work
-        * comes along it empties the entire cache due to nr >>>
-        * freeable. This is bad for sustaining a working set in
-        * memory.
-        *
-        * Hence only allow the shrinker to scan the entire cache when
-        * a large delta change is calculated directly.
-        */
-       if (delta < freeable / 4)
-               total_scan = min(total_scan, freeable / 2);
-
-       /*
-        * Avoid risking looping forever due to too large nr value:
-        * never try to free more than twice the estimate number of
-        * freeable entries.
-        */
-       if (total_scan > freeable * 2)
-               total_scan = freeable * 2;
+       total_scan = min(total_scan, (2 * freeable));
 
        trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
                                   freeable, delta, total_scan, priority);
@@ -647,20 +718,20 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
                cond_resched();
        }
 
-       if (next_deferred >= scanned)
-               next_deferred -= scanned;
-       else
-               next_deferred = 0;
+       /*
+        * The deferred work is increased by any new work (delta) that wasn't
+        * done, decreased by old deferred work that was done now.
+        *
+        * And it is capped to two times of the freeable items.
+        */
+       next_deferred = max_t(long, (nr + delta - scanned), 0);
+       next_deferred = min(next_deferred, (2 * freeable));
+
        /*
         * move the unused scan count back into the shrinker in a
-        * manner that handles concurrent updates. If we exhausted the
-        * scan, there is no need to do an update.
+        * manner that handles concurrent updates.
         */
-       if (next_deferred > 0)
-               new_nr = atomic_long_add_return(next_deferred,
-                                               &shrinker->nr_deferred[nid]);
-       else
-               new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
+       new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
 
        trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
        return freed;
@@ -670,7 +741,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
 static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                        struct mem_cgroup *memcg, int priority)
 {
-       struct memcg_shrinker_map *map;
+       struct shrinker_info *info;
        unsigned long ret, freed = 0;
        int i;
 
@@ -680,12 +751,11 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
        if (!down_read_trylock(&shrinker_rwsem))
                return 0;
 
-       map = rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_map,
-                                       true);
-       if (unlikely(!map))
+       info = shrinker_info_protected(memcg, nid);
+       if (unlikely(!info))
                goto unlock;
 
-       for_each_set_bit(i, map->map, shrinker_nr_max) {
+       for_each_set_bit(i, info->map, shrinker_nr_max) {
                struct shrink_control sc = {
                        .gfp_mask = gfp_mask,
                        .nid = nid,
@@ -694,9 +764,9 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
                struct shrinker *shrinker;
 
                shrinker = idr_find(&shrinker_idr, i);
-               if (unlikely(!shrinker || shrinker == SHRINKER_REGISTERING)) {
+               if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
                        if (!shrinker)
-                               clear_bit(i, map->map);
+                               clear_bit(i, info->map);
                        continue;
                }
 
@@ -707,7 +777,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
 
                ret = do_shrink_slab(&sc, shrinker, priority);
                if (ret == SHRINK_EMPTY) {
-                       clear_bit(i, map->map);
+                       clear_bit(i, info->map);
                        /*
                         * After the shrinker reported that it had no objects to
                         * free, but before we cleared the corresponding bit in
@@ -3989,7 +4059,7 @@ static int kswapd(void *p)
 {
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
-       pg_data_t *pgdat = (pg_data_t*)p;
+       pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);