Merge branch 'work.misc' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs

[linux-2.6-microblaze.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 5cb2a58..8d9ceea 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
  
  struct mem_cgroup *root_mem_cgroup __read_mostly;
  
-#define MEM_CGROUP_RECLAIM_RETRIES     5
-
  /* Socket memory accounting disabled? */
  static bool cgroup_memory_nosocket;
  
@@ -350,7 +348,7 @@ static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
  }
  
  /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
   * The main reason for not using cgroup id for this:
   *  this works better in sparse environments, where we have a lot of memcgs,
   *  but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -393,14 +391,12 @@ void memcg_put_cache_ids(void)
  
  /*
   * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
   * conditional to this static branch, we'll have to allow modules that does
   * kmem_cache_alloc and the such to see this symbol as well
   */
  DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
  EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
  #endif
  
  static int memcg_shrinker_map_size;
@@ -569,20 +565,16 @@ ino_t page_cgroup_ino(struct page *page)
         unsigned long ino = 0;
  
         rcu_read_lock();
-       if (PageSlab(page) && !PageTail(page)) {
-               memcg = memcg_from_slab_page(page);
-       } else {
-               memcg = page->mem_cgroup;
+       memcg = page->mem_cgroup;
  
-               /*
-                * The lowest bit set means that memcg isn't a valid
-                * memcg pointer, but a obj_cgroups pointer.
-                * In this case the page is shared and doesn't belong
-                * to any specific memory cgroup.
-                */
-               if ((unsigned long) memcg & 0x1UL)
-                       memcg = NULL;
-       }
+       /*
+        * The lowest bit set means that memcg isn't a valid
+        * memcg pointer, but an obj_cgroups pointer.
+        * In this case the page is shared and doesn't belong
+        * to any specific memory cgroup.
+        */
+       if ((unsigned long) memcg & 0x1UL)
+               memcg = NULL;
  
         while (memcg && !(memcg->css.flags & CSS_ONLINE))
                 memcg = parent_mem_cgroup(memcg);
@@ -1491,7 +1483,7 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                        (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
                        PAGE_SIZE);
         seq_buf_printf(&s, "kernel_stack %llu\n",
-                      (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+                      (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
                        1024);
         seq_buf_printf(&s, "slab %llu\n",
                        (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
@@ -1671,15 +1663,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                 .gfp_mask = gfp_mask,
                 .order = order,
         };
-       bool ret;
+       bool ret = true;
  
         if (mutex_lock_killable(&oom_lock))
                 return true;
+
+       if (mem_cgroup_margin(memcg) >= (1 << order))
+               goto unlock;
+
         /*
          * A few threads which were not waiting at mutex_lock_killable() can
          * fail to bail out. Therefore, check again after holding oom_lock.
          */
         ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
         mutex_unlock(&oom_lock);
         return ret;
  }
@@ -2369,18 +2367,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
         return 0;
  }
  
-static void reclaim_high(struct mem_cgroup *memcg,
-                        unsigned int nr_pages,
-                        gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+                                 unsigned int nr_pages,
+                                 gfp_t gfp_mask)
  {
+       unsigned long nr_reclaimed = 0;
+
         do {
+               unsigned long pflags;
+
                 if (page_counter_read(&memcg->memory) <=
                     READ_ONCE(memcg->memory.high))
                         continue;
+
                 memcg_memory_event(memcg, MEMCG_HIGH);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+               psi_memstall_enter(&pflags);
+               nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+                                                            gfp_mask, true);
+               psi_memstall_leave(&pflags);
         } while ((memcg = parent_mem_cgroup(memcg)) &&
                  !mem_cgroup_is_root(memcg));
+
+       return nr_reclaimed;
  }
  
  static void high_work_func(struct work_struct *work)
@@ -2536,16 +2545,32 @@ void mem_cgroup_handle_over_high(void)
  {
         unsigned long penalty_jiffies;
         unsigned long pflags;
+       unsigned long nr_reclaimed;
         unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       int nr_retries = MAX_RECLAIM_RETRIES;
         struct mem_cgroup *memcg;
+       bool in_retry = false;
  
         if (likely(!nr_pages))
                 return;
  
         memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
         current->memcg_nr_pages_over_high = 0;
  
+retry_reclaim:
+       /*
+        * The allocating task should reclaim at least the batch size, but for
+        * subsequent retries we only want to do what's necessary to prevent oom
+        * or breaching resource isolation.
+        *
+        * This is distinct from memory.max or page allocator behaviour because
+        * memory.high is currently batched, whereas memory.max and the page
+        * allocator run every time an allocation is made.
+        */
+       nr_reclaimed = reclaim_high(memcg,
+                                   in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+                                   GFP_KERNEL);
+
         /*
          * memory.high is breached and reclaim is unable to keep up. Throttle
          * allocators proactively to slow down excessive growth.
@@ -2572,6 +2597,16 @@ void mem_cgroup_handle_over_high(void)
         if (penalty_jiffies <= HZ / 100)
                 goto out;
  
+       /*
+        * If reclaim is making forward progress but we're still over
+        * memory.high, we want to encourage that rather than doing allocator
+        * throttling.
+        */
+       if (nr_reclaimed || nr_retries--) {
+               in_retry = true;
+               goto retry_reclaim;
+       }
+
         /*
          * If we exit early, we're guaranteed to die (since
          * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
@@ -2589,13 +2624,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                       unsigned int nr_pages)
  {
         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
         struct mem_cgroup *mem_over_limit;
         struct page_counter *counter;
+       enum oom_status oom_status;
         unsigned long nr_reclaimed;
         bool may_swap = true;
         bool drained = false;
-       enum oom_status oom_status;
+       unsigned long pflags;
  
         if (mem_cgroup_is_root(memcg))
                 return 0;
@@ -2655,8 +2691,10 @@ retry:
  
         memcg_memory_event(mem_over_limit, MEMCG_MAX);
  
+       psi_memstall_enter(&pflags);
         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                     gfp_mask, may_swap);
+       psi_memstall_leave(&pflags);
  
         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                 goto retry;
@@ -2708,7 +2746,7 @@ retry:
                        get_order(nr_pages * PAGE_SIZE));
         switch (oom_status) {
         case OOM_SUCCESS:
-               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               nr_retries = MAX_RECLAIM_RETRIES;
                 goto retry;
         case OOM_FAILED:
                 goto force;
@@ -2806,6 +2844,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
  }
  
  #ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+                                gfp_t gfp)
+{
+       unsigned int objects = objs_per_slab_page(s, page);
+       void *vec;
+
+       vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+                          page_to_nid(page));
+       if (!vec)
+               return -ENOMEM;
+
+       if (cmpxchg(&page->obj_cgroups, NULL,
+                   (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+               kfree(vec);
+       else
+               kmemleak_not_leak(vec);
+
+       return 0;
+}
+
  /*
   * Returns a pointer to the memory cgroup to which the kernel object is charged.
   *
@@ -2822,12 +2880,21 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
         page = virt_to_head_page(p);
  
         /*
-        * Slab pages don't have page->mem_cgroup set because corresponding
-        * kmem caches can be reparented during the lifetime. That's why
-        * memcg_from_slab_page() should be used instead.
+        * Slab objects are accounted individually, not per-page.
+        * Memcg membership data for each individual object is saved in
+        * the page->obj_cgroups.
          */
-       if (PageSlab(page))
-               return memcg_from_slab_page(page);
+       if (page_has_obj_cgroups(page)) {
+               struct obj_cgroup *objcg;
+               unsigned int off;
+
+               off = obj_to_index(page->slab_cache, page, p);
+               objcg = page_obj_cgroups(page)[off];
+               if (objcg)
+                       return obj_cgroup_memcg(objcg);
+
+               return NULL;
+       }
  
         /* All other pages use page->mem_cgroup */
         return page->mem_cgroup;
@@ -2882,9 +2949,7 @@ static int memcg_alloc_cache_id(void)
         else if (size > MEMCG_CACHES_MAX_SIZE)
                 size = MEMCG_CACHES_MAX_SIZE;
  
-       err = memcg_update_all_caches(size);
-       if (!err)
-               err = memcg_update_all_list_lrus(size);
+       err = memcg_update_all_list_lrus(size);
         if (!err)
                 memcg_nr_cache_ids = size;
  
@@ -2902,148 +2967,6 @@ static void memcg_free_cache_id(int id)
         ida_simple_remove(&memcg_cache_ida, id);
  }
  
-struct memcg_kmem_cache_create_work {
-       struct mem_cgroup *memcg;
-       struct kmem_cache *cachep;
-       struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
-       struct memcg_kmem_cache_create_work *cw =
-               container_of(w, struct memcg_kmem_cache_create_work, work);
-       struct mem_cgroup *memcg = cw->memcg;
-       struct kmem_cache *cachep = cw->cachep;
-
-       memcg_create_kmem_cache(memcg, cachep);
-
-       css_put(&memcg->css);
-       kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
-                                              struct kmem_cache *cachep)
-{
-       struct memcg_kmem_cache_create_work *cw;
-
-       if (!css_tryget_online(&memcg->css))
-               return;
-
-       cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
-       if (!cw) {
-               css_put(&memcg->css);
-               return;
-       }
-
-       cw->memcg = memcg;
-       cw->cachep = cachep;
-       INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
-       queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
-                                       struct obj_cgroup **objcgp)
-{
-       struct mem_cgroup *memcg;
-       struct kmem_cache *memcg_cachep;
-       struct memcg_cache_array *arr;
-       int kmemcg_id;
-
-       VM_BUG_ON(!is_root_cache(cachep));
-
-       if (memcg_kmem_bypass())
-               return cachep;
-
-       rcu_read_lock();
-
-       if (unlikely(current->active_memcg))
-               memcg = current->active_memcg;
-       else
-               memcg = mem_cgroup_from_task(current);
-
-       if (!memcg || memcg == root_mem_cgroup)
-               goto out_unlock;
-
-       kmemcg_id = READ_ONCE(memcg->kmemcg_id);
-       if (kmemcg_id < 0)
-               goto out_unlock;
-
-       arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
-       /*
-        * Make sure we will access the up-to-date value. The code updating
-        * memcg_caches issues a write barrier to match the data dependency
-        * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
-        */
-       memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
-       /*
-        * If we are in a safe context (can wait, and not in interrupt
-        * context), we could be be predictable and return right away.
-        * This would guarantee that the allocation being performed
-        * already belongs in the new cache.
-        *
-        * However, there are some clashes that can arrive from locking.
-        * For instance, because we acquire the slab_mutex while doing
-        * memcg_create_kmem_cache, this means no further allocation
-        * could happen with the slab_mutex held. So it's better to
-        * defer everything.
-        *
-        * If the memcg is dying or memcg_cache is about to be released,
-        * don't bother creating new kmem_caches. Because memcg_cachep
-        * is ZEROed as the fist step of kmem offlining, we don't need
-        * percpu_ref_tryget_live() here. css_tryget_online() check in
-        * memcg_schedule_kmem_cache_create() will prevent us from
-        * creation of a new kmem_cache.
-        */
-       if (unlikely(!memcg_cachep))
-               memcg_schedule_kmem_cache_create(memcg, cachep);
-       else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) {
-               struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
-
-               if (!objcg || !obj_cgroup_tryget(objcg)) {
-                       percpu_ref_put(&memcg_cachep->memcg_params.refcnt);
-                       goto out_unlock;
-               }
-
-               *objcgp = objcg;
-               cachep = memcg_cachep;
-       }
-out_unlock:
-       rcu_read_unlock();
-       return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
-       if (!is_root_cache(cachep))
-               percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
  /**
   * __memcg_kmem_charge: charge a number of kernel pages to a memcg
   * @memcg: memory cgroup to charge
@@ -3503,7 +3426,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
   */
  static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
  {
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
  
         /* we call try-to-free pages for make this cgroup empty */
         lru_add_drain_all();
@@ -3731,7 +3654,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
          */
         memcg->kmemcg_id = memcg_id;
         memcg->kmem_state = KMEM_ONLINE;
-       INIT_LIST_HEAD(&memcg->kmem_caches);
  
         return 0;
  }
@@ -3744,22 +3666,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
  
         if (memcg->kmem_state != KMEM_ONLINE)
                 return;
-       /*
-        * Clear the online state before clearing memcg_caches array
-        * entries. The slab_mutex in memcg_deactivate_kmem_caches()
-        * guarantees that no cache will be created for this cgroup
-        * after we are done (see memcg_create_kmem_cache()).
-        */
+
         memcg->kmem_state = KMEM_ALLOCATED;
  
         parent = parent_mem_cgroup(memcg);
         if (!parent)
                 parent = root_mem_cgroup;
  
-       /*
-        * Deactivate and reparent kmem_caches and objcgs.
-        */
-       memcg_deactivate_kmem_caches(memcg, parent);
         memcg_reparent_objcgs(memcg, parent);
  
         kmemcg_id = memcg->kmemcg_id;
@@ -5384,9 +5297,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
  
         /* The following stuff does not apply to the root */
         if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
-               INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
                 root_mem_cgroup = memcg;
                 return &memcg->css;
         }
@@ -6337,7 +6247,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                  char *buf, size_t nbytes, loff_t off)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_retries = MAX_RECLAIM_RETRIES;
         bool drained = false;
         unsigned long high;
         int err;
@@ -6347,8 +6257,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
         if (err)
                 return err;
  
-       page_counter_set_high(&memcg->memory, high);
-
         for (;;) {
                 unsigned long nr_pages = page_counter_read(&memcg->memory);
                 unsigned long reclaimed;
@@ -6372,6 +6280,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                         break;
         }
  
+       page_counter_set_high(&memcg->memory, high);
+
+       memcg_wb_domain_size_changed(memcg);
+
         return nbytes;
  }
  
@@ -6385,7 +6297,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
         bool drained = false;
         unsigned long max;
         int err;
@@ -6692,40 +6604,42 @@ static unsigned long effective_protection(unsigned long usage,
   *
   * WARNING: This function is not stateless! It can only be used as part
   *          of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- *   MEMCG_PROT_NONE: cgroup memory is not protected
- *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
- *     an unprotected supply of reclaimable memory from other cgroups.
- *   MEMCG_PROT_MIN: cgroup memory is protected
   */
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
-                                               struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                    struct mem_cgroup *memcg)
  {
         unsigned long usage, parent_usage;
         struct mem_cgroup *parent;
  
         if (mem_cgroup_disabled())
-               return MEMCG_PROT_NONE;
+               return;
  
         if (!root)
                 root = root_mem_cgroup;
+
+       /*
+        * Effective values of the reclaim targets are ignored so they
+        * can be stale. Have a look at mem_cgroup_protection for more
+        * details.
+        * TODO: calculation should be more robust so that we do not need
+        * that special casing.
+        */
         if (memcg == root)
-               return MEMCG_PROT_NONE;
+               return;
  
         usage = page_counter_read(&memcg->memory);
         if (!usage)
-               return MEMCG_PROT_NONE;
+               return;
  
         parent = parent_mem_cgroup(memcg);
         /* No parent means a non-hierarchical mode on v1 memcg */
         if (!parent)
-               return MEMCG_PROT_NONE;
+               return;
  
         if (parent == root) {
                 memcg->memory.emin = READ_ONCE(memcg->memory.min);
                 memcg->memory.elow = READ_ONCE(memcg->memory.low);
-               goto out;
+               return;
         }
  
         parent_usage = page_counter_read(&parent->memory);
@@ -6739,14 +6653,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
                         READ_ONCE(memcg->memory.low),
                         READ_ONCE(parent->memory.elow),
                         atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
-       if (usage <= memcg->memory.emin)
-               return MEMCG_PROT_MIN;
-       else if (usage <= memcg->memory.elow)
-               return MEMCG_PROT_LOW;
-       else
-               return MEMCG_PROT_NONE;
  }
  
  /**
@@ -7121,17 +7027,6 @@ static int __init mem_cgroup_init(void)
  {
         int cpu, node;
  
-#ifdef CONFIG_MEMCG_KMEM
-       /*
-        * Kmem cache creation is mostly done with the slab_mutex held,
-        * so use a workqueue with limited concurrency to avoid stalling
-        * all worker threads in case lots of cgroups are created and
-        * destroyed simultaneously.
-        */
-       memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
-       BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
         cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
                                   memcg_hotplug_cpu_dead);