diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 13f559a..b807952 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 
-#define MEM_CGROUP_RECLAIM_RETRIES     5
-
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
 
@@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+extern spinlock_t css_set_lock;
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+       struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+       struct mem_cgroup *memcg;
+       unsigned int nr_bytes;
+       unsigned int nr_pages;
+       unsigned long flags;
+
+       /*
+        * At this point all allocated objects are freed, so
+        * objcg->nr_charged_bytes can't hold an arbitrary byte value:
+        * it must be a whole number of pages (x * PAGE_SIZE).
+        *
+        * The following sequence can lead to it (see the worked model
+        * after this function):
+        * 1) CPU0: objcg == stock->cached_objcg
+        * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+        *          PAGE_SIZE bytes are charged
+        * 3) CPU1: a process from another memcg is allocating something,
+        *          the stock is flushed,
+        *          objcg->nr_charged_bytes = PAGE_SIZE - 92
+        * 4) CPU0: we release this object,
+        *          92 bytes are added to stock->nr_bytes
+        * 5) CPU0: stock is flushed,
+        *          92 bytes are added to objcg->nr_charged_bytes
+        *
+        * As a result, nr_charged_bytes == PAGE_SIZE.
+        * This page will be uncharged in obj_cgroup_release().
+        */
+       nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+       WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+       nr_pages = nr_bytes >> PAGE_SHIFT;
+
+       spin_lock_irqsave(&css_set_lock, flags);
+       memcg = obj_cgroup_memcg(objcg);
+       if (nr_pages)
+               __memcg_kmem_uncharge(memcg, nr_pages);
+       list_del(&objcg->list);
+       mem_cgroup_put(memcg);
+       spin_unlock_irqrestore(&css_set_lock, flags);
+
+       percpu_ref_exit(ref);
+       kfree_rcu(objcg, rcu);
+}
+
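A minimal userspace model of the scenario in the comment above (hypothetical
numbers, plain C rather than kernel code) shows why the leftover charge is
always a whole number of pages:

/* Model only: the two partial flushes from the obj_cgroup_release()
 * scenario always sum to a whole number of pages. */
#include <assert.h>
#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
        unsigned long alloc = 92;               /* step 2: small allocation */
        unsigned long nr_charged_bytes = 0;

        nr_charged_bytes += PAGE_SIZE - alloc;  /* step 3: CPU1 stock flushed */
        nr_charged_bytes += alloc;              /* step 5: CPU0 stock flushed */

        assert((nr_charged_bytes & (PAGE_SIZE - 1)) == 0);
        printf("uncharge %lu page(s)\n", nr_charged_bytes >> PAGE_SHIFT);
        return 0;
}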
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+       struct obj_cgroup *objcg;
+       int ret;
+
+       objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+       if (!objcg)
+               return NULL;
+
+       ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+                             GFP_KERNEL);
+       if (ret) {
+               kfree(objcg);
+               return NULL;
+       }
+       INIT_LIST_HEAD(&objcg->list);
+       return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+                                 struct mem_cgroup *parent)
+{
+       struct obj_cgroup *objcg, *iter;
+
+       objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+       spin_lock_irq(&css_set_lock);
+
+       /* Move active objcg to the parent's list */
+       xchg(&objcg->memcg, parent);
+       css_get(&parent->css);
+       list_add(&objcg->list, &parent->objcg_list);
+
+       /* Move already reparented objcgs to the parent's list */
+       list_for_each_entry(iter, &memcg->objcg_list, list) {
+               css_get(&parent->css);
+               xchg(&iter->memcg, parent);
+               css_put(&memcg->css);
+       }
+       list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+       spin_unlock_irq(&css_set_lock);
+
+       percpu_ref_kill(&objcg->refcnt);
+}
+
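In short, reparenting re-points every objcg of the dying cgroup at the
parent, so later uncharges land on a live ancestor. A toy model with
hypothetical types (not the kernel structures):

#include <stdio.h>

struct group {
        const char *name;
};

struct objcg {
        struct group *memcg;    /* stands in for obj_cgroup->memcg */
};

int main(void)
{
        struct group parent = { "parent" };
        struct group child  = { "child" };
        struct objcg o = { &child };

        /* memcg_reparent_objcgs(): re-point the objcg at the parent */
        o.memcg = &parent;

        printf("future uncharges go to: %s\n", o.memcg->name);
        return 0;
}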
 /*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)
 
 /*
  * A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
  * conditional to this static branch, we'll have to allow modules that does
  * kmem_cache_alloc and the such to see this symbol as well
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
 #endif
 
 static int memcg_shrinker_map_size;
@@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)
        unsigned long ino = 0;
 
        rcu_read_lock();
-       if (PageSlab(page) && !PageTail(page))
-               memcg = memcg_from_slab_page(page);
-       else
-               memcg = READ_ONCE(page->mem_cgroup);
+       memcg = page->mem_cgroup;
+
+       /*
+        * The lowest bit set means that memcg isn't a valid
+        * memcg pointer, but an obj_cgroups pointer.
+        * In this case the page is shared and doesn't belong
+        * to any specific memory cgroup.
+        */
+       if ((unsigned long) memcg & 0x1UL)
+               memcg = NULL;
+
        while (memcg && !(memcg->css.flags & CSS_ONLINE))
                memcg = parent_mem_cgroup(memcg);
        if (memcg)
@@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  */
 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 {
-       long x;
+       long x, threshold = MEMCG_CHARGE_BATCH;
 
        if (mem_cgroup_disabled())
                return;
 
+       if (memcg_stat_item_in_bytes(idx))
+               threshold <<= PAGE_SHIFT;
+
        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
-       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+       if (unlikely(abs(x) > threshold)) {
                struct mem_cgroup *mi;
 
                /*
@@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
        return mem_cgroup_nodeinfo(parent, nid);
 }
 
-/**
- * __mod_lruvec_state - update lruvec memory statistics
- * @lruvec: the lruvec
- * @idx: the stat item
- * @val: delta to add to the counter, can be negative
- *
- * The lruvec is the intersection of the NUMA node and a cgroup. This
- * function updates the all three counters that are affected by a
- * change of state at this level: per-node, per-cgroup, per-lruvec.
- */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
-                       int val)
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                             int val)
 {
-       pg_data_t *pgdat = lruvec_pgdat(lruvec);
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup *memcg;
-       long x;
-
-       /* Update node */
-       __mod_node_page_state(pgdat, idx, val);
-
-       if (mem_cgroup_disabled())
-               return;
+       long x, threshold = MEMCG_CHARGE_BATCH;
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
@@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        /* Update lruvec */
        __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
 
+       if (vmstat_item_in_bytes(idx))
+               threshold <<= PAGE_SHIFT;
+
        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-       if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+       if (unlikely(abs(x) > threshold)) {
+               pg_data_t *pgdat = lruvec_pgdat(lruvec);
                struct mem_cgroup_per_node *pi;
 
                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
@@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
 }
 
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                       int val)
+{
+       /* Update node */
+       __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+       /* Update memcg and lruvec */
+       if (!mem_cgroup_disabled())
+               __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
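For orientation, here is the effect of the byte scaling above on the per-cpu
flush threshold, as a standalone C sketch (the MEMCG_CHARGE_BATCH value of 32
and the 4 KiB page size are assumptions for illustration):

/* Sketch only: byte-based items (e.g. NR_SLAB_RECLAIMABLE_B) batch up to
 * MEMCG_CHARGE_BATCH pages' worth of bytes before touching the atomic
 * counters; page-based items batch MEMCG_CHARGE_BATCH pages. */
#include <stdio.h>

#define PAGE_SHIFT         12   /* assumed 4 KiB pages */
#define MEMCG_CHARGE_BATCH 32   /* assumed batch size */

static long flush_threshold(int item_in_bytes)
{
        long threshold = MEMCG_CHARGE_BATCH;

        if (item_in_bytes)
                threshold <<= PAGE_SHIFT;
        return threshold;
}

int main(void)
{
        printf("page item: %ld, byte item: %ld\n",
               flush_threshold(0), flush_threshold(1));  /* 32 vs 131072 */
        return 0;
}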
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 {
        pg_data_t *pgdat = page_pgdat(virt_to_page(p));
@@ -1004,7 +1110,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
                                   struct mem_cgroup_reclaim_cookie *reclaim)
 {
-       struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+       struct mem_cgroup_reclaim_iter *iter;
        struct cgroup_subsys_state *css = NULL;
        struct mem_cgroup *memcg = NULL;
        struct mem_cgroup *pos = NULL;
@@ -1377,12 +1483,13 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                       (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
                       PAGE_SIZE);
        seq_buf_printf(&s, "kernel_stack %llu\n",
-                      (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+                      (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
                       1024);
        seq_buf_printf(&s, "slab %llu\n",
-                      (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
-                            memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
-                      PAGE_SIZE);
+                      (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+                            memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+       seq_buf_printf(&s, "percpu %llu\n",
+                      (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
        seq_buf_printf(&s, "sock %llu\n",
                       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
                       PAGE_SIZE);
@@ -1412,11 +1519,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                               PAGE_SIZE);
 
        seq_buf_printf(&s, "slab_reclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
-                      PAGE_SIZE);
+                      (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
        seq_buf_printf(&s, "slab_unreclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
-                      PAGE_SIZE);
+                      (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
 
        /* Accumulated memory events */
 
@@ -1425,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
                       memcg_events(memcg, PGMAJFAULT));
 
-       seq_buf_printf(&s, "workingset_refault %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_REFAULT));
-       seq_buf_printf(&s, "workingset_activate %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+       seq_buf_printf(&s, "workingset_refault_anon %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
+       seq_buf_printf(&s, "workingset_refault_file %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
+       seq_buf_printf(&s, "workingset_activate_anon %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
+       seq_buf_printf(&s, "workingset_activate_file %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
        seq_buf_printf(&s, "workingset_restore %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_RESTORE));
+                      memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
+       seq_buf_printf(&s, "workingset_restore %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
                       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
 
@@ -1560,15 +1671,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       bool ret;
+       bool ret = true;
 
        if (mutex_lock_killable(&oom_lock))
                return true;
+
+       if (mem_cgroup_margin(memcg) >= (1 << order))
+               goto unlock;
+
        /*
         * A few threads which were not waiting at mutex_lock_killable() can
         * fail to bail out. Therefore, check again after holding oom_lock.
         */
        ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
        mutex_unlock(&oom_lock);
        return ret;
 }
@@ -2039,6 +2156,12 @@ EXPORT_SYMBOL(unlock_page_memcg);
 struct memcg_stock_pcp {
        struct mem_cgroup *cached; /* this never be root cgroup */
        unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+       struct obj_cgroup *cached_objcg;
+       unsigned int nr_bytes;
+#endif
+
        struct work_struct work;
        unsigned long flags;
 #define FLUSHING_CACHED_CHARGE 0
@@ -2046,6 +2169,22 @@ struct memcg_stock_pcp {
 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
 static DEFINE_MUTEX(percpu_charge_mutex);
 
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg)
+{
+       return false;
+}
+#endif
+
 /**
  * consume_stock: Try to consume stocked charge on this cpu.
  * @memcg: memcg to consume from.
@@ -2086,13 +2225,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
 {
        struct mem_cgroup *old = stock->cached;
 
+       if (!old)
+               return;
+
        if (stock->nr_pages) {
                page_counter_uncharge(&old->memory, stock->nr_pages);
                if (do_memsw_account())
                        page_counter_uncharge(&old->memsw, stock->nr_pages);
-               css_put_many(&old->css, stock->nr_pages);
                stock->nr_pages = 0;
        }
+
+       css_put(&old->css);
        stock->cached = NULL;
 }
 
@@ -2108,6 +2251,7 @@ static void drain_local_stock(struct work_struct *dummy)
        local_irq_save(flags);
 
        stock = this_cpu_ptr(&memcg_stock);
+       drain_obj_stock(stock);
        drain_stock(stock);
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 
@@ -2128,6 +2272,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
        stock = this_cpu_ptr(&memcg_stock);
        if (stock->cached != memcg) { /* reset if necessary */
                drain_stock(stock);
+               css_get(&memcg->css);
                stock->cached = memcg;
        }
        stock->nr_pages += nr_pages;
@@ -2166,6 +2311,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                if (memcg && stock->nr_pages &&
                    mem_cgroup_is_descendant(memcg, root_memcg))
                        flush = true;
+               if (obj_stock_flush_required(stock, root_memcg))
+                       flush = true;
                rcu_read_unlock();
 
                if (flush &&
@@ -2228,18 +2375,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
        return 0;
 }
 
-static void reclaim_high(struct mem_cgroup *memcg,
-                        unsigned int nr_pages,
-                        gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+                                 unsigned int nr_pages,
+                                 gfp_t gfp_mask)
 {
+       unsigned long nr_reclaimed = 0;
+
        do {
+               unsigned long pflags;
+
                if (page_counter_read(&memcg->memory) <=
                    READ_ONCE(memcg->memory.high))
                        continue;
+
                memcg_memory_event(memcg, MEMCG_HIGH);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+               psi_memstall_enter(&pflags);
+               nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+                                                            gfp_mask, true);
+               psi_memstall_leave(&pflags);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
+
+       return nr_reclaimed;
 }
 
 static void high_work_func(struct work_struct *work)
@@ -2264,7 +2422,7 @@ static void high_work_func(struct work_struct *work)
  *
  * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
  *   overage ratio to a delay.
- * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
  *   proposed penalty in order to reduce to a reasonable number of jiffies, and
  *   to produce a reasonable delay curve.
  *
@@ -2395,16 +2553,32 @@ void mem_cgroup_handle_over_high(void)
 {
        unsigned long penalty_jiffies;
        unsigned long pflags;
+       unsigned long nr_reclaimed;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *memcg;
+       bool in_retry = false;
 
        if (likely(!nr_pages))
                return;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
        current->memcg_nr_pages_over_high = 0;
 
+retry_reclaim:
+       /*
+        * The allocating task should reclaim at least the batch size, but for
+        * subsequent retries we only want to do what's necessary to prevent oom
+        * or breaching resource isolation.
+        *
+        * This is distinct from memory.max or page allocator behaviour because
+        * memory.high is currently batched, whereas memory.max and the page
+        * allocator run every time an allocation is made.
+        */
+       nr_reclaimed = reclaim_high(memcg,
+                                   in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+                                   GFP_KERNEL);
+
        /*
         * memory.high is breached and reclaim is unable to keep up. Throttle
         * allocators proactively to slow down excessive growth.
@@ -2431,6 +2605,16 @@ void mem_cgroup_handle_over_high(void)
        if (penalty_jiffies <= HZ / 100)
                goto out;
 
+       /*
+        * If reclaim is making forward progress but we're still over
+        * memory.high, we want to encourage that rather than doing allocator
+        * throttling.
+        */
+       if (nr_reclaimed || nr_retries--) {
+               in_retry = true;
+               goto retry_reclaim;
+       }
+
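A compact model of the target sizing described in the comment above (the
SWAP_CLUSTER_MAX and MAX_RECLAIM_RETRIES values are illustrative assumptions,
not taken from this patch):

/* Sketch only: the first pass reclaims the full overage; retries fall
 * back to the minimum batch. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX    32   /* assumed minimum reclaim batch */
#define MAX_RECLAIM_RETRIES 5    /* assumed retry bound */

int main(void)
{
        unsigned int nr_pages = 1024;   /* pages over memory.high */
        int in_retry = 0;

        for (int pass = 0; pass <= MAX_RECLAIM_RETRIES; pass++) {
                unsigned int target = in_retry ? SWAP_CLUSTER_MAX : nr_pages;

                printf("pass %d: reclaim target %u page(s)\n", pass, target);
                in_retry = 1;
        }
        return 0;
}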
        /*
         * If we exit early, we're guaranteed to die (since
         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
@@ -2448,13 +2632,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
+       enum oom_status oom_status;
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       enum oom_status oom_status;
+       unsigned long pflags;
 
        if (mem_cgroup_is_root(memcg))
                return 0;
@@ -2514,8 +2699,10 @@ retry:
 
        memcg_memory_event(mem_over_limit, MEMCG_MAX);
 
+       psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);
+       psi_memstall_leave(&pflags);
 
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                goto retry;
@@ -2567,7 +2754,7 @@ retry:
                       get_order(nr_pages * PAGE_SIZE));
        switch (oom_status) {
        case OOM_SUCCESS:
-               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
        case OOM_FAILED:
                goto force;
@@ -2586,12 +2773,10 @@ force:
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);
-       css_get_many(&memcg->css, nr_pages);
 
        return 0;
 
 done_restock:
-       css_get_many(&memcg->css, batch);
        if (batch > nr_pages)
                refill_stock(memcg, batch - nr_pages);
 
@@ -2649,8 +2834,6 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
        page_counter_uncharge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_uncharge(&memcg->memsw, nr_pages);
-
-       css_put_many(&memcg->css, nr_pages);
 }
 #endif
 
@@ -2669,6 +2852,26 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg)
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+                                gfp_t gfp)
+{
+       unsigned int objects = objs_per_slab_page(s, page);
+       void *vec;
+
+       vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+                          page_to_nid(page));
+       if (!vec)
+               return -ENOMEM;
+
+       if (cmpxchg(&page->obj_cgroups, NULL,
+                   (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+               kfree(vec);
+       else
+               kmemleak_not_leak(vec);
+
+       return 0;
+}
+
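The 0x1UL bit set by the cmpxchg() above is the same tag that
page_cgroup_ino() tests earlier in this patch: page->mem_cgroup and
page->obj_cgroups share one word, and the allocator's alignment leaves bit 0
free. A self-contained userspace illustration of the trick (model only, not
the kernel structures):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

int main(void)
{
        /* calloc() results are at least pointer-aligned, so bit 0 is free */
        void *vec = calloc(16, sizeof(void *));
        uintptr_t word = (uintptr_t)vec | 0x1UL;        /* store tagged */

        assert(word & 0x1UL);                           /* "is it obj_cgroups?" */
        void *untagged = (void *)(word & ~(uintptr_t)1); /* recover the vector */
        assert(untagged == vec);

        free(untagged);
        return 0;
}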
 /*
  * Returns a pointer to the memory cgroup to which the kernel object is charged.
  *
@@ -2685,17 +2888,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
        page = virt_to_head_page(p);
 
        /*
-        * Slab pages don't have page->mem_cgroup set because corresponding
-        * kmem caches can be reparented during the lifetime. That's why
-        * memcg_from_slab_page() should be used instead.
+        * Slab objects are accounted individually, not per-page.
+        * Memcg membership data for each individual object is saved in
+        * page->obj_cgroups.
         */
-       if (PageSlab(page))
-               return memcg_from_slab_page(page);
+       if (page_has_obj_cgroups(page)) {
+               struct obj_cgroup *objcg;
+               unsigned int off;
+
+               off = obj_to_index(page->slab_cache, page, p);
+               objcg = page_obj_cgroups(page)[off];
+               if (objcg)
+                       return obj_cgroup_memcg(objcg);
+
+               return NULL;
+       }
 
        /* All other pages use page->mem_cgroup */
        return page->mem_cgroup;
 }
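obj_to_index() above maps an object pointer to its slot in the per-page
vector; conceptually it is just offset division (simplified userspace model,
assuming a fixed object size and ignoring slab layout details):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uintptr_t page_base = 0x10000;  /* hypothetical slab page start */
        size_t obj_size = 256;          /* hypothetical cache object size */
        uintptr_t p = page_base + 3 * obj_size + 17;  /* inside object 3 */

        size_t off = (size_t)(p - page_base) / obj_size;
        printf("object index: %zu\n", off);  /* 3 */
        return 0;
}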
 
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+       struct obj_cgroup *objcg = NULL;
+       struct mem_cgroup *memcg;
+
+       if (unlikely(!current->mm && !current->active_memcg))
+               return NULL;
+
+       rcu_read_lock();
+       if (unlikely(current->active_memcg))
+               memcg = rcu_dereference(current->active_memcg);
+       else
+               memcg = mem_cgroup_from_task(current);
+
+       for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+               objcg = rcu_dereference(memcg->objcg);
+               if (objcg && obj_cgroup_tryget(objcg))
+                       break;
+       }
+       rcu_read_unlock();
+
+       return objcg;
+}
+
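The loop above climbs from the current cgroup toward the root and takes the
first ancestor whose objcg reference can still be acquired; offline groups
have already had their objcg reparented. A toy model of the walk
(hypothetical types and names):

#include <stdio.h>
#include <stddef.h>

struct group {
        struct group *parent;
        int objcg_live;         /* stands in for obj_cgroup_tryget() */
        const char *name;
};

static struct group *find_objcg(struct group *g, struct group *root)
{
        for (; g != root; g = g->parent)
                if (g->objcg_live)
                        return g;
        return NULL;
}

int main(void)
{
        struct group root = { NULL,  0, "root" };
        struct group a    = { &root, 1, "a" };
        struct group b    = { &a,    0, "b" };  /* b's objcg was reparented */

        struct group *g = find_objcg(&b, &root);
        printf("charge goes to: %s\n", g ? g->name : "(none)");  /* "a" */
        return 0;
}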
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2721,9 +2957,7 @@ static int memcg_alloc_cache_id(void)
        else if (size > MEMCG_CACHES_MAX_SIZE)
                size = MEMCG_CACHES_MAX_SIZE;
 
-       err = memcg_update_all_caches(size);
-       if (!err)
-               err = memcg_update_all_list_lrus(size);
+       err = memcg_update_all_list_lrus(size);
        if (!err)
                memcg_nr_cache_ids = size;
 
@@ -2741,150 +2975,6 @@ static void memcg_free_cache_id(int id)
        ida_simple_remove(&memcg_cache_ida, id);
 }
 
-struct memcg_kmem_cache_create_work {
-       struct mem_cgroup *memcg;
-       struct kmem_cache *cachep;
-       struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
-       struct memcg_kmem_cache_create_work *cw =
-               container_of(w, struct memcg_kmem_cache_create_work, work);
-       struct mem_cgroup *memcg = cw->memcg;
-       struct kmem_cache *cachep = cw->cachep;
-
-       memcg_create_kmem_cache(memcg, cachep);
-
-       css_put(&memcg->css);
-       kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
-                                              struct kmem_cache *cachep)
-{
-       struct memcg_kmem_cache_create_work *cw;
-
-       if (!css_tryget_online(&memcg->css))
-               return;
-
-       cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
-       if (!cw) {
-               css_put(&memcg->css);
-               return;
-       }
-
-       cw->memcg = memcg;
-       cw->cachep = cachep;
-       INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
-       queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
-       if (in_interrupt())
-               return true;
-
-       /* Allow remote memcg charging in kthread contexts. */
-       if ((!current->mm || (current->flags & PF_KTHREAD)) &&
-            !current->active_memcg)
-               return true;
-       return false;
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
-       struct mem_cgroup *memcg;
-       struct kmem_cache *memcg_cachep;
-       struct memcg_cache_array *arr;
-       int kmemcg_id;
-
-       VM_BUG_ON(!is_root_cache(cachep));
-
-       if (memcg_kmem_bypass())
-               return cachep;
-
-       rcu_read_lock();
-
-       if (unlikely(current->active_memcg))
-               memcg = current->active_memcg;
-       else
-               memcg = mem_cgroup_from_task(current);
-
-       if (!memcg || memcg == root_mem_cgroup)
-               goto out_unlock;
-
-       kmemcg_id = READ_ONCE(memcg->kmemcg_id);
-       if (kmemcg_id < 0)
-               goto out_unlock;
-
-       arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
-       /*
-        * Make sure we will access the up-to-date value. The code updating
-        * memcg_caches issues a write barrier to match the data dependency
-        * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
-        */
-       memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
-       /*
-        * If we are in a safe context (can wait, and not in interrupt
-        * context), we could be be predictable and return right away.
-        * This would guarantee that the allocation being performed
-        * already belongs in the new cache.
-        *
-        * However, there are some clashes that can arrive from locking.
-        * For instance, because we acquire the slab_mutex while doing
-        * memcg_create_kmem_cache, this means no further allocation
-        * could happen with the slab_mutex held. So it's better to
-        * defer everything.
-        *
-        * If the memcg is dying or memcg_cache is about to be released,
-        * don't bother creating new kmem_caches. Because memcg_cachep
-        * is ZEROed as the fist step of kmem offlining, we don't need
-        * percpu_ref_tryget_live() here. css_tryget_online() check in
-        * memcg_schedule_kmem_cache_create() will prevent us from
-        * creation of a new kmem_cache.
-        */
-       if (unlikely(!memcg_cachep))
-               memcg_schedule_kmem_cache_create(memcg, cachep);
-       else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
-               cachep = memcg_cachep;
-out_unlock:
-       rcu_read_unlock();
-       return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
-       if (!is_root_cache(cachep))
-               percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
 /**
  * __memcg_kmem_charge: charge a number of kernel pages to a memcg
  * @memcg: memory cgroup to charge
@@ -2958,6 +3048,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
                if (!ret) {
                        page->mem_cgroup = memcg;
                        __SetPageKmemcg(page);
+                       return 0;
                }
        }
        css_put(&memcg->css);
@@ -2980,13 +3071,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
        __memcg_kmem_uncharge(memcg, nr_pages);
        page->mem_cgroup = NULL;
+       css_put(&memcg->css);
 
        /* slab pages do not have PageKmemcg flag set */
        if (PageKmemcg(page))
                __ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+       struct memcg_stock_pcp *stock;
+       unsigned long flags;
+       bool ret = false;
+
+       local_irq_save(flags);
 
-       css_put_many(&memcg->css, nr_pages);
+       stock = this_cpu_ptr(&memcg_stock);
+       if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+               stock->nr_bytes -= nr_bytes;
+               ret = true;
+       }
+
+       local_irq_restore(flags);
+
+       return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+       struct obj_cgroup *old = stock->cached_objcg;
+
+       if (!old)
+               return;
+
+       if (stock->nr_bytes) {
+               unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+               unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+               if (nr_pages) {
+                       rcu_read_lock();
+                       __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
+                       rcu_read_unlock();
+               }
+
+               /*
+                * The leftover is flushed to the centralized per-memcg value.
+                * On the next attempt to refill obj stock it will be moved
+                * to a per-cpu stock (probably, on an other CPU), see
+                * refill_obj_stock().
+                *
+                * How often it's flushed is a trade-off between the memory
+                * limit enforcement accuracy and potential CPU contention,
+                * so it might be changed in the future.
+                */
+               atomic_add(nr_bytes, &old->nr_charged_bytes);
+               stock->nr_bytes = 0;
+       }
+
+       obj_cgroup_put(old);
+       stock->cached_objcg = NULL;
 }
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+                                    struct mem_cgroup *root_memcg)
+{
+       struct mem_cgroup *memcg;
+
+       if (stock->cached_objcg) {
+               memcg = obj_cgroup_memcg(stock->cached_objcg);
+               if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+                       return true;
+       }
+
+       return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+       struct memcg_stock_pcp *stock;
+       unsigned long flags;
+
+       local_irq_save(flags);
+
+       stock = this_cpu_ptr(&memcg_stock);
+       if (stock->cached_objcg != objcg) { /* reset if necessary */
+               drain_obj_stock(stock);
+               obj_cgroup_get(objcg);
+               stock->cached_objcg = objcg;
+               stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+       }
+       stock->nr_bytes += nr_bytes;
+
+       if (stock->nr_bytes > PAGE_SIZE)
+               drain_obj_stock(stock);
+
+       local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+       struct mem_cgroup *memcg;
+       unsigned int nr_pages, nr_bytes;
+       int ret;
+
+       if (consume_obj_stock(objcg, size))
+               return 0;
+
+       /*
+        * In theory, objcg->nr_charged_bytes can have enough
+        * pre-charged bytes to satisfy the allocation. However,
+        * flushing objcg->nr_charged_bytes requires two atomic
+        * operations, and objcg->nr_charged_bytes can't be big,
+        * so it's better to ignore it and try to grab some new pages.
+        * objcg->nr_charged_bytes will be flushed in
+        * refill_obj_stock(), called from this function or
+        * independently later.
+        */
+       rcu_read_lock();
+       memcg = obj_cgroup_memcg(objcg);
+       css_get(&memcg->css);
+       rcu_read_unlock();
+
+       nr_pages = size >> PAGE_SHIFT;
+       nr_bytes = size & (PAGE_SIZE - 1);
+
+       if (nr_bytes)
+               nr_pages += 1;
+
+       ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+       if (!ret && nr_bytes)
+               refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+       css_put(&memcg->css);
+       return ret;
+}
+
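Worked example of the size split in obj_cgroup_charge() above, with
hypothetical numbers: a 700-byte charge rounds up to one page against the
page counter, and the PAGE_SIZE - 700 remainder goes back to the per-cpu byte
stock via refill_obj_stock():

#include <stdio.h>

#define PAGE_SIZE  4096UL
#define PAGE_SHIFT 12

int main(void)
{
        unsigned long size = 700;                      /* requested charge */
        unsigned long nr_pages = size >> PAGE_SHIFT;   /* 0 full pages */
        unsigned long nr_bytes = size & (PAGE_SIZE - 1);

        if (nr_bytes)                                  /* round up */
                nr_pages += 1;

        printf("charge %lu page(s), refill stock with %lu byte(s)\n",
               nr_pages, nr_bytes ? PAGE_SIZE - nr_bytes : 0);
        return 0;
}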
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+       refill_obj_stock(objcg, size);
+}
+
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -2997,13 +3221,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
  */
 void mem_cgroup_split_huge_fixup(struct page *head)
 {
+       struct mem_cgroup *memcg = head->mem_cgroup;
        int i;
 
        if (mem_cgroup_disabled())
                return;
 
-       for (i = 1; i < HPAGE_PMD_NR; i++)
-               head[i].mem_cgroup = head->mem_cgroup;
+       for (i = 1; i < HPAGE_PMD_NR; i++) {
+               css_get(&memcg->css);
+               head[i].mem_cgroup = memcg;
+       }
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
@@ -3207,7 +3434,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 {
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
 
        /* we call try-to-free pages for make this cgroup empty */
        lru_add_drain_all();
@@ -3404,6 +3631,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
+       struct obj_cgroup *objcg;
        int memcg_id;
 
        if (cgroup_memory_nokmem)
@@ -3416,7 +3644,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
        if (memcg_id < 0)
                return memcg_id;
 
-       static_branch_inc(&memcg_kmem_enabled_key);
+       objcg = obj_cgroup_alloc();
+       if (!objcg) {
+               memcg_free_cache_id(memcg_id);
+               return -ENOMEM;
+       }
+       objcg->memcg = memcg;
+       rcu_assign_pointer(memcg->objcg, objcg);
+
+       static_branch_enable(&memcg_kmem_enabled_key);
+
        /*
         * A memory cgroup is considered kmem-online as soon as it gets
         * kmemcg_id. Setting the id after enabling static branching will
@@ -3425,7 +3662,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
         */
        memcg->kmemcg_id = memcg_id;
        memcg->kmem_state = KMEM_ONLINE;
-       INIT_LIST_HEAD(&memcg->kmem_caches);
 
        return 0;
 }
@@ -3438,22 +3674,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 
        if (memcg->kmem_state != KMEM_ONLINE)
                return;
-       /*
-        * Clear the online state before clearing memcg_caches array
-        * entries. The slab_mutex in memcg_deactivate_kmem_caches()
-        * guarantees that no cache will be created for this cgroup
-        * after we are done (see memcg_create_kmem_cache()).
-        */
+
        memcg->kmem_state = KMEM_ALLOCATED;
 
        parent = parent_mem_cgroup(memcg);
        if (!parent)
                parent = root_mem_cgroup;
 
-       /*
-        * Deactivate and reparent kmem_caches.
-        */
-       memcg_deactivate_kmem_caches(memcg, parent);
+       memcg_reparent_objcgs(memcg, parent);
 
        kmemcg_id = memcg->kmemcg_id;
        BUG_ON(kmemcg_id < 0);
@@ -3486,11 +3714,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
        /* css_alloc() failed, offlining didn't happen */
        if (unlikely(memcg->kmem_state == KMEM_ONLINE))
                memcg_offline_kmem(memcg);
-
-       if (memcg->kmem_state == KMEM_ALLOCATED) {
-               WARN_ON(!list_empty(&memcg->kmem_caches));
-               static_branch_dec(&memcg_kmem_enabled_key);
-       }
 }
 #else
 static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -4800,9 +5023,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
        (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
        {
                .name = "kmem.slabinfo",
-               .seq_start = memcg_slab_start,
-               .seq_next = memcg_slab_next,
-               .seq_stop = memcg_slab_stop,
                .seq_show = memcg_slab_show,
        },
 #endif
@@ -4917,13 +5137,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+       pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+                                                GFP_KERNEL_ACCOUNT);
        if (!pn->lruvec_stat_local) {
                kfree(pn);
                return 1;
        }
 
-       pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+       pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+                                              GFP_KERNEL_ACCOUNT);
        if (!pn->lruvec_stat_cpu) {
                free_percpu(pn->lruvec_stat_local);
                kfree(pn);
@@ -4997,11 +5219,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                goto fail;
        }
 
-       memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+       memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+                                               GFP_KERNEL_ACCOUNT);
        if (!memcg->vmstats_local)
                goto fail;
 
-       memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+       memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+                                                GFP_KERNEL_ACCOUNT);
        if (!memcg->vmstats_percpu)
                goto fail;
 
@@ -5022,6 +5246,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        memcg->socket_pressure = jiffies;
 #ifdef CONFIG_MEMCG_KMEM
        memcg->kmemcg_id = -1;
+       INIT_LIST_HEAD(&memcg->objcg_list);
 #endif
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&memcg->cgwb_list);
@@ -5049,7 +5274,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        struct mem_cgroup *memcg;
        long error = -ENOMEM;
 
+       memalloc_use_memcg(parent);
        memcg = mem_cgroup_alloc();
+       memalloc_unuse_memcg();
        if (IS_ERR(memcg))
                return ERR_CAST(memcg);
 
@@ -5084,9 +5311,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 
        /* The following stuff does not apply to the root */
        if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
-               INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
                root_mem_cgroup = memcg;
                return &memcg->css;
        }
@@ -5365,7 +5589,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
-       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+       unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
        int ret;
 
        VM_BUG_ON(from == to);
@@ -5448,7 +5672,10 @@ static int mem_cgroup_move_account(struct page *page,
         */
        smp_mb();
 
-       page->mem_cgroup = to;  /* caller should have done css_get */
+       css_get(&to->css);
+       css_put(&from->css);
+
+       page->mem_cgroup = to;
 
        __unlock_page_memcg(from);
 
@@ -5669,8 +5896,6 @@ static void __mem_cgroup_clear_mc(void)
                if (!mem_cgroup_is_root(mc.to))
                        page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 
-               css_put_many(&mc.to->css, mc.moved_swap);
-
                mc.moved_swap = 0;
        }
        memcg_oom_recover(from);
@@ -6036,7 +6261,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long high;
        int err;
@@ -6046,8 +6271,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       page_counter_set_high(&memcg->memory, high);
-
        for (;;) {
                unsigned long nr_pages = page_counter_read(&memcg->memory);
                unsigned long reclaimed;
@@ -6071,6 +6294,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                        break;
        }
 
+       page_counter_set_high(&memcg->memory, high);
+
+       memcg_wb_domain_size_changed(memcg);
+
        return nbytes;
 }
 
@@ -6084,7 +6311,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long max;
        int err;
@@ -6391,40 +6618,42 @@ static unsigned long effective_protection(unsigned long usage,
  *
  * WARNING: This function is not stateless! It can only be used as part
  *          of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- *   MEMCG_PROT_NONE: cgroup memory is not protected
- *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
- *     an unprotected supply of reclaimable memory from other cgroups.
- *   MEMCG_PROT_MIN: cgroup memory is protected
  */
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
-                                               struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                    struct mem_cgroup *memcg)
 {
        unsigned long usage, parent_usage;
        struct mem_cgroup *parent;
 
        if (mem_cgroup_disabled())
-               return MEMCG_PROT_NONE;
+               return;
 
        if (!root)
                root = root_mem_cgroup;
+
+       /*
+        * Effective values of the reclaim targets are ignored so they
+        * can be stale. Have a look at mem_cgroup_protection for more
+        * details.
+        * TODO: calculation should be more robust so that we do not need
+        * that special casing.
+        */
        if (memcg == root)
-               return MEMCG_PROT_NONE;
+               return;
 
        usage = page_counter_read(&memcg->memory);
        if (!usage)
-               return MEMCG_PROT_NONE;
+               return;
 
        parent = parent_mem_cgroup(memcg);
        /* No parent means a non-hierarchical mode on v1 memcg */
        if (!parent)
-               return MEMCG_PROT_NONE;
+               return;
 
        if (parent == root) {
                memcg->memory.emin = READ_ONCE(memcg->memory.min);
                memcg->memory.elow = READ_ONCE(memcg->memory.low);
-               goto out;
+               return;
        }
 
        parent_usage = page_counter_read(&parent->memory);
@@ -6438,14 +6667,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
                        READ_ONCE(memcg->memory.low),
                        READ_ONCE(parent->memory.elow),
                        atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
-       if (usage <= memcg->memory.emin)
-               return MEMCG_PROT_MIN;
-       else if (usage <= memcg->memory.elow)
-               return MEMCG_PROT_LOW;
-       else
-               return MEMCG_PROT_NONE;
 }
 
 /**
@@ -6461,7 +6682,7 @@ out:
  */
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
-       unsigned int nr_pages = hpage_nr_pages(page);
+       unsigned int nr_pages = thp_nr_pages(page);
        struct mem_cgroup *memcg = NULL;
        int ret = 0;
 
@@ -6498,6 +6719,7 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
        if (ret)
                goto out_put;
 
+       css_get(&memcg->css);
        commit_charge(page, memcg);
 
        local_irq_disable();
@@ -6552,9 +6774,6 @@ static void uncharge_batch(const struct uncharge_gather *ug)
        __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
        memcg_check_events(ug->memcg, ug->dummy_page);
        local_irq_restore(flags);
-
-       if (!mem_cgroup_is_root(ug->memcg))
-               css_put_many(&ug->memcg->css, ug->nr_pages);
 }
 
 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
@@ -6592,6 +6811,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 
        ug->dummy_page = page;
        page->mem_cgroup = NULL;
+       css_put(&ug->memcg->css);
 }
 
 static void uncharge_list(struct list_head *page_list)
@@ -6692,13 +6912,13 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
                return;
 
        /* Force-charge the new page. The old one will be freed soon */
-       nr_pages = hpage_nr_pages(newpage);
+       nr_pages = thp_nr_pages(newpage);
 
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
                page_counter_charge(&memcg->memsw, nr_pages);
-       css_get_many(&memcg->css, nr_pages);
 
+       css_get(&memcg->css);
        commit_charge(newpage, memcg);
 
        local_irq_save(flags);
@@ -6821,17 +7041,6 @@ static int __init mem_cgroup_init(void)
 {
        int cpu, node;
 
-#ifdef CONFIG_MEMCG_KMEM
-       /*
-        * Kmem cache creation is mostly done with the slab_mutex held,
-        * so use a workqueue with limited concurrency to avoid stalling
-        * all worker threads in case lots of cgroups are created and
-        * destroyed simultaneously.
-        */
-       memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
-       BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
        cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
                                  memcg_hotplug_cpu_dead);
 
@@ -6905,7 +7114,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
-       nr_entries = hpage_nr_pages(page);
+       nr_entries = thp_nr_pages(page);
        /* Get references for the tail pages, too */
        if (nr_entries > 1)
                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
@@ -6935,8 +7144,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
        mem_cgroup_charge_statistics(memcg, page, -nr_entries);
        memcg_check_events(memcg, page);
 
-       if (!mem_cgroup_is_root(memcg))
-               css_put_many(&memcg->css, nr_entries);
+       css_put(&memcg->css);
 }
 
 /**
@@ -6950,7 +7158,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  */
 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-       unsigned int nr_pages = hpage_nr_pages(page);
+       unsigned int nr_pages = thp_nr_pages(page);
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        unsigned short oldid;