memcg: free memcg_caches slot on css offline
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ee97c9a..abfe013 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 static struct mem_cgroup *root_mem_cgroup __read_mostly;
 
+/* Whether the swap controller is active */
 #ifdef CONFIG_MEMCG_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
 int do_swap_account __read_mostly;
-
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
 #else
 #define do_swap_account                0
 #endif
 
-
 static const char * const mem_cgroup_stat_names[] = {
        "cache",
        "rss",
@@ -341,8 +332,9 @@ struct mem_cgroup {
        struct cg_proto tcp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-        /* Index in the kmem_cache->memcg_params->memcg_caches array */
+        /* Index in the kmem_cache->memcg_params.memcg_caches array */
        int kmemcg_id;
+       bool kmem_acct_active;
 #endif
 
        int last_scanned_node;
@@ -361,29 +353,26 @@ struct mem_cgroup {
 };
 
 #ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
-       return memcg->kmemcg_id >= 0;
+       return memcg->kmem_acct_active;
 }
 #endif
 
 /* Stuffs for move charges at task migration. */
 /*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved.
  */
-enum move_type {
-       MOVE_CHARGE_TYPE_ANON,  /* private anonymous page and swap of it */
-       MOVE_CHARGE_TYPE_FILE,  /* file page(including tmpfs) and swap of it */
-       NR_MOVE_TYPE,
-};
+#define MOVE_ANON      0x1U
+#define MOVE_FILE      0x2U
+#define MOVE_MASK      (MOVE_ANON | MOVE_FILE)
 
 /* "mc" and its members are protected by cgroup_mutex */
 static struct move_charge_struct {
        spinlock_t        lock; /* for from, to */
        struct mem_cgroup *from;
        struct mem_cgroup *to;
-       unsigned long immigrate_flags;
+       unsigned long flags;
        unsigned long precharge;
        unsigned long moved_charge;
        unsigned long moved_swap;
@@ -394,16 +383,6 @@ static struct move_charge_struct {
        .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 
-static bool move_anon(void)
-{
-       return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
-}
-
-static bool move_file(void)
-{
-       return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
-}
-
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
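
The move_anon()/move_file() helpers are gone: callers now test mc.flags directly against the MOVE_ANON/MOVE_FILE bits, and the write handler later in this patch rejects undefined bits with val & ~MOVE_MASK. A minimal, runnable userspace sketch of the same bitmask idiom; set_move_flags() and the error code are invented for illustration, not kernel code:

#include <stdio.h>

/* Same flag layout as the patch; everything else here is illustrative. */
#define MOVE_ANON       0x1U
#define MOVE_FILE       0x2U
#define MOVE_MASK       (MOVE_ANON | MOVE_FILE)

/* Analog of mem_cgroup_move_charge_write(): reject undefined bits. */
static int set_move_flags(unsigned long val, unsigned long *flags)
{
        if (val & ~MOVE_MASK)
                return -1;      /* -EINVAL in the kernel */
        *flags = val;
        return 0;
}

int main(void)
{
        unsigned long flags = 0;

        if (set_move_flags(MOVE_ANON | MOVE_FILE, &flags) == 0)
                printf("anon: %d file: %d\n",
                       !!(flags & MOVE_ANON), !!(flags & MOVE_FILE));

        /* Bit 2 is undefined, so this mirrors the -EINVAL path. */
        if (set_move_flags(0x4UL, &flags) != 0)
                printf("0x4 rejected\n");
        return 0;
}
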
@@ -553,19 +532,31 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * The main reason for not using cgroup id for this:
  *  this works better in sparse environments, where we have a lot of memcgs,
  *  but only a few kmem-limited. Or also, if we have, for instance, 200
  *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
  *  200 entry array for that.
  *
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size.  It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
  */
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+       down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+       up_read(&memcg_cache_ids_sem);
+}
 
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
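
memcg_get_cache_ids()/memcg_put_cache_ids() give slab code a read-side lock on memcg_nr_cache_ids while it sizes per-memcg cache arrays; the allocation path further down takes the same rwsem for writing when it grows them. A rough userspace analog of that reader/writer contract, using a pthread rwlock and invented names (grow_cache_ids() stands in for the resize path); build with -pthread:

#include <pthread.h>
#include <stdio.h>

/* Userspace stand-ins for memcg_nr_cache_ids and memcg_cache_ids_sem. */
static int nr_cache_ids = 64;
static pthread_rwlock_t cache_ids_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Readers pin the current array size while they use it. */
static void get_cache_ids(void)
{
        pthread_rwlock_rdlock(&cache_ids_lock);
}

static void put_cache_ids(void)
{
        pthread_rwlock_unlock(&cache_ids_lock);
}

/* Resize path: exclusive access while the arrays grow and the size is published. */
static void grow_cache_ids(int new_size)
{
        pthread_rwlock_wrlock(&cache_ids_lock);
        if (new_size > nr_cache_ids)
                nr_cache_ids = new_size;        /* arrays would be reallocated here */
        pthread_rwlock_unlock(&cache_ids_lock);
}

int main(void)
{
        get_cache_ids();
        printf("sizing per-memcg arrays for %d ids\n", nr_cache_ids);
        put_cache_ids();

        grow_cache_ids(128);
        printf("after resize: %d ids\n", nr_cache_ids);
        return 0;
}
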
@@ -595,7 +586,7 @@ static void memcg_free_cache_id(int id);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-       if (memcg_kmem_is_active(memcg)) {
+       if (memcg->kmemcg_id >= 0) {
                static_key_slow_dec(&memcg_kmem_enabled_key);
                memcg_free_cache_id(memcg->kmemcg_id);
        }
@@ -1569,7 +1560,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
         * quickly exit and free its memory.
         */
        if (fatal_signal_pending(current) || task_will_free_mem(current)) {
-               set_thread_flag(TIF_MEMDIE);
+               mark_tsk_oom_victim(current);
                return;
        }
 
@@ -1943,7 +1934,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
        if (!memcg)
                return false;
 
-       if (!handle)
+       if (!handle || oom_killer_disabled)
                goto cleanup;
 
        owait.memcg = memcg;
@@ -2151,17 +2142,6 @@ static void drain_local_stock(struct work_struct *dummy)
        clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
 }
 
-static void __init memcg_stock_init(void)
-{
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct memcg_stock_pcp *stock =
-                                       &per_cpu(memcg_stock, cpu);
-               INIT_WORK(&stock->work, drain_local_stock);
-       }
-}
-
 /*
  * Cache charges(val) to local per_cpu area.
  * This will be consumed by consume_stock() function, later.
@@ -2571,18 +2551,19 @@ static int memcg_alloc_cache_id(void)
        int id, size;
        int err;
 
-       id = ida_simple_get(&kmem_limited_groups,
+       id = ida_simple_get(&memcg_cache_ida,
                            0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
        if (id < 0)
                return id;
 
-       if (id < memcg_limited_groups_array_size)
+       if (id < memcg_nr_cache_ids)
                return id;
 
        /*
         * There's no space for the new id in memcg_caches arrays,
         * so we have to grow them.
         */
+       down_write(&memcg_cache_ids_sem);
 
        size = 2 * (id + 1);
        if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2591,8 +2572,15 @@ static int memcg_alloc_cache_id(void)
                size = MEMCG_CACHES_MAX_SIZE;
 
        err = memcg_update_all_caches(size);
+       if (!err)
+               err = memcg_update_all_list_lrus(size);
+       if (!err)
+               memcg_nr_cache_ids = size;
+
+       up_write(&memcg_cache_ids_sem);
+
        if (err) {
-               ida_simple_remove(&kmem_limited_groups, id);
+               ida_simple_remove(&memcg_cache_ida, id);
                return err;
        }
        return id;
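
The resize path now publishes the larger memcg_nr_cache_ids under the write side of memcg_cache_ids_sem, and only after both memcg_update_all_caches() and memcg_update_all_list_lrus() succeeded. A simplified, runnable sketch of the doubling-and-clamping logic; the bounds and the update_all_arrays() helper are stand-ins, not the kernel's, and the locking from the previous sketch is omitted to keep the sizing visible:

#include <stdio.h>

#define CACHES_MIN_SIZE 4
#define CACHES_MAX_SIZE 65536   /* illustrative bounds, not MEMCG_CACHES_* */

static int nr_cache_ids = CACHES_MIN_SIZE;

/* Stand-in for memcg_update_all_caches() + memcg_update_all_list_lrus(). */
static int update_all_arrays(int size)
{
        printf("growing per-memcg arrays to %d entries\n", size);
        return 0;       /* 0 on success, negative errno on failure */
}

/* Mirrors the sizing logic: double past the new id, clamp to the bounds. */
static int grow_for_id(int id)
{
        int size, err;

        if (id < nr_cache_ids)
                return 0;       /* current arrays are already big enough */

        size = 2 * (id + 1);
        if (size < CACHES_MIN_SIZE)
                size = CACHES_MIN_SIZE;
        else if (size > CACHES_MAX_SIZE)
                size = CACHES_MAX_SIZE;

        err = update_all_arrays(size);
        if (!err)
                nr_cache_ids = size;    /* publish only if every update succeeded */
        return err;
}

int main(void)
{
        grow_for_id(3);         /* already fits, nothing to do */
        grow_for_id(10);        /* doubles to 22 */
        printf("nr_cache_ids = %d\n", nr_cache_ids);
        return 0;
}
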
@@ -2600,17 +2588,7 @@ static int memcg_alloc_cache_id(void)
 
 static void memcg_free_cache_id(int id)
 {
-       ida_simple_remove(&kmem_limited_groups, id);
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
-       memcg_limited_groups_array_size = num;
+       ida_simple_remove(&memcg_cache_ida, id);
 }
 
 struct memcg_kmem_cache_create_work {
@@ -2689,18 +2667,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
 {
        struct mem_cgroup *memcg;
        struct kmem_cache *memcg_cachep;
+       int kmemcg_id;
 
-       VM_BUG_ON(!cachep->memcg_params);
-       VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+       VM_BUG_ON(!is_root_cache(cachep));
 
        if (current->memcg_kmem_skip_account)
                return cachep;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       if (!memcg_kmem_is_active(memcg))
+       kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+       if (kmemcg_id < 0)
                goto out;
 
-       memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+       memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
        if (likely(memcg_cachep))
                return memcg_cachep;
 
@@ -2725,7 +2704,7 @@ out:
 void __memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
        if (!is_root_cache(cachep))
-               css_put(&cachep->memcg_params->memcg->css);
+               css_put(&cachep->memcg_params.memcg->css);
 }
 
 /*
@@ -2790,6 +2769,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
        memcg_uncharge_kmem(memcg, 1 << order);
        page->mem_cgroup = NULL;
 }
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+       struct mem_cgroup *memcg = NULL;
+       struct kmem_cache *cachep;
+       struct page *page;
+
+       page = virt_to_head_page(ptr);
+       if (PageSlab(page)) {
+               cachep = page->slab_cache;
+               if (!is_root_cache(cachep))
+                       memcg = cachep->memcg_params.memcg;
+       } else
+               /* page allocated by alloc_kmem_pages */
+               memcg = page->mem_cgroup;
+
+       return memcg;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
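
__mem_cgroup_from_kmem() resolves the owning cgroup of an arbitrary kernel pointer by looking at its head page: slab objects inherit the memcg of their kmem_cache, while pages from alloc_kmem_pages carry the memcg on the page itself. A toy userspace analog of that two-way dispatch; every type and name below is invented for illustration:

#include <stdio.h>
#include <stdbool.h>

/* Toy descriptors standing in for struct kmem_cache and struct page. */
struct toy_cache {
        const char *memcg;              /* owner recorded at cache creation */
};

struct toy_page {
        bool is_slab;                   /* analog of PageSlab() */
        struct toy_cache *slab_cache;   /* valid when is_slab */
        const char *mem_cgroup;         /* valid for direct page allocations */
};

/* Slab objects inherit their cache's owner; plain pages carry it themselves. */
static const char *owner_of(const struct toy_page *page)
{
        if (page->is_slab)
                return page->slab_cache ? page->slab_cache->memcg : NULL;
        return page->mem_cgroup;
}

int main(void)
{
        struct toy_cache cache = { .memcg = "memcg-A" };
        struct toy_page slab_page = { .is_slab = true, .slab_cache = &cache };
        struct toy_page big_page = { .is_slab = false, .mem_cgroup = "memcg-B" };

        printf("slab object -> %s\n", owner_of(&slab_page));
        printf("page object -> %s\n", owner_of(&big_page));
        return 0;
}
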
@@ -3324,8 +3321,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
        int err = 0;
        int memcg_id;
 
-       if (memcg_kmem_is_active(memcg))
-               return 0;
+       BUG_ON(memcg->kmemcg_id >= 0);
+       BUG_ON(memcg->kmem_acct_active);
 
        /*
         * For simplicity, we won't allow this to be disabled.  It also can't
@@ -3368,6 +3365,7 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
         * patched.
         */
        memcg->kmemcg_id = memcg_id;
+       memcg->kmem_acct_active = true;
 out:
        return err;
 }
@@ -3500,7 +3498,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       if (val >= (1 << NR_MOVE_TYPE))
+       if (val & ~MOVE_MASK)
                return -EINVAL;
 
        /*
@@ -4047,6 +4045,22 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
        return mem_cgroup_sockets_init(memcg, ss);
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+       if (!memcg->kmem_acct_active)
+               return;
+
+       /*
+        * Clear the 'active' flag before clearing memcg_caches array entries.
+        * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+        * guarantees no cache will be created for this cgroup after we are
+        * done (see memcg_create_kmem_cache()).
+        */
+       memcg->kmem_acct_active = false;
+
+       memcg_deactivate_kmem_caches(memcg);
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
        memcg_destroy_kmem_caches(memcg);
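
memcg_deactivate_kmem() is the offline step the commit title refers to: the kmem_acct_active flag is cleared first, which prevents new per-memcg caches from being created, and only then are the existing memcg_caches entries torn down; the kmemcg_id itself is still released later, in disarm_kmem_keys(). A condensed userspace model of that activate/deactivate/destroy life cycle (the struct and helpers are invented):

#include <stdbool.h>
#include <stdio.h>

/* Minimal model of the kmem accounting state carried by a memcg. */
struct kmem_state {
        int kmemcg_id;                  /* slot in the memcg_caches arrays */
        bool kmem_acct_active;          /* new allocations are accounted */
};

static void activate_kmem(struct kmem_state *s, int id)
{
        s->kmemcg_id = id;
        s->kmem_acct_active = true;
}

/* Offline: stop accounting and tear down cache entries, keep the id slot. */
static void deactivate_kmem(struct kmem_state *s)
{
        if (!s->kmem_acct_active)
                return;
        s->kmem_acct_active = false;
        printf("deactivating caches for id %d\n", s->kmemcg_id);
}

/* Final teardown: only here is the id itself given back. */
static void destroy_kmem(struct kmem_state *s)
{
        if (s->kmemcg_id >= 0) {
                printf("freeing cache id %d\n", s->kmemcg_id);
                s->kmemcg_id = -1;
        }
}

int main(void)
{
        struct kmem_state s = { .kmemcg_id = -1 };

        activate_kmem(&s, 3);
        deactivate_kmem(&s);    /* css offline */
        destroy_kmem(&s);       /* css free */
        return 0;
}
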
@@ -4058,6 +4072,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
        return 0;
 }
 
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
 }
@@ -4397,34 +4415,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
        { },    /* terminate */
 };
 
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
-       {
-               .name = "memsw.usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.max_usage_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
-               .write = mem_cgroup_reset,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.limit_in_bytes",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
-               .write = mem_cgroup_write,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       {
-               .name = "memsw.failcnt",
-               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
-               .write = mem_cgroup_reset,
-               .read_u64 = mem_cgroup_read_u64,
-       },
-       { },    /* terminate */
-};
-#endif
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -4520,29 +4510,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(parent_mem_cgroup);
 
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
-       struct mem_cgroup_tree_per_node *rtpn;
-       struct mem_cgroup_tree_per_zone *rtpz;
-       int tmp, node, zone;
-
-       for_each_node(node) {
-               tmp = node;
-               if (!node_state(node, N_NORMAL_MEMORY))
-                       tmp = -1;
-               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
-               BUG_ON(!rtpn);
-
-               soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
-               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
-                       rtpz = &rtpn->rb_tree_per_zone[zone];
-                       rtpz->rb_root = RB_ROOT;
-                       spin_lock_init(&rtpz->lock);
-               }
-       }
-}
-
 static struct cgroup_subsys_state * __ref
 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
 {
@@ -4665,6 +4632,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
        spin_unlock(&memcg->event_list_lock);
 
        vmpressure_cleanup(&memcg->vmpressure);
+
+       memcg_deactivate_kmem(memcg);
 }
 
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4773,12 +4742,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
        if (!page || !page_mapped(page))
                return NULL;
        if (PageAnon(page)) {
-               /* we don't move shared anon */
-               if (!move_anon())
+               if (!(mc.flags & MOVE_ANON))
                        return NULL;
-       } else if (!move_file())
-               /* we ignore mapcount for file pages */
-               return NULL;
+       } else {
+               if (!(mc.flags & MOVE_FILE))
+                       return NULL;
+       }
        if (!get_page_unless_zero(page))
                return NULL;
 
@@ -4792,7 +4761,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
        struct page *page = NULL;
        swp_entry_t ent = pte_to_swp_entry(ptent);
 
-       if (!move_anon() || non_swap_entry(ent))
+       if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
                return NULL;
        /*
         * Because lookup_swap_cache() updates some statistics counter,
@@ -4821,7 +4790,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 
        if (!vma->vm_file) /* anonymous vma */
                return NULL;
-       if (!move_file())
+       if (!(mc.flags & MOVE_FILE))
                return NULL;
 
        mapping = vma->vm_file->f_mapping;
@@ -4900,7 +4869,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
 
        page = pmd_page(pmd);
        VM_BUG_ON_PAGE(!page || !PageHead(page), page);
-       if (!move_anon())
+       if (!(mc.flags & MOVE_ANON))
                return ret;
        if (page->mem_cgroup == mc.from) {
                ret = MC_TARGET_PAGE;
@@ -4923,7 +4892,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
 {
-       struct vm_area_struct *vma = walk->private;
+       struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;
 
@@ -4949,20 +4918,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
        unsigned long precharge;
-       struct vm_area_struct *vma;
 
+       struct mm_walk mem_cgroup_count_precharge_walk = {
+               .pmd_entry = mem_cgroup_count_precharge_pte_range,
+               .mm = mm,
+       };
        down_read(&mm->mmap_sem);
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               struct mm_walk mem_cgroup_count_precharge_walk = {
-                       .pmd_entry = mem_cgroup_count_precharge_pte_range,
-                       .mm = mm,
-                       .private = vma,
-               };
-               if (is_vm_hugetlb_page(vma))
-                       continue;
-               walk_page_range(vma->vm_start, vma->vm_end,
-                                       &mem_cgroup_count_precharge_walk);
-       }
+       walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
        up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
@@ -5042,15 +5004,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
        struct task_struct *p = cgroup_taskset_first(tset);
        int ret = 0;
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
-       unsigned long move_charge_at_immigrate;
+       unsigned long move_flags;
 
        /*
         * We are now commited to this value whatever it is. Changes in this
         * tunable will only affect upcoming migrations, not the current one.
         * So we need to save it, and keep it going.
         */
-       move_charge_at_immigrate  = memcg->move_charge_at_immigrate;
-       if (move_charge_at_immigrate) {
+       move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+       if (move_flags) {
                struct mm_struct *mm;
                struct mem_cgroup *from = mem_cgroup_from_task(p);
 
@@ -5070,7 +5032,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
                        spin_lock(&mc.lock);
                        mc.from = from;
                        mc.to = memcg;
-                       mc.immigrate_flags = move_charge_at_immigrate;
+                       mc.flags = move_flags;
                        spin_unlock(&mc.lock);
                        /* We set mc.moving_task later */
 
@@ -5095,7 +5057,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                struct mm_walk *walk)
 {
        int ret = 0;
-       struct vm_area_struct *vma = walk->private;
+       struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;
        enum mc_target_type target_type;
@@ -5191,7 +5153,10 @@ put:                     /* get_mctgt_type() gets the page */
 
 static void mem_cgroup_move_charge(struct mm_struct *mm)
 {
-       struct vm_area_struct *vma;
+       struct mm_walk mem_cgroup_move_charge_walk = {
+               .pmd_entry = mem_cgroup_move_charge_pte_range,
+               .mm = mm,
+       };
 
        lru_add_drain_all();
        /*
@@ -5214,24 +5179,11 @@ retry:
                cond_resched();
                goto retry;
        }
-       for (vma = mm->mmap; vma; vma = vma->vm_next) {
-               int ret;
-               struct mm_walk mem_cgroup_move_charge_walk = {
-                       .pmd_entry = mem_cgroup_move_charge_pte_range,
-                       .mm = mm,
-                       .private = vma,
-               };
-               if (is_vm_hugetlb_page(vma))
-                       continue;
-               ret = walk_page_range(vma->vm_start, vma->vm_end,
-                                               &mem_cgroup_move_charge_walk);
-               if (ret)
-                       /*
-                        * means we have consumed all precharges and failed in
-                        * doing additional charge. Just abandon here.
-                        */
-                       break;
-       }
+       /*
+        * Once we have consumed all precharges and failed to perform an
+        * additional charge, the page walk simply aborts.
+        */
+       walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
        up_read(&mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
 }
@@ -5438,37 +5390,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
        .early_init = 0,
 };
 
-#ifdef CONFIG_MEMCG_SWAP
-static int __init enable_swap_account(char *s)
-{
-       if (!strcmp(s, "1"))
-               really_do_swap_account = 1;
-       else if (!strcmp(s, "0"))
-               really_do_swap_account = 0;
-       return 1;
-}
-__setup("swapaccount=", enable_swap_account);
-
-static void __init memsw_file_init(void)
-{
-       WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
-                                         memsw_cgroup_files));
-}
-
-static void __init enable_swap_cgroup(void)
-{
-       if (!mem_cgroup_disabled() && really_do_swap_account) {
-               do_swap_account = 1;
-               memsw_file_init();
-       }
-}
-
-#else
-static void __init enable_swap_cgroup(void)
-{
-}
-#endif
-
 /**
  * mem_cgroup_events - count memory events against a cgroup
  * @memcg: the memory cgroup
@@ -5519,74 +5440,6 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
        return true;
 }
 
-#ifdef CONFIG_MEMCG_SWAP
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @page to @entry.
- */
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
-{
-       struct mem_cgroup *memcg;
-       unsigned short oldid;
-
-       VM_BUG_ON_PAGE(PageLRU(page), page);
-       VM_BUG_ON_PAGE(page_count(page), page);
-
-       if (!do_swap_account)
-               return;
-
-       memcg = page->mem_cgroup;
-
-       /* Readahead page, never charged */
-       if (!memcg)
-               return;
-
-       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
-       VM_BUG_ON_PAGE(oldid, page);
-       mem_cgroup_swap_statistics(memcg, true);
-
-       page->mem_cgroup = NULL;
-
-       if (!mem_cgroup_is_root(memcg))
-               page_counter_uncharge(&memcg->memory, 1);
-
-       /* XXX: caller holds IRQ-safe mapping->tree_lock */
-       VM_BUG_ON(!irqs_disabled());
-
-       mem_cgroup_charge_statistics(memcg, page, -1);
-       memcg_check_events(memcg, page);
-}
-
-/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
- * @entry: swap entry to uncharge
- *
- * Drop the memsw charge associated with @entry.
- */
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
-{
-       struct mem_cgroup *memcg;
-       unsigned short id;
-
-       if (!do_swap_account)
-               return;
-
-       id = swap_cgroup_record(entry, 0);
-       rcu_read_lock();
-       memcg = mem_cgroup_lookup(id);
-       if (memcg) {
-               if (!mem_cgroup_is_root(memcg))
-                       page_counter_uncharge(&memcg->memsw, 1);
-               mem_cgroup_swap_statistics(memcg, false);
-               css_put(&memcg->css);
-       }
-       rcu_read_unlock();
-}
-#endif
-
 /**
  * mem_cgroup_try_charge - try charging a page
  * @page: page to charge
@@ -5919,10 +5772,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
  */
 static int __init mem_cgroup_init(void)
 {
+       int cpu, node;
+
        hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
-       enable_swap_cgroup();
-       mem_cgroup_soft_limit_tree_init();
-       memcg_stock_init();
+
+       for_each_possible_cpu(cpu)
+               INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+                         drain_local_stock);
+
+       for_each_node(node) {
+               struct mem_cgroup_tree_per_node *rtpn;
+               int zone;
+
+               rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+                                   node_online(node) ? node : NUMA_NO_NODE);
+
+               for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                       struct mem_cgroup_tree_per_zone *rtpz;
+
+                       rtpz = &rtpn->rb_tree_per_zone[zone];
+                       rtpz->rb_root = RB_ROOT;
+                       spin_lock_init(&rtpz->lock);
+               }
+               soft_limit_tree.rb_tree_per_node[node] = rtpn;
+       }
+
        return 0;
 }
 subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+       struct mem_cgroup *memcg;
+       unsigned short oldid;
+
+       VM_BUG_ON_PAGE(PageLRU(page), page);
+       VM_BUG_ON_PAGE(page_count(page), page);
+
+       if (!do_swap_account)
+               return;
+
+       memcg = page->mem_cgroup;
+
+       /* Readahead page, never charged */
+       if (!memcg)
+               return;
+
+       oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+       VM_BUG_ON_PAGE(oldid, page);
+       mem_cgroup_swap_statistics(memcg, true);
+
+       page->mem_cgroup = NULL;
+
+       if (!mem_cgroup_is_root(memcg))
+               page_counter_uncharge(&memcg->memory, 1);
+
+       /* XXX: caller holds IRQ-safe mapping->tree_lock */
+       VM_BUG_ON(!irqs_disabled());
+
+       mem_cgroup_charge_statistics(memcg, page, -1);
+       memcg_check_events(memcg, page);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+       struct mem_cgroup *memcg;
+       unsigned short id;
+
+       if (!do_swap_account)
+               return;
+
+       id = swap_cgroup_record(entry, 0);
+       rcu_read_lock();
+       memcg = mem_cgroup_lookup(id);
+       if (memcg) {
+               if (!mem_cgroup_is_root(memcg))
+                       page_counter_uncharge(&memcg->memsw, 1);
+               mem_cgroup_swap_statistics(memcg, false);
+               css_put(&memcg->css);
+       }
+       rcu_read_unlock();
+}
+
+/* For remembering the boot option */
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+       if (!strcmp(s, "1"))
+               really_do_swap_account = 1;
+       else if (!strcmp(s, "0"))
+               really_do_swap_account = 0;
+       return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+       {
+               .name = "memsw.usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.max_usage_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+               .write = mem_cgroup_reset,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+               .write = mem_cgroup_write,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       {
+               .name = "memsw.failcnt",
+               .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+               .write = mem_cgroup_reset,
+               .read_u64 = mem_cgroup_read_u64,
+       },
+       { },    /* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+       if (!mem_cgroup_disabled() && really_do_swap_account) {
+               do_swap_account = 1;
+               WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+                                                 memsw_cgroup_files));
+       }
+       return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */
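
With the CONFIG_MEMCG_SWAP block consolidated at the end of the file, registration of the legacy memsw.* files happens in its own initcall, gated by mem_cgroup_disabled() and the swapaccount= boot parameter. A small userspace sketch of that parse-then-gate pattern; command-line parsing replaces the __setup handler and no cgroup files are actually registered:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Compile-time default, analog of CONFIG_MEMCG_SWAP_ENABLED. */
static bool really_do_swap_account = true;
static bool do_swap_account;

/* Analog of the swapaccount= __setup handler: only "0" and "1" matter. */
static void parse_swapaccount(const char *s)
{
        if (!strcmp(s, "1"))
                really_do_swap_account = true;
        else if (!strcmp(s, "0"))
                really_do_swap_account = false;
}

int main(int argc, char **argv)
{
        bool cgroup_disabled = false;   /* stand-in for mem_cgroup_disabled() */

        if (argc > 1)
                parse_swapaccount(argv[1]);

        /* Analog of mem_cgroup_swap_init(): enable accounting, add the files. */
        if (!cgroup_disabled && really_do_swap_account) {
                do_swap_account = true;
                printf("memsw.* files would be registered\n");
        }
        printf("do_swap_account = %d\n", do_swap_account);
        return 0;
}
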