diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 389b576..b762215 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -103,6 +103,14 @@ static bool do_memsw_account(void)
        return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 }
 
+/* memcg and lruvec stats flushing */
+static void flush_memcg_stats_dwork(struct work_struct *w);
+static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+static void flush_memcg_stats_work(struct work_struct *w);
+static DECLARE_WORK(stats_flush_work, flush_memcg_stats_work);
+static DEFINE_PER_CPU(unsigned int, stats_flush_threshold);
+static DEFINE_SPINLOCK(stats_flush_lock);
+
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 
@@ -248,9 +256,9 @@ struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
        return &memcg->vmpressure;
 }
 
-struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
+struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 {
-       return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
+       return container_of(vmpr, struct mem_cgroup, vmpressure);
 }
 
 #ifdef CONFIG_MEMCG_KMEM
@@ -645,17 +653,6 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
        cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 }
 
-/* idx can be of type enum memcg_stat_item or node_stat_item. */
-static unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
-{
-       long x = READ_ONCE(memcg->vmstats.state[idx]);
-#ifdef CONFIG_SMP
-       if (x < 0)
-               x = 0;
-#endif
-       return x;
-}
-
 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 {
@@ -671,23 +668,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
        return x;
 }
 
-static struct mem_cgroup_per_node *
-parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
-{
-       struct mem_cgroup *parent;
-
-       parent = parent_mem_cgroup(pn->memcg);
-       if (!parent)
-               return NULL;
-       return parent->nodeinfo[nid];
-}
-
 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
                              int val)
 {
        struct mem_cgroup_per_node *pn;
        struct mem_cgroup *memcg;
-       long x, threshold = MEMCG_CHARGE_BATCH;
 
        pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
        memcg = pn->memcg;
@@ -696,21 +681,9 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        __mod_memcg_state(memcg, idx, val);
 
        /* Update lruvec */
-       __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
-
-       if (vmstat_item_in_bytes(idx))
-               threshold <<= PAGE_SHIFT;
-
-       x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
-       if (unlikely(abs(x) > threshold)) {
-               pg_data_t *pgdat = lruvec_pgdat(lruvec);
-               struct mem_cgroup_per_node *pi;
-
-               for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
-                       atomic_long_add(x, &pi->lruvec_stat[idx]);
-               x = 0;
-       }
-       __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+       if (!(__this_cpu_inc_return(stats_flush_threshold) % MEMCG_CHARGE_BATCH))
+               queue_work(system_unbound_wq, &stats_flush_work);
 }
 
 /**
@@ -905,7 +878,7 @@ EXPORT_SYMBOL(mem_cgroup_from_task);
 
 static __always_inline struct mem_cgroup *active_memcg(void)
 {
-       if (in_interrupt())
+       if (!in_task())
                return this_cpu_read(int_active_memcg);
        else
                return current->active_memcg;
@@ -2205,8 +2178,9 @@ static void drain_local_stock(struct work_struct *dummy)
        unsigned long flags;
 
        /*
-        * The only protection from memory hotplug vs. drain_stock races is
-        * that we always operate on local CPU stock here with IRQ disabled
+        * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
+        * drain_stock races is that we always operate on local CPU stock
+        * here with IRQ disabled
         */
        local_irq_save(flags);
 
@@ -2273,7 +2247,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
                if (memcg && stock->nr_pages &&
                    mem_cgroup_is_descendant(memcg, root_memcg))
                        flush = true;
-               if (obj_stock_flush_required(stock, root_memcg))
+               else if (obj_stock_flush_required(stock, root_memcg))
                        flush = true;
                rcu_read_unlock();
 
@@ -2289,40 +2263,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        mutex_unlock(&percpu_charge_mutex);
 }
 
-static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
-{
-       int nid;
-
-       for_each_node(nid) {
-               struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
-               unsigned long stat[NR_VM_NODE_STAT_ITEMS];
-               struct batched_lruvec_stat *lstatc;
-               int i;
-
-               lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
-               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
-                       stat[i] = lstatc->count[i];
-                       lstatc->count[i] = 0;
-               }
-
-               do {
-                       for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
-                               atomic_long_add(stat[i], &pn->lruvec_stat[i]);
-               } while ((pn = parent_nodeinfo(pn, nid)));
-       }
-}
-
 static int memcg_hotplug_cpu_dead(unsigned int cpu)
 {
        struct memcg_stock_pcp *stock;
-       struct mem_cgroup *memcg;
 
        stock = &per_cpu(memcg_stock, cpu);
        drain_stock(stock);
 
-       for_each_mem_cgroup(memcg)
-               memcg_flush_lruvec_page_state(memcg, cpu);
-
        return 0;
 }
 
@@ -4116,7 +4063,7 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
 
-       if (val > 100)
+       if (val > 200)
                return -EINVAL;
 
        if (!mem_cgroup_is_root(memcg))
@@ -4668,7 +4615,7 @@ void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
                    atomic_read(&frn->done.cnt) == 1) {
                        frn->at = 0;
                        trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
-                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
                                               WB_REASON_FOREIGN_FLUSH,
                                               &frn->done);
                }
@@ -4892,9 +4839,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
 
        vfs_poll(efile.file, &event->pt);
 
-       spin_lock(&memcg->event_list_lock);
+       spin_lock_irq(&memcg->event_list_lock);
        list_add(&event->list, &memcg->event_list);
-       spin_unlock(&memcg->event_list_lock);
+       spin_unlock_irq(&memcg->event_list_lock);
 
        fdput(cfile);
        fdput(efile);
@@ -5129,17 +5076,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
-                                                GFP_KERNEL_ACCOUNT);
-       if (!pn->lruvec_stat_local) {
-               kfree(pn);
-               return 1;
-       }
-
-       pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
-                                              GFP_KERNEL_ACCOUNT);
-       if (!pn->lruvec_stat_cpu) {
-               free_percpu(pn->lruvec_stat_local);
+       pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
+                                                  GFP_KERNEL_ACCOUNT);
+       if (!pn->lruvec_stats_percpu) {
                kfree(pn);
                return 1;
        }
@@ -5160,8 +5099,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return;
 
-       free_percpu(pn->lruvec_stat_cpu);
-       free_percpu(pn->lruvec_stat_local);
+       free_percpu(pn->lruvec_stats_percpu);
        kfree(pn);
 }
 
@@ -5177,15 +5115,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
-       int cpu;
-
        memcg_wb_domain_exit(memcg);
-       /*
-        * Flush percpu lruvec stats to guarantee the value
-        * correctness on parent's and all ancestor levels.
-        */
-       for_each_online_cpu(cpu)
-               memcg_flush_lruvec_page_state(memcg, cpu);
        __mem_cgroup_free(memcg);
 }
 
@@ -5321,6 +5251,10 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);
+
+       if (unlikely(mem_cgroup_is_root(memcg)))
+               queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
+                                  2UL*HZ);
        return 0;
 }
 
@@ -5334,12 +5268,12 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
         * Notify userspace about cgroup removing only after rmdir of cgroup
         * directory to avoid race between userspace and kernelspace.
         */
-       spin_lock(&memcg->event_list_lock);
+       spin_lock_irq(&memcg->event_list_lock);
        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
                list_del_init(&event->list);
                schedule_work(&event->remove);
        }
-       spin_unlock(&memcg->event_list_lock);
+       spin_unlock_irq(&memcg->event_list_lock);
 
        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);
@@ -5412,13 +5346,33 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        memcg_wb_domain_size_changed(memcg);
 }
 
+void mem_cgroup_flush_stats(void)
+{
+       if (!spin_trylock(&stats_flush_lock))
+               return;
+
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       spin_unlock(&stats_flush_lock);
+}
+
+static void flush_memcg_stats_dwork(struct work_struct *w)
+{
+       mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+}
+
+static void flush_memcg_stats_work(struct work_struct *w)
+{
+       mem_cgroup_flush_stats();
+}
+
 static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
        struct memcg_vmstats_percpu *statc;
        long delta, v;
-       int i;
+       int i, nid;
 
        statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
 
@@ -5466,6 +5420,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
                if (parent)
                        parent->vmstats.events_pending[i] += delta;
        }
+
+       for_each_node_state(nid, N_MEMORY) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
+               struct mem_cgroup_per_node *ppn = NULL;
+               struct lruvec_stats_percpu *lstatc;
+
+               if (parent)
+                       ppn = parent->nodeinfo[nid];
+
+               lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
+
+               for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
+                       delta = pn->lruvec_stats.state_pending[i];
+                       if (delta)
+                               pn->lruvec_stats.state_pending[i] = 0;
+
+                       v = READ_ONCE(lstatc->state[i]);
+                       if (v != lstatc->state_prev[i]) {
+                               delta += v - lstatc->state_prev[i];
+                               lstatc->state_prev[i] = v;
+                       }
+
+                       if (!delta)
+                               continue;
+
+                       pn->lruvec_stats.state[i] += delta;
+                       if (ppn)
+                               ppn->lruvec_stats.state_pending[i] += delta;
+               }
+       }
 }
 
 #ifdef CONFIG_MMU
@@ -6399,6 +6383,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
        int i;
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+       cgroup_rstat_flush(memcg->css.cgroup);
+
        for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
                int nid;
 
@@ -6704,8 +6690,7 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                        atomic_long_read(&parent->memory.children_low_usage)));
 }
 
-static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
-                              gfp_t gfp)
+static int charge_memcg(struct page *page, struct mem_cgroup *memcg, gfp_t gfp)
 {
        unsigned int nr_pages = thp_nr_pages(page);
        int ret;
@@ -6726,7 +6711,7 @@ out:
 }
 
 /**
- * mem_cgroup_charge - charge a newly allocated page to a cgroup
+ * __mem_cgroup_charge - charge a newly allocated page to a cgroup
  * @page: page to charge
  * @mm: mm context of the victim
  * @gfp_mask: reclaim mode
@@ -6739,16 +6724,14 @@ out:
  *
  * Returns 0 on success. Otherwise, an error code is returned.
  */
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
+int __mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+                       gfp_t gfp_mask)
 {
        struct mem_cgroup *memcg;
        int ret;
 
-       if (mem_cgroup_disabled())
-               return 0;
-
        memcg = get_mem_cgroup_from_mm(mm);
-       ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+       ret = charge_memcg(page, memcg, gfp_mask);
        css_put(&memcg->css);
 
        return ret;
@@ -6783,7 +6766,7 @@ int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
                memcg = get_mem_cgroup_from_mm(mm);
        rcu_read_unlock();
 
-       ret = __mem_cgroup_charge(page, memcg, gfp);
+       ret = charge_memcg(page, memcg, gfp);
 
        css_put(&memcg->css);
        return ret;
@@ -6919,18 +6902,15 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
 }
 
 /**
- * mem_cgroup_uncharge - uncharge a page
+ * __mem_cgroup_uncharge - uncharge a page
  * @page: page to uncharge
  *
- * Uncharge a page previously charged with mem_cgroup_charge().
+ * Uncharge a page previously charged with __mem_cgroup_charge().
  */
-void mem_cgroup_uncharge(struct page *page)
+void __mem_cgroup_uncharge(struct page *page)
 {
        struct uncharge_gather ug;
 
-       if (mem_cgroup_disabled())
-               return;
-
        /* Don't touch page->lru of any random page, pre-check: */
        if (!page_memcg(page))
                return;
@@ -6941,20 +6921,17 @@ void mem_cgroup_uncharge(struct page *page)
 }
 
 /**
- * mem_cgroup_uncharge_list - uncharge a list of page
+ * __mem_cgroup_uncharge_list - uncharge a list of page
  * @page_list: list of pages to uncharge
  *
  * Uncharge a list of pages previously charged with
- * mem_cgroup_charge().
+ * __mem_cgroup_charge().
  */
-void mem_cgroup_uncharge_list(struct list_head *page_list)
+void __mem_cgroup_uncharge_list(struct list_head *page_list)
 {
        struct uncharge_gather ug;
        struct page *page;
 
-       if (mem_cgroup_disabled())
-               return;
-
        uncharge_gather_clear(&ug);
        list_for_each_entry(page, page_list, lru)
                uncharge_page(page, &ug);
@@ -7244,7 +7221,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
 }
 
 /**
- * mem_cgroup_try_charge_swap - try charging swap space for a page
+ * __mem_cgroup_try_charge_swap - try charging swap space for a page
  * @page: page being added to swap
  * @entry: swap entry to charge
  *
@@ -7252,16 +7229,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  *
  * Returns 0 on success, -ENOMEM on failure.
  */
-int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
+int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
        unsigned int nr_pages = thp_nr_pages(page);
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        unsigned short oldid;
 
-       if (mem_cgroup_disabled())
-               return 0;
-
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return 0;
 
@@ -7297,11 +7271,11 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 }
 
 /**
- * mem_cgroup_uncharge_swap - uncharge swap space
+ * __mem_cgroup_uncharge_swap - uncharge swap space
  * @entry: swap entry to uncharge
  * @nr_pages: the amount of swap space to uncharge
  */
-void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
+void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
 {
        struct mem_cgroup *memcg;
        unsigned short id;