mm/memory_hotplug: enforce section granularity when onlining/offlining
[linux-2.6-microblaze.git] / mm / memcontrol.c
index 5c1983c..7f74a15 100644
@@ -197,14 +197,6 @@ static struct move_charge_struct {
 #define        MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 #define        MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 
-enum charge_type {
-       MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
-       MEM_CGROUP_CHARGE_TYPE_ANON,
-       MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
-       MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
-       NR_CHARGE_TYPE,
-};
-
 /* for encoding cft->private value on file */
 enum res_type {
        _MEM,
@@ -1102,9 +1094,9 @@ static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
- * Reclaimers can specify a node and a priority level in @reclaim to
- * divide up the memcgs in the hierarchy among all concurrent
- * reclaimers operating on the same node and priority.
+ * Reclaimers can specify a node in @reclaim to divide up the memcgs
+ * in the hierarchy among all concurrent reclaimers operating on the
+ * same node.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                   struct mem_cgroup *prev,
@@ -1456,6 +1448,70 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
        return false;
 }
 
+struct memory_stat {
+       const char *name;
+       unsigned int ratio;
+       unsigned int idx;
+};
+
+static struct memory_stat memory_stats[] = {
+       { "anon", PAGE_SIZE, NR_ANON_MAPPED },
+       { "file", PAGE_SIZE, NR_FILE_PAGES },
+       { "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+       { "percpu", 1, MEMCG_PERCPU_B },
+       { "sock", PAGE_SIZE, MEMCG_SOCK },
+       { "shmem", PAGE_SIZE, NR_SHMEM },
+       { "file_mapped", PAGE_SIZE, NR_FILE_MAPPED },
+       { "file_dirty", PAGE_SIZE, NR_FILE_DIRTY },
+       { "file_writeback", PAGE_SIZE, NR_WRITEBACK },
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /*
+        * The ratio will be initialized in memory_stats_init(), because
+        * HPAGE_PMD_SIZE is not a compile-time constant on some
+        * architectures (e.g. powerpc).
+        */
+       { "anon_thp", 0, NR_ANON_THPS },
+#endif
+       { "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
+       { "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
+       { "inactive_file", PAGE_SIZE, NR_INACTIVE_FILE },
+       { "active_file", PAGE_SIZE, NR_ACTIVE_FILE },
+       { "unevictable", PAGE_SIZE, NR_UNEVICTABLE },
+
+       /*
+        * Note: slab_reclaimable and slab_unreclaimable must be adjacent,
+        * with slab_reclaimable first, so that the combined "slab" line,
+        * printed when the loop reaches slab_unreclaimable, appears right
+        * after the two slab entries in memory.stat.
+        */
+       { "slab_reclaimable", 1, NR_SLAB_RECLAIMABLE_B },
+       { "slab_unreclaimable", 1, NR_SLAB_UNRECLAIMABLE_B },
+
+       /* The memory events */
+       { "workingset_refault_anon", 1, WORKINGSET_REFAULT_ANON },
+       { "workingset_refault_file", 1, WORKINGSET_REFAULT_FILE },
+       { "workingset_activate_anon", 1, WORKINGSET_ACTIVATE_ANON },
+       { "workingset_activate_file", 1, WORKINGSET_ACTIVATE_FILE },
+       { "workingset_restore_anon", 1, WORKINGSET_RESTORE_ANON },
+       { "workingset_restore_file", 1, WORKINGSET_RESTORE_FILE },
+       { "workingset_nodereclaim", 1, WORKINGSET_NODERECLAIM },
+};
+
+static int __init memory_stats_init(void)
+{
+       int i;
+
+       for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+               if (memory_stats[i].idx == NR_ANON_THPS)
+                       memory_stats[i].ratio = HPAGE_PMD_SIZE;
+#endif
+               VM_BUG_ON(!memory_stats[i].ratio);
+               VM_BUG_ON(memory_stats[i].idx >= MEMCG_NR_STAT);
+       }
+
+       return 0;
+}
+pure_initcall(memory_stats_init);
+
 static char *memory_stat_format(struct mem_cgroup *memcg)
 {
        struct seq_buf s;
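
The ratio field is what converts each raw counter into the bytes reported by memory.stat: most entries are maintained in pages, NR_KERNEL_STACK_KB is maintained in KiB, and the *_B counters (slab, percpu) are already byte counts, while the workingset entries are plain event counts and so keep a ratio of 1. As a worked example, assuming 4 KiB pages, an NR_ANON_MAPPED count of 3 is reported as 3 * 4096 = 12288, and an NR_KERNEL_STACK_KB count of 16 as 16 * 1024 = 16384.
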
@@ -1476,52 +1532,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
         * Current memory state:
         */
 
-       seq_buf_printf(&s, "anon %llu\n",
-                      (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
-                      PAGE_SIZE);
-       seq_buf_printf(&s, "file %llu\n",
-                      (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
-                      PAGE_SIZE);
-       seq_buf_printf(&s, "kernel_stack %llu\n",
-                      (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
-                      1024);
-       seq_buf_printf(&s, "slab %llu\n",
-                      (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
-                            memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
-       seq_buf_printf(&s, "percpu %llu\n",
-                      (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
-       seq_buf_printf(&s, "sock %llu\n",
-                      (u64)memcg_page_state(memcg, MEMCG_SOCK) *
-                      PAGE_SIZE);
-
-       seq_buf_printf(&s, "shmem %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SHMEM) *
-                      PAGE_SIZE);
-       seq_buf_printf(&s, "file_mapped %llu\n",
-                      (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
-                      PAGE_SIZE);
-       seq_buf_printf(&s, "file_dirty %llu\n",
-                      (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
-                      PAGE_SIZE);
-       seq_buf_printf(&s, "file_writeback %llu\n",
-                      (u64)memcg_page_state(memcg, NR_WRITEBACK) *
-                      PAGE_SIZE);
+       for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+               u64 size;
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       seq_buf_printf(&s, "anon_thp %llu\n",
-                      (u64)memcg_page_state(memcg, NR_ANON_THPS) *
-                      HPAGE_PMD_SIZE);
-#endif
-
-       for (i = 0; i < NR_LRU_LISTS; i++)
-               seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
-                              (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
-                              PAGE_SIZE);
+               size = memcg_page_state(memcg, memory_stats[i].idx);
+               size *= memory_stats[i].ratio;
+               seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
 
-       seq_buf_printf(&s, "slab_reclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
-       seq_buf_printf(&s, "slab_unreclaimable %llu\n",
-                      (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
+               if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
+                       size = memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+                              memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B);
+                       seq_buf_printf(&s, "slab %llu\n", size);
+               }
+       }
 
        /* Accumulated memory events */
 
@@ -1529,22 +1552,6 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                       memcg_events(memcg, PGFAULT));
        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
                       memcg_events(memcg, PGMAJFAULT));
-
-       seq_buf_printf(&s, "workingset_refault_anon %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
-       seq_buf_printf(&s, "workingset_refault_file %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
-       seq_buf_printf(&s, "workingset_activate_anon %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
-       seq_buf_printf(&s, "workingset_activate_file %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
-       seq_buf_printf(&s, "workingset_restore_anon %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
-       seq_buf_printf(&s, "workingset_restore_file %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
-       seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
-
        seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
                       memcg_events(memcg, PGREFILL));
        seq_buf_printf(&s, "pgscan %lu\n",
@@ -1641,17 +1648,19 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
  */
 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
 {
-       unsigned long max;
+       unsigned long max = READ_ONCE(memcg->memory.max);
 
-       max = READ_ONCE(memcg->memory.max);
-       if (mem_cgroup_swappiness(memcg)) {
-               unsigned long memsw_max;
-               unsigned long swap_max;
+       if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
+               if (mem_cgroup_swappiness(memcg))
+                       max += min(READ_ONCE(memcg->swap.max),
+                                  (unsigned long)total_swap_pages);
+       } else { /* v1 */
+               if (mem_cgroup_swappiness(memcg)) {
+                       /* Calculate swap excess capacity from memsw limit */
+                       unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
 
-               memsw_max = memcg->memsw.max;
-               swap_max = READ_ONCE(memcg->swap.max);
-               swap_max = min(swap_max, (unsigned long)total_swap_pages);
-               max = min(max + swap_max, memsw_max);
+                       max += min(swap, (unsigned long)total_swap_pages);
+               }
        }
        return max;
 }
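
Both branches compute "memory limit plus usable swap", but the two hierarchies express the swap limit differently: cgroup v2 carries an independent swap counter, whereas v1 only tracks memory+swap (memsw), so the swap headroom has to be derived by subtracting the memory limit from the memsw limit. As a hypothetical v1 example (page_counter values are kept in pages): with a 1 GiB memory limit and a 1.5 GiB memsw limit, the headroom is 0.5 GiB and the function returns 1 GiB + min(0.5 GiB, total_swap_pages); if mem_cgroup_swappiness() is 0, only the 1 GiB memory limit is returned.
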
@@ -1817,8 +1826,8 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
        struct mem_cgroup *iter;
 
        /*
-        * When a new child is created while the hierarchy is under oom,
-        * mem_cgroup_oom_lock() may not be called. Watch for underflow.
+        * Be careful about under_oom underflows because a child memcg
+        * could have been added after mem_cgroup_mark_under_oom().
         */
        spin_lock(&memcg_oom_lock);
        for_each_mem_cgroup_tree(iter, memcg)
@@ -2887,6 +2896,17 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
 
        page = virt_to_head_page(p);
 
+       /*
+        * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
+        * or a pointer to an obj_cgroup vector. In the latter case the lowest
+        * bit of the pointer is set.
+        * The page->mem_cgroup pointer can be asynchronously changed
+        * from NULL to (obj_cgroup_vec | 0x1UL), but it can't be changed
+        * from a valid memcg pointer to an objcg vector or back.
+        */
+       if (!page->mem_cgroup)
+               return NULL;
+
        /*
         * Slab objects are accounted individually, not per-page.
         * Memcg membership data for each individual object is saved in
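
The tagging works because an obj_cgroup vector is at least word-aligned, which leaves the lowest pointer bit free to act as a type flag. A minimal sketch of the decoding the comment above implies; the helper names here are illustrative, not necessarily the ones used elsewhere in this file:

/* Sketch only: decode the tagged page->mem_cgroup value described above. */
static inline bool page_memcg_is_objcg_vec(struct page *page)
{
        /* Low bit set means "pointer to an obj_cgroup vector". */
        return (unsigned long)page->mem_cgroup & 0x1UL;
}

static inline struct obj_cgroup **page_objcg_vec(struct page *page)
{
        /* Mask off the tag bit to recover the vector pointer. */
        return (struct obj_cgroup **)
                ((unsigned long)page->mem_cgroup & ~0x1UL);
}
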
@@ -4255,17 +4275,16 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
        new->size = size;
 
        /* Copy thresholds (if any) to new array */
-       if (thresholds->primary) {
-               memcpy(new->entries, thresholds->primary->entries, (size - 1) *
-                               sizeof(struct mem_cgroup_threshold));
-       }
+       if (thresholds->primary)
+               memcpy(new->entries, thresholds->primary->entries,
+                      flex_array_size(new, entries, size - 1));
 
        /* Add new threshold */
        new->entries[size - 1].eventfd = eventfd;
        new->entries[size - 1].threshold = threshold;
 
        /* Sort thresholds. Registering of new threshold isn't time-critical */
-       sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
+       sort(new->entries, size, sizeof(*new->entries),
                        compare_thresholds, NULL);
 
        /* Find current threshold */
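
flex_array_size() comes from <linux/overflow.h>: it computes count * sizeof(element) for a flexible array member and saturates to SIZE_MAX if the multiplication would overflow, so an overflowing request cannot silently wrap to a small copy. Setting that protection aside, the call above amounts to the open-coded arithmetic it replaces (sketch using the same variables as the surrounding code):

/*
 * Roughly what flex_array_size(new, entries, size - 1) evaluates
 * to in the non-overflowing case.
 */
size_t bytes = (size - 1) * sizeof(*new->entries);
memcpy(new->entries, thresholds->primary->entries, bytes);
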
@@ -5291,13 +5310,11 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
                memcg->use_hierarchy = true;
                page_counter_init(&memcg->memory, &parent->memory);
                page_counter_init(&memcg->swap, &parent->swap);
-               page_counter_init(&memcg->memsw, &parent->memsw);
                page_counter_init(&memcg->kmem, &parent->kmem);
                page_counter_init(&memcg->tcpmem, &parent->tcpmem);
        } else {
                page_counter_init(&memcg->memory, NULL);
                page_counter_init(&memcg->swap, NULL);
-               page_counter_init(&memcg->memsw, NULL);
                page_counter_init(&memcg->kmem, NULL);
                page_counter_init(&memcg->tcpmem, NULL);
                /*
@@ -5426,7 +5443,6 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
 
        page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
-       page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
        page_counter_set_min(&memcg->memory, 0);
@@ -5500,7 +5516,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
        struct page *page = NULL;
        swp_entry_t ent = pte_to_swp_entry(ptent);
 
-       if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+       if (!(mc.flags & MOVE_ANON))
                return NULL;
 
        /*
@@ -5519,6 +5535,9 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
                return page;
        }
 
+       if (non_swap_entry(ent))
+               return NULL;
+
        /*
         * Because lookup_swap_cache() updates some statistics counter,
         * we call find_get_page() with swapper_space directly.
@@ -5539,35 +5558,15 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
                        unsigned long addr, pte_t ptent, swp_entry_t *entry)
 {
-       struct page *page = NULL;
-       struct address_space *mapping;
-       pgoff_t pgoff;
-
        if (!vma->vm_file) /* anonymous vma */
                return NULL;
        if (!(mc.flags & MOVE_FILE))
                return NULL;
 
-       mapping = vma->vm_file->f_mapping;
-       pgoff = linear_page_index(vma, addr);
-
        /* page is moved even if it's not RSS of this task(page-faulted). */
-#ifdef CONFIG_SWAP
        /* shmem/tmpfs may report page out on swap: account for that too. */
-       if (shmem_mapping(mapping)) {
-               page = find_get_entry(mapping, pgoff);
-               if (xa_is_value(page)) {
-                       swp_entry_t swp = radix_to_swp_entry(page);
-                       *entry = swp;
-                       page = find_get_page(swap_address_space(swp),
-                                            swp_offset(swp));
-               }
-       } else
-               page = find_get_page(mapping, pgoff);
-#else
-       page = find_get_page(mapping, pgoff);
-#endif
-       return page;
+       return find_get_incore_page(vma->vm_file->f_mapping,
+                       linear_page_index(vma, addr));
 }
 
 /**
@@ -6393,6 +6392,35 @@ static int memory_stat_show(struct seq_file *m, void *v)
        return 0;
 }
 
+#ifdef CONFIG_NUMA
+static int memory_numa_stat_show(struct seq_file *m, void *v)
+{
+       int i;
+       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+       for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+               int nid;
+
+               if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
+                       continue;
+
+               seq_printf(m, "%s", memory_stats[i].name);
+               for_each_node_state(nid, N_MEMORY) {
+                       u64 size;
+                       struct lruvec *lruvec;
+
+                       lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
+                       size = lruvec_page_state(lruvec, memory_stats[i].idx);
+                       size *= memory_stats[i].ratio;
+                       seq_printf(m, " N%d=%llu", nid, size);
+               }
+               seq_putc(m, '\n');
+       }
+
+       return 0;
+}
+#endif
+
 static int memory_oom_group_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
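
memory_numa_stat_show() walks the same memory_stats[] table but only emits entries backed by per-node counters; the memcg-only items (percpu and sock, whose indices sit at or above NR_VM_NODE_STAT_ITEMS) are skipped. Each remaining entry becomes one line with a per-node value in bytes (raw counts for the workingset entries), so on a hypothetical two-node machine memory.numa_stat would read roughly:

anon N0=1142784 N1=16384
file N0=6905856 N1=0
kernel_stack N0=49152 N1=16384
...
workingset_nodereclaim N0=3 N1=0
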
@@ -6470,6 +6498,12 @@ static struct cftype memory_files[] = {
                .name = "stat",
                .seq_show = memory_stat_show,
        },
+#ifdef CONFIG_NUMA
+       {
+               .name = "numa_stat",
+               .seq_show = memory_numa_stat_show,
+       },
+#endif
        {
                .name = "oom.group",
                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,