diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3b97f1..f973a02 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1314,7 +1314,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
        if (do_memsw_account()) {
                count = page_counter_read(&memcg->memsw);
                limit = READ_ONCE(memcg->memsw.max);
-               if (count <= limit)
+               if (count < limit)
                        margin = min(margin, limit - count);
                else
                        margin = 0;
@@ -1451,6 +1451,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
                       memcg_page_state(memcg, WORKINGSET_REFAULT));
        seq_buf_printf(&s, "workingset_activate %lu\n",
                       memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+       seq_buf_printf(&s, "workingset_restore %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_RESTORE));
        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
                       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
 
@@ -2250,7 +2252,8 @@ static void reclaim_high(struct mem_cgroup *memcg,
                         gfp_t gfp_mask)
 {
        do {
-               if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
+               if (page_counter_read(&memcg->memory) <=
+                   READ_ONCE(memcg->memory.high))
                        continue;
                memcg_memory_event(memcg, MEMCG_HIGH);
                try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
@@ -2319,41 +2322,64 @@ static void high_work_func(struct work_struct *work)
  #define MEMCG_DELAY_PRECISION_SHIFT 20
  #define MEMCG_DELAY_SCALING_SHIFT 14
 
-/*
- * Get the number of jiffies that we should penalise a mischievous cgroup which
- * is exceeding its memory.high by checking both it and its ancestors.
- */
-static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
-                                         unsigned int nr_pages)
+static u64 calculate_overage(unsigned long usage, unsigned long high)
 {
-       unsigned long penalty_jiffies;
-       u64 max_overage = 0;
+       u64 overage;
 
-       do {
-               unsigned long usage, high;
-               u64 overage;
+       if (usage <= high)
+               return 0;
 
-               usage = page_counter_read(&memcg->memory);
-               high = READ_ONCE(memcg->high);
+       /*
+        * Prevent division by 0 in overage calculation by acting as if
+        * it was a threshold of 1 page
+        */
+       high = max(high, 1UL);
 
-               if (usage <= high)
-                       continue;
+       overage = usage - high;
+       overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+       return div64_u64(overage, high);
+}
 
-               /*
-                * Prevent division by 0 in overage calculation by acting as if
-                * it was a threshold of 1 page
-                */
-               high = max(high, 1UL);
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+       u64 overage, max_overage = 0;
+
+       do {
+               overage = calculate_overage(page_counter_read(&memcg->memory),
+                                           READ_ONCE(memcg->memory.high));
+               max_overage = max(overage, max_overage);
+       } while ((memcg = parent_mem_cgroup(memcg)) &&
+                !mem_cgroup_is_root(memcg));
 
-               overage = usage - high;
-               overage <<= MEMCG_DELAY_PRECISION_SHIFT;
-               overage = div64_u64(overage, high);
+       return max_overage;
+}
 
-               if (overage > max_overage)
-                       max_overage = overage;
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+       u64 overage, max_overage = 0;
+
+       do {
+               overage = calculate_overage(page_counter_read(&memcg->swap),
+                                           READ_ONCE(memcg->swap.high));
+               if (overage)
+                       memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+               max_overage = max(overage, max_overage);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
 
+       return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+                                         unsigned int nr_pages,
+                                         u64 max_overage)
+{
+       unsigned long penalty_jiffies;
+
        if (!max_overage)
                return 0;
 
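
For readers skimming the new helpers: calculate_overage() expresses how far usage sits above the high limit as a fraction of that limit, in 20-bit fixed point (MEMCG_DELAY_PRECISION_SHIFT is defined a few lines above this hunk). A minimal userspace sketch of the same arithmetic with made-up page counts, not kernel code:

#include <stdint.h>
#include <stdio.h>

#define MEMCG_DELAY_PRECISION_SHIFT 20

static uint64_t overage_fixed_point(unsigned long usage, unsigned long high)
{
	if (usage <= high)
		return 0;
	if (high == 0)
		high = 1;	/* mirror the divide-by-zero guard above */
	return ((uint64_t)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;
}

int main(void)
{
	/* 150 pages charged against a high limit of 100 pages: 50% over. */
	uint64_t o = overage_fixed_point(150, 100);

	/* Prints "overage = 524288 (0.50 of high)". */
	printf("overage = %llu (%.2f of high)\n", (unsigned long long)o,
	       (double)o / (1UL << MEMCG_DELAY_PRECISION_SHIFT));
	return 0;
}

mem_find_max_overage() and swap_find_max_overage() then just keep the largest such value seen while walking from the charging cgroup up to (but excluding) the root.
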
@@ -2377,14 +2403,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
         * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
         * larger the current charge patch is than that.
         */
-       penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
-
-       /*
-        * Clamp the max delay per usermode return so as to still keep the
-        * application moving forwards and also permit diagnostics, albeit
-        * extremely slowly.
-        */
-       return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+       return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
 }
 
 /*
@@ -2409,7 +2428,18 @@ void mem_cgroup_handle_over_high(void)
         * memory.high is breached and reclaim is unable to keep up. Throttle
         * allocators proactively to slow down excessive growth.
         */
-       penalty_jiffies = calculate_high_delay(memcg, nr_pages);
+       penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+                                              mem_find_max_overage(memcg));
+
+       penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+                                               swap_find_max_overage(memcg));
+
+       /*
+        * Clamp the max delay per usermode return so as to still keep the
+        * application moving forwards and also permit diagnostics, albeit
+        * extremely slowly.
+        */
+       penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
 
        /*
         * Don't sleep if the amount of jiffies this memcg owes us is so low
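
Note that calculate_high_delay() still scales its result by how many pages the task charged relative to MEMCG_CHARGE_BATCH, but the clamp has moved out: the caller sums the memory and swap penalties first and bounds only the total. A standalone sketch of that caller-side arithmetic, with illustrative values for the constants (assumed here: 32-page batches and a two-second ceiling; the real definitions live elsewhere in the kernel sources):

#include <stdio.h>

#define HZ				1000UL	/* illustrative */
#define MEMCG_CHARGE_BATCH		32UL	/* illustrative */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES	(2UL * HZ)

static unsigned long total_delay(unsigned long mem_penalty,
				 unsigned long swap_penalty,
				 unsigned long nr_pages)
{
	/* Each penalty scales with how much this task actually charged. */
	mem_penalty = mem_penalty * nr_pages / MEMCG_CHARGE_BATCH;
	swap_penalty = swap_penalty * nr_pages / MEMCG_CHARGE_BATCH;

	/* Only the combined delay is clamped, once, by the caller. */
	if (mem_penalty + swap_penalty > MEMCG_MAX_HIGH_DELAY_JIFFIES)
		return MEMCG_MAX_HIGH_DELAY_JIFFIES;
	return mem_penalty + swap_penalty;
}

int main(void)
{
	/* 800 + 600 jiffies nominal, charging 64 pages at once:
	 * (800 + 600) * 64 / 32 = 2800, clamped to 2000. */
	printf("%lu\n", total_delay(800, 600, 64));
	return 0;
}

So the task still makes forward progress on every return to userspace, just more slowly, even when it is over both limits at once.
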
@@ -2594,12 +2624,32 @@ done_restock:
         * reclaim, the cost of mismatch is negligible.
         */
        do {
-               if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
-                       /* Don't bother a random interrupted task */
-                       if (in_interrupt()) {
+               bool mem_high, swap_high;
+
+               mem_high = page_counter_read(&memcg->memory) >
+                       READ_ONCE(memcg->memory.high);
+               swap_high = page_counter_read(&memcg->swap) >
+                       READ_ONCE(memcg->swap.high);
+
+               /* Don't bother a random interrupted task */
+               if (in_interrupt()) {
+                       if (mem_high) {
                                schedule_work(&memcg->high_work);
                                break;
                        }
+                       continue;
+               }
+
+               if (mem_high || swap_high) {
+                       /*
+                        * The allocating tasks in this cgroup will need to do
+                        * reclaim or be throttled to prevent further growth
+                        * of the memory or swap footprints.
+                        *
+                        * Target some best-effort fairness between the tasks,
+                        * and distribute reclaim work and delay penalties
+                        * based on how much each task is actually allocating.
+                        */
                        current->memcg_nr_pages_over_high += batch;
                        set_notify_resume(current);
                        break;
@@ -2802,7 +2852,12 @@ static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
 
 static inline bool memcg_kmem_bypass(void)
 {
-       if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+       if (in_interrupt())
+               return true;
+
+       /* Allow remote memcg charging in kthread contexts. */
+       if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+            !current->active_memcg)
                return true;
        return false;
 }
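
With this change, kernel threads are no longer bypassed unconditionally for kmem accounting: if a kthread (or any mm-less context) has set an active memcg for remote charging, the charge is accounted there. A tiny standalone sketch of the new predicate (not kernel code; the parameter names are only illustrative):

#include <stdbool.h>
#include <stdio.h>

static bool kmem_bypass(bool in_irq, bool has_mm, bool is_kthread,
			bool has_active_memcg)
{
	/* Interrupt context always bypasses. */
	if (in_irq)
		return true;
	/* Kthreads and mm-less tasks bypass only without a remote memcg. */
	return (!has_mm || is_kthread) && !has_active_memcg;
}

int main(void)
{
	/* Prints "plain kthread: 1, kthread charging remotely: 0". */
	printf("plain kthread: %d, kthread charging remotely: %d\n",
	       kmem_bypass(false, false, true, false),
	       kmem_bypass(false, false, true, true));
	return 0;
}
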
@@ -4330,7 +4385,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 
        *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
 
-       /* this should eventually include NR_UNSTABLE_NFS */
        *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
        *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
                        memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
@@ -4338,7 +4392,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 
        while ((parent = parent_mem_cgroup(memcg))) {
                unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
-                                           READ_ONCE(memcg->high));
+                                           READ_ONCE(memcg->memory.high));
                unsigned long used = page_counter_read(&memcg->memory);
 
                *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -5063,8 +5117,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        if (IS_ERR(memcg))
                return ERR_CAST(memcg);
 
-       WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+       page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
        if (parent) {
                memcg->swappiness = mem_cgroup_swappiness(parent);
                memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -5216,8 +5271,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
        page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);
-       WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+       page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
        memcg->soft_limit = PAGE_COUNTER_MAX;
+       page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
        memcg_wb_domain_size_changed(memcg);
 }
 
@@ -6015,7 +6071,8 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
 
 static int memory_high_show(struct seq_file *m, void *v)
 {
-       return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
+       return seq_puts_memcg_tunable(m,
+               READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
 }
 
 static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -6032,7 +6089,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       WRITE_ONCE(memcg->high, high);
+       page_counter_set_high(&memcg->memory, high);
 
        for (;;) {
                unsigned long nr_pages = page_counter_read(&memcg->memory);
@@ -6227,7 +6284,6 @@ static struct cftype memory_files[] = {
        },
        {
                .name = "stat",
-               .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_stat_show,
        },
        {
@@ -7131,10 +7187,13 @@ bool mem_cgroup_swap_full(struct page *page)
        if (!memcg)
                return false;
 
-       for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
-               if (page_counter_read(&memcg->swap) * 2 >=
-                   READ_ONCE(memcg->swap.max))
+       for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+               unsigned long usage = page_counter_read(&memcg->swap);
+
+               if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+                   usage * 2 >= READ_ONCE(memcg->swap.max))
                        return true;
+       }
 
        return false;
 }
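
mem_cgroup_swap_full() now reports pressure once usage crosses half of swap.high as well as half of swap.max, so the VM starts treating the cgroup's swap as scarce earlier when a high threshold is configured. A standalone sketch of the per-level check with example numbers (not kernel code):

#include <stdbool.h>
#include <stdio.h>

static bool swap_pressure(unsigned long usage, unsigned long high,
			  unsigned long max)
{
	/* "Full" once usage reaches half of whichever limit is lower. */
	return usage * 2 >= high || usage * 2 >= max;
}

int main(void)
{
	/* swap.high = 1024 pages, swap.max = 4096: pressure begins at 512. */
	printf("%d %d\n", swap_pressure(511, 1024, 4096),
	       swap_pressure(512, 1024, 4096));
	return 0;
}
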
@@ -7164,6 +7223,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
        return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
 }
 
+static int swap_high_show(struct seq_file *m, void *v)
+{
+       return seq_puts_memcg_tunable(m,
+               READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+                              char *buf, size_t nbytes, loff_t off)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+       unsigned long high;
+       int err;
+
+       buf = strstrip(buf);
+       err = page_counter_memparse(buf, "max", &high);
+       if (err)
+               return err;
+
+       page_counter_set_high(&memcg->swap, high);
+
+       return nbytes;
+}
+
 static int swap_max_show(struct seq_file *m, void *v)
 {
        return seq_puts_memcg_tunable(m,
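
The new memory.swap.high file parses its input like the other counter knobs shown above: page_counter_memparse() accepts a byte count (with the usual K/M/G suffixes) or the literal "max" to remove the limit. A hedged userspace usage sketch; the cgroup path is hypothetical:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* Hypothetical cgroup v2 path; adjust to the real hierarchy. */
	const char *knob = "/sys/fs/cgroup/example/memory.swap.high";
	const char *limit = "1073741824\n";	/* 1G; write "max" to clear */
	int fd = open(knob, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) != (ssize_t)strlen(limit))
		perror("write");
	close(fd);
	return 0;
}

Crossing the configured value does not fail allocations; per the hunks above it only adds throttling delay and is reported via the new "high" line in memory.swap.events.
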
@@ -7191,6 +7273,8 @@ static int swap_events_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
 
+       seq_printf(m, "high %lu\n",
+                  atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
        seq_printf(m, "max %lu\n",
                   atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
        seq_printf(m, "fail %lu\n",
@@ -7205,6 +7289,12 @@ static struct cftype swap_files[] = {
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = swap_current_read,
        },
+       {
+               .name = "swap.high",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .seq_show = swap_high_show,
+               .write = swap_high_write,
+       },
        {
                .name = "swap.max",
                .flags = CFTYPE_NOT_ON_ROOT,