Merge tag 'amd-drm-fixes-5.9-2020-08-20' of git://people.freedesktop.org/~agd5f/linux...
[linux-2.6-microblaze.git] / mm / memcontrol.c
index a3e9633..b807952 100644 (file)
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
 
 struct mem_cgroup *root_mem_cgroup __read_mostly;
 
-#define MEM_CGROUP_RECLAIM_RETRIES     5
-
 /* Socket memory accounting disabled? */
 static bool cgroup_memory_nosocket;
 
@@ -783,7 +781,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
        if (mem_cgroup_disabled())
                return;
 
-       if (vmstat_item_in_bytes(idx))
+       if (memcg_stat_item_in_bytes(idx))
                threshold <<= PAGE_SHIFT;
 
        x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
@@ -1490,6 +1488,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
        seq_buf_printf(&s, "slab %llu\n",
                       (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
                             memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+       seq_buf_printf(&s, "percpu %llu\n",
+                      (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
        seq_buf_printf(&s, "sock %llu\n",
                       (u64)memcg_page_state(memcg, MEMCG_SOCK) *
                       PAGE_SIZE);
@@ -1530,12 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
        seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
                       memcg_events(memcg, PGMAJFAULT));
 
-       seq_buf_printf(&s, "workingset_refault %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_REFAULT));
-       seq_buf_printf(&s, "workingset_activate %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+       seq_buf_printf(&s, "workingset_refault_anon %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
+       seq_buf_printf(&s, "workingset_refault_file %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
+       seq_buf_printf(&s, "workingset_activate_anon %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
+       seq_buf_printf(&s, "workingset_activate_file %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
-       seq_buf_printf(&s, "workingset_restore %lu\n",
-                      memcg_page_state(memcg, WORKINGSET_RESTORE));
+       seq_buf_printf(&s, "workingset_restore_anon %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
+       seq_buf_printf(&s, "workingset_restore_file %lu\n",
+                      memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
        seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
                       memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
 
@@ -1665,15 +1671,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
                .gfp_mask = gfp_mask,
                .order = order,
        };
-       bool ret;
+       bool ret = true;
 
        if (mutex_lock_killable(&oom_lock))
                return true;
+
+       if (mem_cgroup_margin(memcg) >= (1 << order))
+               goto unlock;
+
        /*
         * A few threads which were not waiting at mutex_lock_killable() can
         * fail to bail out. Therefore, check again after holding oom_lock.
         */
        ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
        mutex_unlock(&oom_lock);
        return ret;
 }
@@ -2363,18 +2375,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
        return 0;
 }
 
-static void reclaim_high(struct mem_cgroup *memcg,
-                        unsigned int nr_pages,
-                        gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+                                 unsigned int nr_pages,
+                                 gfp_t gfp_mask)
 {
+       unsigned long nr_reclaimed = 0;
+
        do {
+               unsigned long pflags;
+
                if (page_counter_read(&memcg->memory) <=
                    READ_ONCE(memcg->memory.high))
                        continue;
+
                memcg_memory_event(memcg, MEMCG_HIGH);
-               try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+               psi_memstall_enter(&pflags);
+               nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+                                                            gfp_mask, true);
+               psi_memstall_leave(&pflags);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
+
+       return nr_reclaimed;
 }
 
 static void high_work_func(struct work_struct *work)
@@ -2399,7 +2422,7 @@ static void high_work_func(struct work_struct *work)
  *
  * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
  *   overage ratio to a delay.
- * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
  *   proposed penalty in order to reduce to a reasonable number of jiffies, and
  *   to produce a reasonable delay curve.
  *
@@ -2530,16 +2553,32 @@ void mem_cgroup_handle_over_high(void)
 {
        unsigned long penalty_jiffies;
        unsigned long pflags;
+       unsigned long nr_reclaimed;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *memcg;
+       bool in_retry = false;
 
        if (likely(!nr_pages))
                return;
 
        memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
        current->memcg_nr_pages_over_high = 0;
 
+retry_reclaim:
+       /*
+        * The allocating task should reclaim at least the batch size, but for
+        * subsequent retries we only want to do what's necessary to prevent oom
+        * or breaching resource isolation.
+        *
+        * This is distinct from memory.max or page allocator behaviour because
+        * memory.high is currently batched, whereas memory.max and the page
+        * allocator run every time an allocation is made.
+        */
+       nr_reclaimed = reclaim_high(memcg,
+                                   in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+                                   GFP_KERNEL);
+
        /*
         * memory.high is breached and reclaim is unable to keep up. Throttle
         * allocators proactively to slow down excessive growth.
@@ -2566,6 +2605,16 @@ void mem_cgroup_handle_over_high(void)
        if (penalty_jiffies <= HZ / 100)
                goto out;
 
+       /*
+        * If reclaim is making forward progress but we're still over
+        * memory.high, we want to encourage that rather than doing allocator
+        * throttling.
+        */
+       if (nr_reclaimed || nr_retries--) {
+               in_retry = true;
+               goto retry_reclaim;
+       }
+
        /*
         * If we exit early, we're guaranteed to die (since
         * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
@@ -2583,13 +2632,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
                      unsigned int nr_pages)
 {
        unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
        struct mem_cgroup *mem_over_limit;
        struct page_counter *counter;
+       enum oom_status oom_status;
        unsigned long nr_reclaimed;
        bool may_swap = true;
        bool drained = false;
-       enum oom_status oom_status;
+       unsigned long pflags;
 
        if (mem_cgroup_is_root(memcg))
                return 0;
@@ -2649,8 +2699,10 @@ retry:
 
        memcg_memory_event(mem_over_limit, MEMCG_MAX);
 
+       psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
                                                    gfp_mask, may_swap);
+       psi_memstall_leave(&pflags);
 
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
                goto retry;
@@ -2702,7 +2754,7 @@ retry:
                       get_order(nr_pages * PAGE_SIZE));
        switch (oom_status) {
        case OOM_SUCCESS:
-               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+               nr_retries = MAX_RECLAIM_RETRIES;
                goto retry;
        case OOM_FAILED:
                goto force;
@@ -3382,7 +3434,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
  */
 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 {
-       int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       int nr_retries = MAX_RECLAIM_RETRIES;
 
        /* we call try-to-free pages for make this cgroup empty */
        lru_add_drain_all();
@@ -5085,13 +5137,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
        if (!pn)
                return 1;
 
-       pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+       pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+                                                GFP_KERNEL_ACCOUNT);
        if (!pn->lruvec_stat_local) {
                kfree(pn);
                return 1;
        }
 
-       pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+       pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+                                              GFP_KERNEL_ACCOUNT);
        if (!pn->lruvec_stat_cpu) {
                free_percpu(pn->lruvec_stat_local);
                kfree(pn);
@@ -5165,11 +5219,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
                goto fail;
        }
 
-       memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+       memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+                                               GFP_KERNEL_ACCOUNT);
        if (!memcg->vmstats_local)
                goto fail;
 
-       memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+       memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+                                                GFP_KERNEL_ACCOUNT);
        if (!memcg->vmstats_percpu)
                goto fail;
 
@@ -5218,7 +5274,9 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
        struct mem_cgroup *memcg;
        long error = -ENOMEM;
 
+       memalloc_use_memcg(parent);
        memcg = mem_cgroup_alloc();
+       memalloc_unuse_memcg();
        if (IS_ERR(memcg))
                return ERR_CAST(memcg);
 
@@ -5531,7 +5589,7 @@ static int mem_cgroup_move_account(struct page *page,
 {
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
-       unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+       unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
        int ret;
 
        VM_BUG_ON(from == to);
@@ -6203,7 +6261,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                                 char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long high;
        int err;
@@ -6213,8 +6271,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
        if (err)
                return err;
 
-       page_counter_set_high(&memcg->memory, high);
-
        for (;;) {
                unsigned long nr_pages = page_counter_read(&memcg->memory);
                unsigned long reclaimed;
@@ -6238,6 +6294,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                        break;
        }
 
+       page_counter_set_high(&memcg->memory, high);
+
+       memcg_wb_domain_size_changed(memcg);
+
        return nbytes;
 }
 
@@ -6251,7 +6311,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
                                char *buf, size_t nbytes, loff_t off)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
-       unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+       unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
        bool drained = false;
        unsigned long max;
        int err;
@@ -6558,40 +6618,42 @@ static unsigned long effective_protection(unsigned long usage,
  *
  * WARNING: This function is not stateless! It can only be used as part
  *          of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- *   MEMCG_PROT_NONE: cgroup memory is not protected
- *   MEMCG_PROT_LOW: cgroup memory is protected as long there is
- *     an unprotected supply of reclaimable memory from other cgroups.
- *   MEMCG_PROT_MIN: cgroup memory is protected
  */
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
-                                               struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+                                    struct mem_cgroup *memcg)
 {
        unsigned long usage, parent_usage;
        struct mem_cgroup *parent;
 
        if (mem_cgroup_disabled())
-               return MEMCG_PROT_NONE;
+               return;
 
        if (!root)
                root = root_mem_cgroup;
+
+       /*
+        * Effective values of the reclaim targets are ignored so they
+        * can be stale. Have a look at mem_cgroup_protection for more
+        * details.
+        * TODO: calculation should be more robust so that we do not need
+        * that special casing.
+        */
        if (memcg == root)
-               return MEMCG_PROT_NONE;
+               return;
 
        usage = page_counter_read(&memcg->memory);
        if (!usage)
-               return MEMCG_PROT_NONE;
+               return;
 
        parent = parent_mem_cgroup(memcg);
        /* No parent means a non-hierarchical mode on v1 memcg */
        if (!parent)
-               return MEMCG_PROT_NONE;
+               return;
 
        if (parent == root) {
                memcg->memory.emin = READ_ONCE(memcg->memory.min);
                memcg->memory.elow = READ_ONCE(memcg->memory.low);
-               goto out;
+               return;
        }
 
        parent_usage = page_counter_read(&parent->memory);
@@ -6605,14 +6667,6 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
                        READ_ONCE(memcg->memory.low),
                        READ_ONCE(parent->memory.elow),
                        atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
-       if (usage <= memcg->memory.emin)
-               return MEMCG_PROT_MIN;
-       else if (usage <= memcg->memory.elow)
-               return MEMCG_PROT_LOW;
-       else
-               return MEMCG_PROT_NONE;
 }
 
 /**
@@ -6628,7 +6682,7 @@ out:
  */
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
-       unsigned int nr_pages = hpage_nr_pages(page);
+       unsigned int nr_pages = thp_nr_pages(page);
        struct mem_cgroup *memcg = NULL;
        int ret = 0;
 
@@ -6858,7 +6912,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
                return;
 
        /* Force-charge the new page. The old one will be freed soon */
-       nr_pages = hpage_nr_pages(newpage);
+       nr_pages = thp_nr_pages(newpage);
 
        page_counter_charge(&memcg->memory, nr_pages);
        if (do_memsw_account())
@@ -7060,7 +7114,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
-       nr_entries = hpage_nr_pages(page);
+       nr_entries = thp_nr_pages(page);
        /* Get references for the tail pages, too */
        if (nr_entries > 1)
                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
@@ -7104,7 +7158,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
  */
 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
 {
-       unsigned int nr_pages = hpage_nr_pages(page);
+       unsigned int nr_pages = thp_nr_pages(page);
        struct page_counter *counter;
        struct mem_cgroup *memcg;
        unsigned short oldid;