diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 618c366..b69979c 100644
@@ -626,7 +626,14 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 
        x = __this_cpu_add_return(stats_updates, abs(val));
        if (x > MEMCG_CHARGE_BATCH) {
-               atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
+               /*
+                * If stats_flush_threshold already exceeds the threshold
+                * (>num_online_cpus()), a cgroup stats flush will be triggered
+                * in __mem_cgroup_flush_stats() anyway. Increasing the counter
+                * further is redundant and merely adds atomic-update overhead.
+                */
+               if (atomic_read(&stats_flush_threshold) <= num_online_cpus())
+                       atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
                __this_cpu_write(stats_updates, 0);
        }
 }
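
The check added above stops bumping stats_flush_threshold once it is already past the point (num_online_cpus()) at which __mem_cgroup_flush_stats() will flush anyway; further atomic_add()s would only add contention. A minimal userspace model of the same batching idea, for illustration only — CHARGE_BATCH, NUM_CPUS and rstat_updated() are stand-ins, not kernel symbols:

#include <stdatomic.h>

#define CHARGE_BATCH    64              /* stand-in for MEMCG_CHARGE_BATCH */
#define NUM_CPUS        8               /* stand-in for num_online_cpus()  */

static atomic_int flush_threshold;      /* shared "flush pending" counter   */
static _Thread_local int pending;       /* stand-in for the per-CPU counter */

static void rstat_updated(int val)
{
        pending += val > 0 ? val : -val;
        if (pending > CHARGE_BATCH) {
                /*
                 * Once flush_threshold already exceeds NUM_CPUS a flush is
                 * guaranteed, so skip the contended atomic add.
                 */
                if (atomic_load(&flush_threshold) <= NUM_CPUS)
                        atomic_fetch_add(&flush_threshold,
                                         pending / CHARGE_BATCH);
                pending = 0;
        }
}
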
@@ -783,7 +790,7 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
        struct lruvec *lruvec;
 
        rcu_read_lock();
-       memcg = mem_cgroup_from_obj(p);
+       memcg = mem_cgroup_from_slab_obj(p);
 
        /*
         * Untracked pages have no memcg, no lruvec. Update only the
@@ -1460,14 +1467,35 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
        return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
 }
 
-static char *memory_stat_format(struct mem_cgroup *memcg)
+/* Subset of vm_event_item to report for memcg event stats */
+static const unsigned int memcg_vm_event_stat[] = {
+       PGSCAN_KSWAPD,
+       PGSCAN_DIRECT,
+       PGSTEAL_KSWAPD,
+       PGSTEAL_DIRECT,
+       PGFAULT,
+       PGMAJFAULT,
+       PGREFILL,
+       PGACTIVATE,
+       PGDEACTIVATE,
+       PGLAZYFREE,
+       PGLAZYFREED,
+#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
+       ZSWPIN,
+       ZSWPOUT,
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       THP_FAULT_ALLOC,
+       THP_COLLAPSE_ALLOC,
+#endif
+};
+
+static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
 {
        struct seq_buf s;
        int i;
 
-       seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
-       if (!s.buffer)
-               return NULL;
+       seq_buf_init(&s, buf, bufsize);
 
        /*
         * Provide statistics on the state of the memory subsystem as
@@ -1495,46 +1523,20 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
        }
 
        /* Accumulated memory events */
-
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
-                      memcg_events(memcg, PGFAULT));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
-                      memcg_events(memcg, PGMAJFAULT));
-       seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
-                      memcg_events(memcg, PGREFILL));
        seq_buf_printf(&s, "pgscan %lu\n",
                       memcg_events(memcg, PGSCAN_KSWAPD) +
                       memcg_events(memcg, PGSCAN_DIRECT));
        seq_buf_printf(&s, "pgsteal %lu\n",
                       memcg_events(memcg, PGSTEAL_KSWAPD) +
                       memcg_events(memcg, PGSTEAL_DIRECT));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
-                      memcg_events(memcg, PGACTIVATE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
-                      memcg_events(memcg, PGDEACTIVATE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
-                      memcg_events(memcg, PGLAZYFREE));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
-                      memcg_events(memcg, PGLAZYFREED));
-
-#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP)
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPIN),
-                      memcg_events(memcg, ZSWPIN));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(ZSWPOUT),
-                      memcg_events(memcg, ZSWPOUT));
-#endif
 
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
-                      memcg_events(memcg, THP_FAULT_ALLOC));
-       seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
-                      memcg_events(memcg, THP_COLLAPSE_ALLOC));
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+       for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++)
+               seq_buf_printf(&s, "%s %lu\n",
+                              vm_event_name(memcg_vm_event_stat[i]),
+                              memcg_events(memcg, memcg_vm_event_stat[i]));
 
        /* The above should easily fit into one page */
        WARN_ON_ONCE(seq_buf_has_overflowed(&s));
-
-       return s.buffer;
 }
 
 #define K(x) ((x) << (PAGE_SHIFT-10))
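
memory_stat_format() now writes into a caller-supplied buffer and drives the event lines from the memcg_vm_event_stat[] table instead of a row of open-coded seq_buf_printf() calls, so adding a stat means adding one array entry. A self-contained sketch of the same table-driven pattern, with made-up demo_* names in place of vm_event_name()/memcg_events():

#include <stdio.h>

#define ARRAY_SIZE(a)   (sizeof(a) / sizeof((a)[0]))

enum demo_event { DEMO_PGFAULT, DEMO_PGMAJFAULT, DEMO_PGREFILL, DEMO_NR_EVENTS };

static const char *const demo_event_names[DEMO_NR_EVENTS] = {
        "pgfault", "pgmajfault", "pgrefill",
};

/* One table to extend instead of a row of hand-written printf calls. */
static const enum demo_event demo_event_stat[] = {
        DEMO_PGFAULT,
        DEMO_PGMAJFAULT,
        DEMO_PGREFILL,
};

/* Caller supplies the buffer, mirroring memory_stat_format(memcg, buf, bufsize). */
static void demo_stat_format(const unsigned long *counters, char *buf, int bufsize)
{
        int off = 0;
        size_t i;

        for (i = 0; i < ARRAY_SIZE(demo_event_stat); i++) {
                int n = snprintf(buf + off, bufsize - off, "%s %lu\n",
                                 demo_event_names[demo_event_stat[i]],
                                 counters[demo_event_stat[i]]);
                if (n < 0 || n >= bufsize - off)
                        break;  /* buffer full: stop, like a seq_buf overflow */
                off += n;
        }
}
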
@@ -1570,7 +1572,10 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *
  */
 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
 {
-       char *buf;
+       /* Use a static buffer; the caller is holding oom_lock. */
+       static char buf[PAGE_SIZE];
+
+       lockdep_assert_held(&oom_lock);
 
        pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
                K((u64)page_counter_read(&memcg->memory)),
@@ -1591,11 +1596,8 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
        pr_info("Memory cgroup stats for ");
        pr_cont_cgroup_path(memcg->css.cgroup);
        pr_cont(":");
-       buf = memory_stat_format(memcg);
-       if (!buf)
-               return;
+       memory_stat_format(memcg, buf, sizeof(buf));
        pr_info("%s", buf);
-       kfree(buf);
 }
 
 /*
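
mem_cgroup_print_oom_meminfo() can use a single static PAGE_SIZE buffer only because every caller holds oom_lock, which the new lockdep_assert_held() both documents and checks. A hedged userspace sketch of the same pattern — a lock-serialized static scratch buffer — except that here the function takes the lock itself rather than asserting that the caller already holds it:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t report_lock = PTHREAD_MUTEX_INITIALIZER;

static void print_report(const char *name, unsigned long usage, unsigned long limit)
{
        /* Shared scratch space; safe only because report_lock serializes callers. */
        static char scratch[4096];

        pthread_mutex_lock(&report_lock);
        snprintf(scratch, sizeof(scratch), "%s: usage %lukB, limit %lukB\n",
                 name, usage, limit);
        fputs(scratch, stderr);
        pthread_mutex_unlock(&report_lock);
}
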
@@ -2331,7 +2333,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
 
                psi_memstall_enter(&pflags);
                nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
-                                                            gfp_mask, true);
+                                                       gfp_mask,
+                                                       MEMCG_RECLAIM_MAY_SWAP);
                psi_memstall_leave(&pflags);
        } while ((memcg = parent_mem_cgroup(memcg)) &&
                 !mem_cgroup_is_root(memcg));
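
try_to_free_mem_cgroup_pages() now takes a bitmask of reclaim options instead of a single may_swap bool, which is what lets a later hunk add MEMCG_RECLAIM_PROACTIVE without growing the parameter list. The flags themselves are defined in a shared header outside this file; the sketch below only illustrates how callers compose them, with made-up bit values:

#include <stdbool.h>

/* Illustrative bit values; the real definitions live in a shared kernel header. */
#define MEMCG_RECLAIM_MAY_SWAP          (1u << 0)
#define MEMCG_RECLAIM_PROACTIVE         (1u << 1)

static unsigned int demo_reclaim_options(bool hit_memsw_limit, bool proactive)
{
        unsigned int options = MEMCG_RECLAIM_MAY_SWAP;

        /* Swap cannot help once the memory+swap limit itself was hit. */
        if (hit_memsw_limit)
                options &= ~MEMCG_RECLAIM_MAY_SWAP;
        /* Requests coming from memory.reclaim are tagged as proactive. */
        if (proactive)
                options |= MEMCG_RECLAIM_PROACTIVE;

        return options;
}
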
@@ -2576,8 +2579,9 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
        struct page_counter *counter;
        unsigned long nr_reclaimed;
        bool passed_oom = false;
-       bool may_swap = true;
+       unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP;
        bool drained = false;
+       bool raised_max_event = false;
        unsigned long pflags;
 
 retry:
@@ -2593,7 +2597,7 @@ retry:
                mem_over_limit = mem_cgroup_from_counter(counter, memory);
        } else {
                mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-               may_swap = false;
+               reclaim_options &= ~MEMCG_RECLAIM_MAY_SWAP;
        }
 
        if (batch > nr_pages) {
@@ -2617,10 +2621,11 @@ retry:
                goto nomem;
 
        memcg_memory_event(mem_over_limit, MEMCG_MAX);
+       raised_max_event = true;
 
        psi_memstall_enter(&pflags);
        nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
-                                                   gfp_mask, may_swap);
+                                                   gfp_mask, reclaim_options);
        psi_memstall_leave(&pflags);
 
        if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2683,6 +2688,13 @@ nomem:
        if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
                return -ENOMEM;
 force:
+       /*
+        * If the allocation has to be forced through anyway, don't forget
+        * to raise a MEMCG_MAX event.
+        */
+       if (!raised_max_event)
+               memcg_memory_event(mem_over_limit, MEMCG_MAX);
+
        /*
         * The allocation either can't fail or will lead to more memory
         * being freed very soon.  Allow memory usage go over the limit
@@ -2842,27 +2854,9 @@ int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
        return 0;
 }
 
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object or a generic kernel page, so
- * different mechanisms for getting the memory cgroup pointer should be used.
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
+static __always_inline
+struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
 {
-       struct folio *folio;
-
-       if (mem_cgroup_disabled())
-               return NULL;
-
-       folio = virt_to_folio(p);
-
        /*
         * Slab objects are accounted individually, not per-page.
         * Memcg membership data for each individual object is saved in
@@ -2895,6 +2889,53 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
        return page_memcg_check(folio_page(folio, 0));
 }
 
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * A passed kernel object can be a slab object, vmalloc object or a generic
+ * kernel page, so different mechanisms for getting the memory cgroup pointer
+ * should be used.
+ *
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+       struct folio *folio;
+
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       if (unlikely(is_vmalloc_addr(p)))
+               folio = page_folio(vmalloc_to_page(p));
+       else
+               folio = virt_to_folio(p);
+
+       return mem_cgroup_from_obj_folio(folio, p);
+}
+
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects
+ * allocated using vmalloc().
+ *
+ * A passed kernel object must be a slab object or a generic kernel page.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_slab_obj(void *p)
+{
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       return mem_cgroup_from_obj_folio(virt_to_folio(p), p);
+}
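
The split lets hot paths that can never see a vmalloc address — such as __mod_lruvec_kmem_state() in the earlier hunk — use the cheaper mem_cgroup_from_slab_obj(), which skips the is_vmalloc_addr() check. A hedged kernel-context sketch of how a caller might choose between the two; demo_account_obj() is hypothetical:

static void demo_account_obj(void *p, bool may_be_vmalloc)
{
        struct mem_cgroup *memcg;

        rcu_read_lock();        /* keeps the returned memcg alive */
        if (may_be_vmalloc)
                memcg = mem_cgroup_from_obj(p);      /* handles vmalloc too   */
        else
                memcg = mem_cgroup_from_slab_obj(p); /* slab/page only, faster */
        if (memcg) {
                /* ... charge or update stats against memcg ... */
        }
        rcu_read_unlock();
}
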
+
 static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
 {
        struct obj_cgroup *objcg = NULL;
@@ -3402,8 +3443,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                        continue;
                }
 
-               if (!try_to_free_mem_cgroup_pages(memcg, 1,
-                                       GFP_KERNEL, !memsw)) {
+               if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+                                       memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
                        ret = -EBUSY;
                        break;
                }
@@ -3513,7 +3554,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
                if (signal_pending(current))
                        return -EINTR;
 
-               if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
+               if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
+                                                 MEMCG_RECLAIM_MAY_SWAP))
                        nr_retries--;
        }
 
@@ -3625,7 +3667,7 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
        struct obj_cgroup *objcg;
 
-       if (cgroup_memory_nokmem)
+       if (mem_cgroup_kmem_disabled())
                return 0;
 
        if (unlikely(mem_cgroup_is_root(memcg)))
@@ -3649,7 +3691,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
 {
        struct mem_cgroup *parent;
 
-       if (cgroup_memory_nokmem)
+       if (mem_cgroup_kmem_disabled())
                return;
 
        if (unlikely(mem_cgroup_is_root(memcg)))
@@ -5060,6 +5102,29 @@ struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
        return idr_find(&mem_cgroup_idr, id);
 }
 
+#ifdef CONFIG_SHRINKER_DEBUG
+struct mem_cgroup *mem_cgroup_get_from_ino(unsigned long ino)
+{
+       struct cgroup *cgrp;
+       struct cgroup_subsys_state *css;
+       struct mem_cgroup *memcg;
+
+       cgrp = cgroup_get_from_id(ino);
+       if (!cgrp)
+               return ERR_PTR(-ENOENT);
+
+       css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
+       if (css)
+               memcg = container_of(css, struct mem_cgroup, css);
+       else
+               memcg = ERR_PTR(-ENOENT);
+
+       cgroup_put(cgrp);
+
+       return memcg;
+}
+#endif
+
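
mem_cgroup_get_from_ino() hands back either an ERR_PTR or a memcg whose css reference was taken via cgroup_get_e_css(), so the caller is expected to check IS_ERR() and drop the reference with mem_cgroup_put() when done. A hedged usage sketch; demo_lookup_memcg() and its debugfs use case are hypothetical:

/* Hypothetical caller: resolve a cgroup inode number to its memcg. */
static int demo_lookup_memcg(unsigned long ino)
{
        struct mem_cgroup *memcg;

        memcg = mem_cgroup_get_from_ino(ino);
        if (IS_ERR(memcg))
                return PTR_ERR(memcg);  /* -ENOENT if no such cgroup */

        /* ... use memcg, e.g. attach it to a debugfs shrinker entry ... */

        mem_cgroup_put(memcg);          /* drop the reference taken above */
        return 0;
}
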
 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
 {
        struct mem_cgroup_per_node *pn;
@@ -5665,8 +5730,8 @@ out:
  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
  *     target for charge migration. if @target is not NULL, the entry is stored
  *     in target->ent.
- *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is MEMORY_DEVICE_PRIVATE
- *     (so ZONE_DEVICE page and thus not on the lru).
+ *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE  but page is device memory and
+ *     thus not on the lru.
  *     For now we such page is charge like a regular page would be as for all
  *     intent and purposes it is just special memory taking the place of a
  *     regular page.
@@ -5704,7 +5769,8 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                 */
                if (page_memcg(page) == mc.from) {
                        ret = MC_TARGET_PAGE;
-                       if (is_device_private_page(page))
+                       if (is_device_private_page(page) ||
+                           is_device_coherent_page(page))
                                ret = MC_TARGET_DEVICE;
                        if (target)
                                target->page = page;
@@ -6241,7 +6307,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
                }
 
                reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
-                                                        GFP_KERNEL, true);
+                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
 
                if (!reclaimed && !nr_retries--)
                        break;
@@ -6290,7 +6356,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 
                if (nr_reclaims) {
                        if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
-                                                         GFP_KERNEL, true))
+                                       GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
                                nr_reclaims--;
                        continue;
                }
@@ -6335,11 +6401,11 @@ static int memory_events_local_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-       char *buf;
+       char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
 
-       buf = memory_stat_format(memcg);
        if (!buf)
                return -ENOMEM;
+       memory_stat_format(memcg, buf, PAGE_SIZE);
        seq_puts(m, buf);
        kfree(buf);
        return 0;
@@ -6419,6 +6485,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
        struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
        unsigned int nr_retries = MAX_RECLAIM_RETRIES;
        unsigned long nr_to_reclaim, nr_reclaimed = 0;
+       unsigned int reclaim_options;
        int err;
 
        buf = strstrip(buf);
@@ -6426,6 +6493,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
        if (err)
                return err;
 
+       reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
        while (nr_reclaimed < nr_to_reclaim) {
                unsigned long reclaimed;
 
@@ -6442,7 +6510,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
 
                reclaimed = try_to_free_mem_cgroup_pages(memcg,
                                                nr_to_reclaim - nr_reclaimed,
-                                               GFP_KERNEL, true);
+                                               GFP_KERNEL, reclaim_options);
 
                if (!reclaimed && !nr_retries--)
                        return -EAGAIN;
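
memory_reclaim() now tags its requests with MEMCG_RECLAIM_PROACTIVE (alongside MAY_SWAP), so reclaim can distinguish user-requested work from limit-driven reclaim. A hedged userspace sketch of driving the memory.reclaim interface; the cgroup path is an example and error handling is minimal:

/* Userspace sketch: ask a cgroup to proactively reclaim 64 MiB. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* Example path; adjust to the target cgroup. */
        const char *path = "/sys/fs/cgroup/example/memory.reclaim";
        const char *req = "67108864\n";         /* bytes to reclaim */
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        /* A failed write (e.g. EAGAIN) means the kernel could not reclaim enough. */
        if (write(fd, req, strlen(req)) < 0)
                perror("write");
        close(fd);
        return 0;
}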