hugetlb: memcg: account hugetlb-backed memory in memory controller

[linux-2.6-microblaze.git] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index a4d3282..a86e7b4 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -704,6 +704,8 @@ static const unsigned int memcg_vm_event_stat[] = {
  #ifdef CONFIG_TRANSPARENT_HUGEPAGE
         THP_FAULT_ALLOC,
         THP_COLLAPSE_ALLOC,
+       THP_SWPOUT,
+       THP_SWPOUT_FALLBACK,
  #endif
  };
  
@@ -761,6 +763,22 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
         return x;
  }
  
+static int memcg_page_state_unit(int item);
+
+/*
+ * Normalize the value passed into memcg_rstat_updated() to be in pages. Round
+ * up non-zero sub-page updates to 1 page as zero page updates are ignored.
+ */
+static int memcg_state_val_in_pages(int idx, int val)
+{
+       int unit = memcg_page_state_unit(idx);
+
+       if (!val || unit == PAGE_SIZE)
+               return val;
+       else
+               return max(val * unit / PAGE_SIZE, 1UL);
+}
+
  /**
   * __mod_memcg_state - update cgroup memory statistics
   * @memcg: the memory cgroup
@@ -773,7 +791,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
                 return;
  
         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
-       memcg_rstat_updated(memcg, val);
+       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
  }
  
  /* idx can be of type enum memcg_stat_item or node_stat_item. */
@@ -824,7 +842,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
         /* Update lruvec */
         __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
  
-       memcg_rstat_updated(memcg, val);
+       memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val));
         memcg_stats_unlock();
  }
  
@@ -1081,6 +1099,27 @@ static __always_inline bool memcg_kmem_bypass(void)
         return false;
  }
  
+/**
+ * get_mem_cgroup_from_current - Obtain a reference on current task's memcg.
+ */
+struct mem_cgroup *get_mem_cgroup_from_current(void)
+{
+       struct mem_cgroup *memcg;
+
+       if (mem_cgroup_disabled())
+               return NULL;
+
+again:
+       rcu_read_lock();
+       memcg = mem_cgroup_from_task(current);
+       if (!css_tryget(&memcg->css)) {
+               rcu_read_unlock();
+               goto again;
+       }
+       rcu_read_unlock();
+       return memcg;
+}
+
  /**
   * mem_cgroup_iter - iterate over memory cgroup hierarchy
   * @root: hierarchy root
@@ -1533,7 +1572,7 @@ static const struct memory_stat memory_stats[] = {
         { "workingset_nodereclaim",     WORKINGSET_NODERECLAIM          },
  };
  
-/* Translate stat items to the correct unit for memory.stat output */
+/* The actual unit of the state item, not the same as the output unit */
  static int memcg_page_state_unit(int item)
  {
         switch (item) {
@@ -1541,6 +1580,22 @@ static int memcg_page_state_unit(int item)
         case MEMCG_ZSWAP_B:
         case NR_SLAB_RECLAIMABLE_B:
         case NR_SLAB_UNRECLAIMABLE_B:
+               return 1;
+       case NR_KERNEL_STACK_KB:
+               return SZ_1K;
+       default:
+               return PAGE_SIZE;
+       }
+}
+
+/* Translate stat items to the correct unit for memory.stat output */
+static int memcg_page_state_output_unit(int item)
+{
+       /*
+        * Workingset state is actually in pages, but we export it to userspace
+        * as a scalar count of events, so special case it here.
+        */
+       switch (item) {
         case WORKINGSET_REFAULT_ANON:
         case WORKINGSET_REFAULT_FILE:
         case WORKINGSET_ACTIVATE_ANON:
@@ -1549,17 +1604,23 @@ static int memcg_page_state_unit(int item)
         case WORKINGSET_RESTORE_FILE:
         case WORKINGSET_NODERECLAIM:
                 return 1;
-       case NR_KERNEL_STACK_KB:
-               return SZ_1K;
         default:
-               return PAGE_SIZE;
+               return memcg_page_state_unit(item);
         }
  }
  
  static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
                                                     int item)
  {
-       return memcg_page_state(memcg, item) * memcg_page_state_unit(item);
+       return memcg_page_state(memcg, item) *
+               memcg_page_state_output_unit(item);
+}
+
+static inline unsigned long memcg_page_state_local_output(
+               struct mem_cgroup *memcg, int item)
+{
+       return memcg_page_state_local(memcg, item) *
+               memcg_page_state_output_unit(item);
  }
  
  static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
@@ -2555,7 +2616,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
   * Scheduled by try_charge() to be executed from the userland return path
   * and reclaims memory over the high limit.
   */
-void mem_cgroup_handle_over_high(void)
+void mem_cgroup_handle_over_high(gfp_t gfp_mask)
  {
         unsigned long penalty_jiffies;
         unsigned long pflags;
@@ -2583,7 +2644,7 @@ retry_reclaim:
          */
         nr_reclaimed = reclaim_high(memcg,
                                     in_retry ? SWAP_CLUSTER_MAX : nr_pages,
-                                   GFP_KERNEL);
+                                   gfp_mask);
  
         /*
          * memory.high is breached and reclaim is unable to keep up. Throttle
@@ -2819,7 +2880,7 @@ done_restock:
         if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
             !(current->flags & PF_MEMALLOC) &&
             gfpflags_allow_blocking(gfp_mask)) {
-               mem_cgroup_handle_over_high();
+               mem_cgroup_handle_over_high(gfp_mask);
         }
         return 0;
  }
@@ -2833,7 +2894,12 @@ static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
         return try_charge_memcg(memcg, gfp_mask, nr_pages);
  }
  
-static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
+/**
+ * mem_cgroup_cancel_charge() - cancel an uncommitted try_charge() call.
+ * @memcg: memcg previously charged.
+ * @nr_pages: number of pages previously charged.
+ */
+void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
  {
         if (mem_cgroup_is_root(memcg))
                 return;
@@ -2858,6 +2924,22 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
         folio->memcg_data = (unsigned long)memcg;
  }
  
+/**
+ * mem_cgroup_commit_charge - commit a previously successful try_charge().
+ * @folio: folio to commit the charge to.
+ * @memcg: memcg previously charged.
+ */
+void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+       css_get(&memcg->css);
+       commit_charge(folio, memcg);
+
+       local_irq_disable();
+       mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
+       memcg_check_events(memcg, folio_nid(folio));
+       local_irq_enable();
+}
+
  #ifdef CONFIG_MEMCG_KMEM
  /*
   * The allocated objcg pointers array is not accounted directly.
@@ -3867,6 +3949,13 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                 case _MEMSWAP:
                         ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                         break;
+               case _KMEM:
+                       pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+                                    "Writing any value to this file has no effect. "
+                                    "Please report your usecase to linux-mm@kvack.org if you "
+                                    "depend on this functionality.\n");
+                       ret = 0;
+                       break;
                 case _TCP:
                         ret = memcg_update_tcp_max(memcg, nr_pages);
                         break;
@@ -4059,7 +4148,10 @@ static const unsigned int memcg1_stats[] = {
         NR_WRITEBACK,
         WORKINGSET_REFAULT_ANON,
         WORKINGSET_REFAULT_FILE,
+#ifdef CONFIG_SWAP
         MEMCG_SWAP,
+       NR_SWAPCACHE,
+#endif
  };
  
  static const char *const memcg1_stat_names[] = {
@@ -4074,7 +4166,10 @@ static const char *const memcg1_stat_names[] = {
         "writeback",
         "workingset_refault_anon",
         "workingset_refault_file",
+#ifdef CONFIG_SWAP
         "swap",
+       "swapcached",
+#endif
  };
  
  /* Universal VM events cgroup1 shows, original sort order */
@@ -4098,11 +4193,8 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
  
-               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
-                       continue;
-               nr = memcg_page_state_local(memcg, memcg1_stats[i]);
-               seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i],
-                          nr * memcg_page_state_unit(memcg1_stats[i]));
+               nr = memcg_page_state_local_output(memcg, memcg1_stats[i]);
+               seq_buf_printf(s, "%s %lu\n", memcg1_stat_names[i], nr);
         }
  
         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -4122,18 +4214,15 @@ static void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
         }
         seq_buf_printf(s, "hierarchical_memory_limit %llu\n",
                        (u64)memory * PAGE_SIZE);
-       if (do_memsw_account())
-               seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
-                              (u64)memsw * PAGE_SIZE);
+       seq_buf_printf(s, "hierarchical_memsw_limit %llu\n",
+                      (u64)memsw * PAGE_SIZE);
  
         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
                 unsigned long nr;
  
-               if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
-                       continue;
-               nr = memcg_page_state(memcg, memcg1_stats[i]);
+               nr = memcg_page_state_output(memcg, memcg1_stats[i]);
                 seq_buf_printf(s, "total_%s %llu\n", memcg1_stat_names[i],
-                          (u64)nr * memcg_page_state_unit(memcg1_stats[i]));
+                              (u64)nr);
         }
  
         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -5077,6 +5166,12 @@ static struct cftype mem_cgroup_legacy_files[] = {
                 .seq_show = memcg_numa_stat_show,
         },
  #endif
+       {
+               .name = "kmem.limit_in_bytes",
+               .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
+               .write = mem_cgroup_write,
+               .read_u64 = mem_cgroup_read_u64,
+       },
         {
                 .name = "kmem.usage_in_bytes",
                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
@@ -6063,7 +6158,7 @@ static void __mem_cgroup_clear_mc(void)
  
         /* we must uncharge all the leftover precharges from mc.to */
         if (mc.precharge) {
-               cancel_charge(mc.to, mc.precharge);
+               mem_cgroup_cancel_charge(mc.to, mc.precharge);
                 mc.precharge = 0;
         }
         /*
@@ -6071,7 +6166,7 @@ static void __mem_cgroup_clear_mc(void)
          * we must uncharge here.
          */
         if (mc.moved_charge) {
-               cancel_charge(mc.from, mc.moved_charge);
+               mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
                 mc.moved_charge = 0;
         }
         /* we must fixup refcnts and charges */
@@ -6609,7 +6704,8 @@ static int memory_stat_show(struct seq_file *m, void *v)
  static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
                                                      int item)
  {
-       return lruvec_page_state(lruvec, item) * memcg_page_state_unit(item);
+       return lruvec_page_state(lruvec, item) *
+               memcg_page_state_output_unit(item);
  }
  
  static int memory_numa_stat_show(struct seq_file *m, void *v)
@@ -6977,20 +7073,13 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
  static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
                         gfp_t gfp)
  {
-       long nr_pages = folio_nr_pages(folio);
         int ret;
  
-       ret = try_charge(memcg, gfp, nr_pages);
+       ret = try_charge(memcg, gfp, folio_nr_pages(folio));
         if (ret)
                 goto out;
  
-       css_get(&memcg->css);
-       commit_charge(folio, memcg);
-
-       local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, nr_pages);
-       memcg_check_events(memcg, folio_nid(folio));
-       local_irq_enable();
+       mem_cgroup_commit_charge(folio, memcg);
  out:
         return ret;
  }
@@ -7007,6 +7096,41 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
         return ret;
  }
  
+/**
+ * mem_cgroup_hugetlb_try_charge - try to charge the memcg for a hugetlb folio
+ * @memcg: memcg to charge.
+ * @gfp: reclaim mode.
+ * @nr_pages: number of pages to charge.
+ *
+ * This function is called when allocating a huge page folio to determine if
+ * the memcg has the capacity for it. It does not commit the charge yet,
+ * as the hugetlb folio itself has not been obtained from the hugetlb pool.
+ *
+ * Once we have obtained the hugetlb folio, we can call
+ * mem_cgroup_commit_charge() to commit the charge. If we fail to obtain the
+ * folio, we should instead call mem_cgroup_cancel_charge() to undo the effect
+ * of try_charge().
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_hugetlb_try_charge(struct mem_cgroup *memcg, gfp_t gfp,
+                       long nr_pages)
+{
+       /*
+        * If hugetlb memcg charging is not enabled, do not fail hugetlb allocation,
+        * but do not attempt to commit charge later (or cancel on error) either.
+        */
+       if (mem_cgroup_disabled() || !memcg ||
+               !cgroup_subsys_on_dfl(memory_cgrp_subsys) ||
+               !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING))
+               return -EOPNOTSUPP;
+
+       if (try_charge(memcg, gfp, nr_pages))
+               return -ENOMEM;
+
+       return 0;
+}
+
  /**
   * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin.
   * @folio: folio to charge.
@@ -7203,16 +7327,17 @@ void __mem_cgroup_uncharge_list(struct list_head *page_list)
  }
  
  /**
- * mem_cgroup_migrate - Charge a folio's replacement.
+ * mem_cgroup_replace_folio - Charge a folio's replacement.
   * @old: Currently circulating folio.
   * @new: Replacement folio.
   *
   * Charge @new as a replacement folio for @old. @old will
- * be uncharged upon free.
+ * be uncharged upon free. This is only used by the page cache
+ * (in replace_page_cache_folio()).
   *
   * Both folios must be locked, @new->mapping must be set up.
   */
-void mem_cgroup_migrate(struct folio *old, struct folio *new)
+void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
  {
         struct mem_cgroup *memcg;
         long nr_pages = folio_nr_pages(new);
@@ -7251,6 +7376,44 @@ void mem_cgroup_migrate(struct folio *old, struct folio *new)
         local_irq_restore(flags);
  }
  
+/**
+ * mem_cgroup_migrate - Transfer the memcg data from the old to the new folio.
+ * @old: Currently circulating folio.
+ * @new: Replacement folio.
+ *
+ * Transfer the memcg data from the old folio to the new folio for migration.
+ * The old folio's memcg data will be cleared. Note that the memory counters
+ * will remain unchanged throughout the process.
+ *
+ * Both folios must be locked, @new->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct folio *old, struct folio *new)
+{
+       struct mem_cgroup *memcg;
+
+       VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
+       VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
+       VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
+       VM_BUG_ON_FOLIO(folio_nr_pages(old) != folio_nr_pages(new), new);
+
+       if (mem_cgroup_disabled())
+               return;
+
+       memcg = folio_memcg(old);
+       /*
+        * Note that it is normal to see !memcg for a hugetlb folio.
+        * E.g., it could have been allocated when memory_hugetlb_accounting
+        * was not selected.
+        */
+       VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(old) && !memcg, old);
+       if (!memcg)
+               return;
+
+       /* Transfer the charge and the css ref */
+       commit_charge(new, memcg);
+       old->memcg_data = 0;
+}
+
  DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
  EXPORT_SYMBOL(memcg_sockets_enabled_key);