mm: memory.low hierarchical behavior

author Roman Gushchin <guro@fb.com>

Fri, 8 Jun 2018 00:06:22 +0000 (17:06 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 8 Jun 2018 00:34:35 +0000 (17:34 -0700)
author Roman Gushchin <guro@fb.com>
Fri, 8 Jun 2018 00:06:22 +0000 (17:06 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 8 Jun 2018 00:34:35 +0000 (17:34 -0700)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h

index 577a19a..8919455 100644 (file)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -181,8 +181,7 @@ struct mem_cgroup {
         struct page_counter kmem;
         struct page_counter tcpmem;
  
-       /* Normal memory consumption range */
-       unsigned long low;
+       /* Upper bound of normal memory consumption range */
         unsigned long high;
  
         /* Range enforcement for interrupt charges */
diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h

index 94029da..7902a72 100644 (file)
--- a/include/linux/page_counter.h
+++ b/include/linux/page_counter.h
@@ -9,8 +9,14 @@
  struct page_counter {
         atomic_long_t usage;
         unsigned long max;
+       unsigned long low;
         struct page_counter *parent;
  
+       /* effective memory.low and memory.low usage tracking */
+       unsigned long elow;
+       atomic_long_t low_usage;
+       atomic_long_t children_low_usage;
+
         /* legacy */
         unsigned long watermark;
         unsigned long failcnt;
@@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter,
                              struct page_counter **fail);
  void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages);
  int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages);
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages);
  int page_counter_memparse(const char *buf, const char *max,
                           unsigned long *nr_pages);
  
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 79bb5ae..d026105 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4270,7 +4270,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
         }
         spin_unlock(&memcg->event_list_lock);
  
-       memcg->low = 0;
+       page_counter_set_low(&memcg->memory, 0);
  
         memcg_offline_kmem(memcg);
         wb_memcg_offline(memcg);
@@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
  
-       memcg->low = 0;
         page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
         page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
         page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
         page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
         page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
+       page_counter_set_low(&memcg->memory, 0);
         memcg->high = PAGE_COUNTER_MAX;
         memcg->soft_limit = PAGE_COUNTER_MAX;
         memcg_wb_domain_size_changed(memcg);
@@ -5064,7 +5064,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
  static int memory_low_show(struct seq_file *m, void *v)
  {
         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-       unsigned long low = READ_ONCE(memcg->low);
+       unsigned long low = READ_ONCE(memcg->memory.low);
  
         if (low == PAGE_COUNTER_MAX)
                 seq_puts(m, "max\n");
@@ -5086,7 +5086,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
         if (err)
                 return err;
  
-       memcg->low = low;
+       page_counter_set_low(&memcg->memory, low);
  
         return nbytes;
  }
@@ -5348,36 +5348,72 @@ struct cgroup_subsys memory_cgrp_subsys = {
   * @root: the top ancestor of the sub-tree being checked
   * @memcg: the memory cgroup to check
   *
- * Returns %true if memory consumption of @memcg, and that of all
- * ancestors up to (but not including) @root, is below the normal range.
+ * WARNING: This function is not stateless! It can only be used as part
+ *          of a top-down tree iteration, not for isolated queries.
   *
- * @root is exclusive; it is never low when looked at directly and isn't
- * checked when traversing the hierarchy.
+ * Returns %true if memory consumption of @memcg is below the normal range.
   *
- * Excluding @root enables using memory.low to prioritize memory usage
- * between cgroups within a subtree of the hierarchy that is limited by
- * memory.high or memory.max.
+ * @root is exclusive; it is never low when looked at directly
   *
- * For example, given cgroup A with children B and C:
+ * To provide a proper hierarchical behavior, effective memory.low value
+ * is used.
   *
- *    A
- *   / \
- *  B   C
+ * Effective memory.low is always equal or less than the original memory.low.
+ * If there is no memory.low overcommittment (which is always true for
+ * top-level memory cgroups), these two values are equal.
+ * Otherwise, it's a part of parent's effective memory.low,
+ * calculated as a cgroup's memory.low usage divided by sum of sibling's
+ * memory.low usages, where memory.low usage is the size of actually
+ * protected memory.
   *
- * and
+ *                                             low_usage
+ * elow = min( memory.low, parent->elow * ------------------ ),
+ *                                        siblings_low_usage
   *
- *  1. A/memory.current > A/memory.high
- *  2. A/B/memory.current < A/B/memory.low
- *  3. A/C/memory.current >= A/C/memory.low
+ *             | memory.current, if memory.current < memory.low
+ * low_usage = |
+              | 0, otherwise.
   *
- * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we
- * should reclaim from 'C' until 'A' is no longer high or until we can
- * no longer reclaim from 'C'.  If 'A', i.e. @root, isn't excluded by
- * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered
- * low and we will reclaim indiscriminately from both 'B' and 'C'.
+ *
+ * Such definition of the effective memory.low provides the expected
+ * hierarchical behavior: parent's memory.low value is limiting
+ * children, unprotected memory is reclaimed first and cgroups,
+ * which are not using their guarantee do not affect actual memory
+ * distribution.
+ *
+ * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
+ *
+ *     A      A/memory.low = 2G, A/memory.current = 6G
+ *    //\\
+ *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
+ *            C/memory.low = 1G  C/memory.current = 2G
+ *            D/memory.low = 0   D/memory.current = 2G
+ *            E/memory.low = 10G E/memory.current = 0
+ *
+ * and the memory pressure is applied, the following memory distribution
+ * is expected (approximately):
+ *
+ *     A/memory.current = 2G
+ *
+ *     B/memory.current = 1.3G
+ *     C/memory.current = 0.6G
+ *     D/memory.current = 0
+ *     E/memory.current = 0
+ *
+ * These calculations require constant tracking of the actual low usages
+ * (see propagate_low_usage()), as well as recursive calculation of
+ * effective memory.low values. But as we do call mem_cgroup_low()
+ * path for each memory cgroup top-down from the reclaim,
+ * it's possible to optimize this part, and save calculated elow
+ * for next usage. This part is intentionally racy, but it's ok,
+ * as memory.low is a best-effort mechanism.
   */
  bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
  {
+       unsigned long usage, low_usage, siblings_low_usage;
+       unsigned long elow, parent_elow;
+       struct mem_cgroup *parent;
+
         if (mem_cgroup_disabled())
                 return false;
  
@@ -5386,12 +5422,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
         if (memcg == root)
                 return false;
  
-       for (; memcg != root; memcg = parent_mem_cgroup(memcg)) {
-               if (page_counter_read(&memcg->memory) >= memcg->low)
-                       return false;
-       }
+       elow = memcg->memory.low;
+       usage = page_counter_read(&memcg->memory);
+       parent = parent_mem_cgroup(memcg);
  
-       return true;
+       if (parent == root)
+               goto exit;
+
+       parent_elow = READ_ONCE(parent->memory.elow);
+       elow = min(elow, parent_elow);
+
+       if (!elow || !parent_elow)
+               goto exit;
+
+       low_usage = min(usage, memcg->memory.low);
+       siblings_low_usage = atomic_long_read(
+               &parent->memory.children_low_usage);
+
+       if (!low_usage || !siblings_low_usage)
+               goto exit;
+
+       elow = min(elow, parent_elow * low_usage / siblings_low_usage);
+exit:
+       memcg->memory.elow = elow;
+       return usage < elow;
  }
  
  /**
diff --git a/mm/page_counter.c b/mm/page_counter.c

index 41937c9..a5ff4cb 100644 (file)
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,28 @@
  #include <linux/bug.h>
  #include <asm/page.h>
  
+static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+{
+       unsigned long low_usage, old;
+       long delta;
+
+       if (!c->parent)
+               return;
+
+       if (!c->low && !atomic_long_read(&c->low_usage))
+               return;
+
+       if (usage <= c->low)
+               low_usage = usage;
+       else
+               low_usage = 0;
+
+       old = atomic_long_xchg(&c->low_usage, low_usage);
+       delta = low_usage - old;
+       if (delta)
+               atomic_long_add(delta, &c->parent->children_low_usage);
+}
+
  /**
   * page_counter_cancel - take pages out of the local counter
   * @counter: counter
@@ -23,6 +45,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
         long new;
  
         new = atomic_long_sub_return(nr_pages, &counter->usage);
+       propagate_low_usage(counter, new);
         /* More uncharges than charges? */
         WARN_ON_ONCE(new < 0);
  }
@@ -42,6 +65,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
                 long new;
  
                 new = atomic_long_add_return(nr_pages, &c->usage);
+               propagate_low_usage(counter, new);
                 /*
                  * This is indeed racy, but we can live with some
                  * inaccuracy in the watermark.
@@ -85,6 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter,
                 new = atomic_long_add_return(nr_pages, &c->usage);
                 if (new > c->max) {
                         atomic_long_sub(nr_pages, &c->usage);
+                       propagate_low_usage(counter, new);
                         /*
                          * This is racy, but we can live with some
                          * inaccuracy in the failcnt.
@@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter,
                         *fail = c;
                         goto failed;
                 }
+               propagate_low_usage(counter, new);
                 /*
                  * Just like with failcnt, we can live with some
                  * inaccuracy in the watermark.
@@ -164,6 +190,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
         }
  }
  
+/**
+ * page_counter_set_low - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
+{
+       struct page_counter *c;
+
+       counter->low = nr_pages;
+
+       for (c = counter; c; c = c->parent)
+               propagate_low_usage(c, atomic_long_read(&c->usage));
+}
+
  /**
   * page_counter_memparse - memparse() for page counter limits
   * @buf: string to parse
author	Roman Gushchin <guro@fb.com>
	Fri, 8 Jun 2018 00:06:22 +0000 (17:06 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 8 Jun 2018 00:34:35 +0000 (17:34 -0700)
include/linux/memcontrol.h		patch \| blob \| history
include/linux/page_counter.h		patch \| blob \| history
mm/memcontrol.c		patch \| blob \| history
mm/page_counter.c		patch \| blob \| history