if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
memcg_page_state(memcg, WORKINGSET_REFAULT));
seq_buf_printf(&s, "workingset_activate %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_restore %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
gfp_t gfp_mask)
{
do {
- if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
continue;
memcg_memory_event(memcg, MEMCG_HIGH);
try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
-/*
- * Get the number of jiffies that we should penalise a mischievous cgroup which
- * is exceeding its memory.high by checking both it and its ancestors.
- */
-static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+static u64 calculate_overage(unsigned long usage, unsigned long high)
{
- unsigned long penalty_jiffies;
- u64 max_overage = 0;
+ u64 overage;
- do {
- unsigned long usage, high;
- u64 overage;
+ if (usage <= high)
+ return 0;
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
- if (usage <= high)
- continue;
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
- /*
- * Prevent division by 0 in overage calculation by acting as if
- * it was a threshold of 1 page
- */
- high = max(high, 1UL);
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
- overage = usage - high;
- overage <<= MEMCG_DELAY_PRECISION_SHIFT;
- overage = div64_u64(overage, high);
+ return max_overage;
+}
- if (overage > max_overage)
- max_overage = overage;
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
if (!max_overage)
return 0;
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
* larger the current charge patch is than that.
*/
- penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
-
- /*
- * Clamp the max delay per usermode return so as to still keep the
- * application moving forwards and also permit diagnostics, albeit
- * extremely slowly.
- */
- return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
- penalty_jiffies = calculate_high_delay(memcg, nr_pages);
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
- /* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
schedule_work(&memcg->high_work);
break;
}
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
static inline bool memcg_kmem_bypass(void)
{
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
+ if (in_interrupt())
+ return true;
+
+ /* Allow remote memcg charging in kthread contexts. */
+ if ((!current->mm || (current->flags & PF_KTHREAD)) &&
+ !current->active_memcg)
return true;
return false;
}
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
- READ_ONCE(memcg->high));
+ READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
if (IS_ERR(memcg))
return ERR_CAST(memcg);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
static int memory_high_show(struct seq_file *m, void *v)
{
- return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- WRITE_ONCE(memcg->high, high);
+ page_counter_set_high(&memcg->memory, high);
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
},
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >=
- READ_ONCE(memcg->swap.max))
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
return true;
+ }
return false;
}
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
seq_printf(m, "fail %lu\n",
.flags = CFTYPE_NOT_ON_ROOT,
.read_u64 = swap_current_read,
},
+ {
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
+ },
{
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,