struct mem_cgroup *root_mem_cgroup __read_mostly;
-#define MEM_CGROUP_RECLAIM_RETRIES 5
-
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
}
/*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
#endif
static int memcg_shrinker_map_size;
unsigned long ino = 0;
rcu_read_lock();
- if (PageSlab(page) && !PageTail(page)) {
- memcg = memcg_from_slab_page(page);
- } else {
- memcg = page->mem_cgroup;
+ memcg = page->mem_cgroup;
- /*
- * The lowest bit set means that memcg isn't a valid
- * memcg pointer, but a obj_cgroups pointer.
- * In this case the page is shared and doesn't belong
- * to any specific memory cgroup.
- */
- if ((unsigned long) memcg & 0x1UL)
- memcg = NULL;
- }
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but a obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
(u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
(u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
if (mutex_lock_killable(&oom_lock))
return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
return 0;
}
-static void reclaim_high(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ unsigned long nr_reclaimed = 0;
+
do {
+ unsigned long pflags;
+
if (page_counter_read(&memcg->memory) <=
READ_ONCE(memcg->memory.high))
continue;
+
memcg_memory_event(memcg, MEMCG_HIGH);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
{
unsigned long penalty_jiffies;
unsigned long pflags;
+ unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
+ bool in_retry = false;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
if (penalty_jiffies <= HZ / 100)
goto out;
+ /*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
/*
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
+ enum oom_status oom_status;
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- enum oom_status oom_status;
+ unsigned long pflags;
if (mem_cgroup_is_root(memcg))
return 0;
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
case OOM_FAILED:
goto force;
}
#ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
page = virt_to_head_page(p);
/*
- * Slab pages don't have page->mem_cgroup set because corresponding
- * kmem caches can be reparented during the lifetime. That's why
- * memcg_from_slab_page() should be used instead.
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
*/
- if (PageSlab(page))
- return memcg_from_slab_page(page);
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- err = memcg_update_all_caches(size);
- if (!err)
- err = memcg_update_all_list_lrus(size);
+ err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;
ida_simple_remove(&memcg_cache_ida, id);
}
-struct memcg_kmem_cache_create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
- struct memcg_kmem_cache_create_work *cw =
- container_of(w, struct memcg_kmem_cache_create_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
-
- memcg_create_kmem_cache(memcg, cachep);
-
- css_put(&memcg->css);
- kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct memcg_kmem_cache_create_work *cw;
-
- if (!css_tryget_online(&memcg->css))
- return;
-
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw) {
- css_put(&memcg->css);
- return;
- }
-
- cw->memcg = memcg;
- cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
- queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep,
- struct obj_cgroup **objcgp)
-{
- struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
- struct memcg_cache_array *arr;
- int kmemcg_id;
-
- VM_BUG_ON(!is_root_cache(cachep));
-
- if (memcg_kmem_bypass())
- return cachep;
-
- rcu_read_lock();
-
- if (unlikely(current->active_memcg))
- memcg = current->active_memcg;
- else
- memcg = mem_cgroup_from_task(current);
-
- if (!memcg || memcg == root_mem_cgroup)
- goto out_unlock;
-
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
- if (kmemcg_id < 0)
- goto out_unlock;
-
- arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match the data dependency
- * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
- */
- memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
- *
- * If the memcg is dying or memcg_cache is about to be released,
- * don't bother creating new kmem_caches. Because memcg_cachep
- * is ZEROed as the fist step of kmem offlining, we don't need
- * percpu_ref_tryget_live() here. css_tryget_online() check in
- * memcg_schedule_kmem_cache_create() will prevent us from
- * creation of a new kmem_cache.
- */
- if (unlikely(!memcg_cachep))
- memcg_schedule_kmem_cache_create(memcg, cachep);
- else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt)) {
- struct obj_cgroup *objcg = rcu_dereference(memcg->objcg);
-
- if (!objcg || !obj_cgroup_tryget(objcg)) {
- percpu_ref_put(&memcg_cachep->memcg_params.refcnt);
- goto out_unlock;
- }
-
- *objcgp = objcg;
- cachep = memcg_cachep;
- }
-out_unlock:
- rcu_read_unlock();
- return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
- if (!is_root_cache(cachep))
- percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
/**
* __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
- INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
if (memcg->kmem_state != KMEM_ONLINE)
return;
- /*
- * Clear the online state before clearing memcg_caches array
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
- * guarantees that no cache will be created for this cgroup
- * after we are done (see memcg_create_kmem_cache()).
- */
+
memcg->kmem_state = KMEM_ALLOCATED;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
- /*
- * Deactivate and reparent kmem_caches and objcgs.
- */
- memcg_deactivate_kmem_caches(memcg, parent);
memcg_reparent_objcgs(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
/* The following stuff does not apply to the root */
if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
- INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long high;
int err;
if (err)
return err;
- page_counter_set_high(&memcg->memory, high);
-
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
break;
}
+ page_counter_set_high(&memcg->memory, high);
+
+ memcg_wb_domain_size_changed(memcg);
+
return nbytes;
}
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- * MEMCG_PROT_NONE: cgroup memory is not protected
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
- * an unprotected supply of reclaimable memory from other cgroups.
- * MEMCG_PROT_MIN: cgroup memory is protected
*/
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
unsigned long usage, parent_usage;
struct mem_cgroup *parent;
if (mem_cgroup_disabled())
- return MEMCG_PROT_NONE;
+ return;
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
- return MEMCG_PROT_NONE;
+ return;
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
+ return;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
- return MEMCG_PROT_NONE;
+ return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
memcg->memory.elow = READ_ONCE(memcg->memory.low);
- goto out;
+ return;
}
parent_usage = page_counter_read(&parent->memory);
READ_ONCE(memcg->memory.low),
READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
- if (usage <= memcg->memory.emin)
- return MEMCG_PROT_MIN;
- else if (usage <= memcg->memory.elow)
- return MEMCG_PROT_LOW;
- else
- return MEMCG_PROT_NONE;
}
/**
{
int cpu, node;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Kmem cache creation is mostly done with the slab_mutex held,
- * so use a workqueue with limited concurrency to avoid stalling
- * all worker threads in case lots of cgroups are created and
- * destroyed simultaneously.
- */
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
- BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);