mm: kmem: rename (__)memcg_kmem_(un)charge_memcg() to __memcg_kmem_(un)charge()
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d09776c..e6043ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -334,7 +334,7 @@ static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
                if (!old)
                        return 0;
 
-               new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
+               new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
                if (!new)
                        return -ENOMEM;
 
@@ -378,7 +378,7 @@ static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
        mutex_lock(&memcg_shrinker_map_mutex);
        size = memcg_shrinker_map_size;
        for_each_node(nid) {
-               map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
+               map = kvzalloc_node(sizeof(*map) + size, GFP_KERNEL, nid);
                if (!map) {
                        memcg_free_shrinker_maps(memcg);
                        ret = -ENOMEM;
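/*
 * Illustrative sketch, not part of the patch: the two hunks above switch the
 * per-node shrinker bitmaps to the _node() allocator variants, so each node's
 * map is backed by memory on that node rather than on whichever node the
 * allocating task happens to run.  A minimal version of the same pattern for
 * a hypothetical per-node table (names are made up, unwinding omitted):
 */
static int foo_alloc_per_node(void **table, size_t size)
{
        int nid;

        for_each_node(nid) {
                /* allocate on the node the entry describes */
                table[nid] = kvzalloc_node(size, GFP_KERNEL, nid);
                if (!table[nid])
                        return -ENOMEM; /* caller frees entries with kvfree() */
        }
        return 0;
}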
@@ -759,13 +759,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 
 void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
 {
-       struct page *page = virt_to_head_page(p);
-       pg_data_t *pgdat = page_pgdat(page);
+       pg_data_t *pgdat = page_pgdat(virt_to_page(p));
        struct mem_cgroup *memcg;
        struct lruvec *lruvec;
 
        rcu_read_lock();
-       memcg = memcg_from_slab_page(page);
+       memcg = mem_cgroup_from_obj(p);
 
        /* Untracked pages have no memcg, no lruvec. Update only the node */
        if (!memcg || memcg == root_mem_cgroup) {
@@ -777,6 +776,17 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
        rcu_read_unlock();
 }
 
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+       struct mem_cgroup *memcg;
+
+       rcu_read_lock();
+       memcg = mem_cgroup_from_obj(p);
+       if (memcg)
+               mod_memcg_state(memcg, idx, val);
+       rcu_read_unlock();
+}
+
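/*
 * Illustrative sketch, not part of the patch: mod_memcg_obj_state() adjusts a
 * memcg statistic for whichever cgroup a kernel object is charged to, whether
 * the object sits on a slab page or on a dedicated kmem page; the RCU locking
 * is handled internally.  A hypothetical caller (the stat index is chosen
 * purely for illustration) could look like:
 */
static void foo_account_stack(void *stack_base)
{
        /* attribute 16kB worth of kernel stack to the owning memcg */
        mod_memcg_obj_state(stack_base, NR_KERNEL_STACK_KB, 16);
}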
 /**
  * __count_memcg_events - account VM events in a cgroup
  * @memcg: the memory cgroup
@@ -2297,28 +2307,41 @@ static void high_work_func(struct work_struct *work)
  #define MEMCG_DELAY_SCALING_SHIFT 14
 
 /*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies for which to penalise a mischievous cgroup that
+ * is exceeding its memory.high, checking both it and its ancestors.
  */
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+                                         unsigned int nr_pages)
 {
-       unsigned long usage, high, clamped_high;
-       unsigned long pflags;
-       unsigned long penalty_jiffies, overage;
-       unsigned int nr_pages = current->memcg_nr_pages_over_high;
-       struct mem_cgroup *memcg;
+       unsigned long penalty_jiffies;
+       u64 max_overage = 0;
 
-       if (likely(!nr_pages))
-               return;
+       do {
+               unsigned long usage, high;
+               u64 overage;
 
-       memcg = get_mem_cgroup_from_mm(current->mm);
-       reclaim_high(memcg, nr_pages, GFP_KERNEL);
-       current->memcg_nr_pages_over_high = 0;
+               usage = page_counter_read(&memcg->memory);
+               high = READ_ONCE(memcg->high);
+
+               /*
+                * Prevent division by 0 in overage calculation by acting as if
+                * it was a threshold of 1 page
+                */
+               high = max(high, 1UL);
+
+               overage = usage - high;
+               overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+               overage = div64_u64(overage, high);
+
+               if (overage > max_overage)
+                       max_overage = overage;
+       } while ((memcg = parent_mem_cgroup(memcg)) &&
+                !mem_cgroup_is_root(memcg));
+
+       if (!max_overage)
+               return 0;
 
        /*
-        * memory.high is breached and reclaim is unable to keep up. Throttle
-        * allocators proactively to slow down excessive growth.
-        *
         * We use overage compared to memory.high to calculate the number of
         * jiffies to sleep (penalty_jiffies). Ideally this value should be
         * fairly lenient on small overages, and increasingly harsh when the
@@ -2326,24 +2349,9 @@ void mem_cgroup_handle_over_high(void)
         * its crazy behaviour, so we exponentially increase the delay based on
         * overage amount.
         */
-
-       usage = page_counter_read(&memcg->memory);
-       high = READ_ONCE(memcg->high);
-
-       if (usage <= high)
-               goto out;
-
-       /*
-        * Prevent division by 0 in overage calculation by acting as if it was a
-        * threshold of 1 page
-        */
-       clamped_high = max(high, 1UL);
-
-       overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
-                         clamped_high);
-
-       penalty_jiffies = ((u64)overage * overage * HZ)
-               >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+       penalty_jiffies = max_overage * max_overage * HZ;
+       penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+       penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
 
        /*
         * Factor in the task's own contribution to the overage, such that four
@@ -2360,7 +2368,32 @@ void mem_cgroup_handle_over_high(void)
         * application moving forwards and also permit diagnostics, albeit
         * extremely slowly.
         */
-       penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+       return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
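/*
 * Worked example, not part of the patch: taking MEMCG_DELAY_PRECISION_SHIFT
 * as 20 (its value earlier in this file) and MEMCG_DELAY_SCALING_SHIFT as 14,
 * a cgroup running 10% over memory.high gets
 *
 *         overage         = 0.10 * 2^20                  ~= 104857
 *         penalty_jiffies = overage^2 * HZ >> (20 + 14)   = 0.01 * 64 * HZ
 *
 * i.e. about 0.64 seconds of delay, while a 20% overage (0.04 * 64 = 2.56*HZ)
 * already saturates the MEMCG_MAX_HIGH_DELAY_JIFFIES clamp of 2*HZ.  A
 * self-contained model of the curve for a single cgroup, with the
 * usage <= high case clamped to zero since the counters are unsigned:
 */
static unsigned long foo_model_high_delay(unsigned long usage,
                                          unsigned long high)
{
        u64 overage, penalty;

        if (usage <= high)
                return 0;
        high = max(high, 1UL);

        overage = div64_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
                            high);
        penalty = overage * overage * HZ;
        penalty >>= MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT;

        return min((unsigned long)penalty, MEMCG_MAX_HIGH_DELAY_JIFFIES);
}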
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+       unsigned long penalty_jiffies;
+       unsigned long pflags;
+       unsigned int nr_pages = current->memcg_nr_pages_over_high;
+       struct mem_cgroup *memcg;
+
+       if (likely(!nr_pages))
+               return;
+
+       memcg = get_mem_cgroup_from_mm(current->mm);
+       reclaim_high(memcg, nr_pages, GFP_KERNEL);
+       current->memcg_nr_pages_over_high = 0;
+
+       /*
+        * memory.high is breached and reclaim is unable to keep up. Throttle
+        * allocators proactively to slow down excessive growth.
+        */
+       penalty_jiffies = calculate_high_delay(memcg, nr_pages);
 
        /*
         * Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2638,6 +2671,33 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+       struct page *page;
+
+       if (mem_cgroup_disabled())
+               return NULL;
+
+       page = virt_to_head_page(p);
+
+       /*
+        * Slab pages don't have page->mem_cgroup set because corresponding
+        * kmem caches can be reparented during the lifetime. That's why
+        * memcg_from_slab_page() should be used instead.
+        */
+       if (PageSlab(page))
+               return memcg_from_slab_page(page);
+
+       /* All other pages use page->mem_cgroup */
+       return page->mem_cgroup;
+}
+
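/*
 * Illustrative sketch, not part of the patch: callers resolve the object's
 * memcg under rcu_read_lock(), which is what keeps the returned pointer
 * alive, and must not dereference it after unlocking.  A hypothetical helper
 * that checks whether an object is charged within a given subtree:
 */
static bool foo_obj_charged_within(void *p, struct mem_cgroup *root)
{
        struct mem_cgroup *memcg;
        bool ret;

        rcu_read_lock();
        memcg = mem_cgroup_from_obj(p);
        ret = memcg && mem_cgroup_is_descendant(memcg, root);
        rcu_read_unlock();

        return ret;
}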
 static int memcg_alloc_cache_id(void)
 {
        int id, size;
@@ -2821,18 +2881,16 @@ void memcg_kmem_put_cache(struct kmem_cache *cachep)
 }
 
 /**
- * __memcg_kmem_charge_memcg: charge a kmem page
- * @page: page to charge
- * @gfp: reclaim mode
- * @order: allocation order
+ * __memcg_kmem_charge: charge a number of kernel pages to a memcg
  * @memcg: memory cgroup to charge
+ * @gfp: reclaim mode
+ * @nr_pages: number of pages to charge
  *
  * Returns 0 on success, an error code on failure.
  */
-int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
-                           struct mem_cgroup *memcg)
+int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp,
+                       unsigned int nr_pages)
 {
-       unsigned int nr_pages = 1 << order;
        struct page_counter *counter;
        int ret;
 
@@ -2859,14 +2917,29 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 }
 
 /**
- * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_uncharge: uncharge a number of kernel pages from a memcg
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+               page_counter_uncharge(&memcg->kmem, nr_pages);
+
+       page_counter_uncharge(&memcg->memory, nr_pages);
+       if (do_memsw_account())
+               page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+
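/*
 * Illustrative sketch, not part of the patch: after the rename the memcg-level
 * helpers take an explicit page count rather than a page/order pair, so a
 * hypothetical caller charging an order-3 allocation to a known memcg would
 * now do:
 */
static int foo_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, int order)
{
        if (!memcg_kmem_enabled() || mem_cgroup_is_root(memcg))
                return 0;
        return __memcg_kmem_charge(memcg, gfp, 1 << order);
}

static void foo_uncharge_kmem(struct mem_cgroup *memcg, int order)
{
        if (!memcg_kmem_enabled() || mem_cgroup_is_root(memcg))
                return;
        __memcg_kmem_uncharge(memcg, 1 << order);
}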
+/**
+ * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
  * @page: page to charge
  * @gfp: reclaim mode
  * @order: allocation order
  *
  * Returns 0 on success, an error code on failure.
  */
-int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
 {
        struct mem_cgroup *memcg;
        int ret = 0;
@@ -2876,7 +2949,7 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 
        memcg = get_mem_cgroup_from_current();
        if (!mem_cgroup_is_root(memcg)) {
-               ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+               ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
                if (!ret) {
                        page->mem_cgroup = memcg;
                        __SetPageKmemcg(page);
@@ -2887,26 +2960,11 @@ int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
 }
 
 /**
- * __memcg_kmem_uncharge_memcg: uncharge a kmem page
- * @memcg: memcg to uncharge
- * @nr_pages: number of pages to uncharge
- */
-void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
-                                unsigned int nr_pages)
-{
-       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
-               page_counter_uncharge(&memcg->kmem, nr_pages);
-
-       page_counter_uncharge(&memcg->memory, nr_pages);
-       if (do_memsw_account())
-               page_counter_uncharge(&memcg->memsw, nr_pages);
-}
-/**
- * __memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_page: uncharge a kmem page
  * @page: page to uncharge
  * @order: allocation order
  */
-void __memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge_page(struct page *page, int order)
 {
        struct mem_cgroup *memcg = page->mem_cgroup;
        unsigned int nr_pages = 1 << order;
@@ -2915,7 +2973,7 @@ void __memcg_kmem_uncharge(struct page *page, int order)
                return;
 
        VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-       __memcg_kmem_uncharge_memcg(memcg, nr_pages);
+       __memcg_kmem_uncharge(memcg, nr_pages);
        page->mem_cgroup = NULL;
 
        /* slab pages do not have PageKmemcg flag set */
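/*
 * Illustrative sketch, not part of the patch: the _page variants keep the old
 * page/order calling convention and charge the current task's memcg, the way
 * the page allocator accounts __GFP_ACCOUNT allocations.  A hypothetical
 * wrapper allocating an accounted compound page might look like:
 */
static struct page *foo_alloc_accounted_pages(gfp_t gfp, int order)
{
        struct page *page = alloc_pages(gfp, order);

        if (page && memcg_kmem_enabled() &&
            __memcg_kmem_charge_page(page, gfp, order)) {
                __free_pages(page, order);
                return NULL;
        }
        return page;
}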
@@ -4027,7 +4085,7 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
        unsigned long usage;
-       int i, j, size;
+       int i, j, size, entries;
 
        mutex_lock(&memcg->thresholds_lock);
 
@@ -4047,14 +4105,20 @@ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
        __mem_cgroup_threshold(memcg, type == _MEMSWAP);
 
        /* Calculate new number of threshold */
-       size = 0;
+       size = entries = 0;
        for (i = 0; i < thresholds->primary->size; i++) {
                if (thresholds->primary->entries[i].eventfd != eventfd)
                        size++;
+               else
+                       entries++;
        }
 
        new = thresholds->spare;
 
+       /* If no items related to eventfd have been cleared, nothing to do */
+       if (!entries)
+               goto unlock;
+
        /* Set thresholds array to NULL if we don't have thresholds */
        if (!size) {
                kfree(new);
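/*
 * Illustrative sketch, not part of the patch: the counting pass added above is
 * what lets the unregister path bail out before swapping in the spare array
 * when the eventfd was never registered.  The same count-then-filter pattern
 * on a toy array (hypothetical type, locking omitted):
 */
struct foo_threshold {
        struct eventfd_ctx *eventfd;
        unsigned long threshold;
};

static int foo_filter_out(struct foo_threshold *arr, int size,
                          struct eventfd_ctx *eventfd)
{
        int i, kept = 0, matches = 0;

        for (i = 0; i < size; i++)
                if (arr[i].eventfd == eventfd)
                        matches++;

        /* nothing registered under this eventfd: leave the array alone,
         * just as the hunk above skips replacing thresholds->primary */
        if (!matches)
                return size;

        for (i = 0; i < size; i++)
                if (arr[i].eventfd != eventfd)
                        arr[kept++] = arr[i];   /* compact the survivors */

        return kept;
}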
@@ -4725,7 +4789,8 @@ static struct cftype mem_cgroup_legacy_files[] = {
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
+#if defined(CONFIG_MEMCG_KMEM) && \
+       (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
        {
                .name = "kmem.slabinfo",
                .seq_start = memcg_slab_start,
@@ -6682,19 +6747,9 @@ void mem_cgroup_sk_alloc(struct sock *sk)
        if (!mem_cgroup_sockets_enabled)
                return;
 
-       /*
-        * Socket cloning can throw us here with sk_memcg already
-        * filled. It won't however, necessarily happen from
-        * process context. So the test for root memcg given
-        * the current task's memcg won't help us in this case.
-        *
-        * Respecting the original socket's memcg is a better
-        * decision in this case.
-        */
-       if (sk->sk_memcg) {
-               css_get(&sk->sk_memcg->css);
+       /* Do not associate the sock with unrelated interrupted task's memcg. */
+       if (in_interrupt())
                return;
-       }
 
        rcu_read_lock();
        memcg = mem_cgroup_from_task(current);
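/*
 * Illustrative sketch, not part of the patch: the in_interrupt() bail-out
 * encodes the rule that "current" is not a meaningful owner in interrupt
 * context, since the interrupted task has nothing to do with the allocation.
 * A hypothetical accounting hook would apply the same guard before charging:
 */
static struct mem_cgroup *foo_owning_memcg(void)
{
        if (in_interrupt() || !current->mm)
                return NULL;    /* charge no one rather than a bystander */

        return get_mem_cgroup_from_mm(current->mm);
}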