diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f5c0c5..c313c49 100644
@@ -25,7 +25,7 @@
 #include <linux/page_counter.h>
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
 #include <linux/sched/mm.h>
 #include <linux/shmem_fs.h>
 #include <linux/hugetlb.h>
@@ -57,6 +57,7 @@
 #include <linux/lockdep.h>
 #include <linux/file.h>
 #include <linux/tracehook.h>
+#include <linux/psi.h>
 #include <linux/seq_buf.h>
 #include "internal.h"
 #include <net/sock.h>
@@ -87,6 +88,10 @@ int do_swap_account __read_mostly;
 #define do_swap_account                0
 #endif
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
 /* Whether legacy memory+swap accounting is active */
 static bool do_memsw_account(void)
 {
@@ -313,6 +318,7 @@ DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
 
 static int memcg_shrinker_map_size;
 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -436,14 +442,6 @@ void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
        }
 }
 
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
-       return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
 /**
  * mem_cgroup_css_from_page - css of the memcg associated with a page
  * @page: page of interest
@@ -752,15 +750,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
        /* Update memcg */
        __mod_memcg_state(memcg, idx, val);
 
+       /* Update lruvec */
+       __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
        x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
        if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
                struct mem_cgroup_per_node *pi;
 
-               /*
-                * Batch local counters to keep them in sync with
-                * the hierarchical ones.
-                */
-               __this_cpu_add(pn->lruvec_stat_local->count[idx], x);
                for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
                        atomic_long_add(x, &pi->lruvec_stat[idx]);
                x = 0;
@@ -2268,21 +2264,22 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
+               bool flush = false;
 
+               rcu_read_lock();
                memcg = stock->cached;
-               if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
-                       continue;
-               if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
-                       css_put(&memcg->css);
-                       continue;
-               }
-               if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+               if (memcg && stock->nr_pages &&
+                   mem_cgroup_is_descendant(memcg, root_memcg))
+                       flush = true;
+               rcu_read_unlock();
+
+               if (flush &&
+                   !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
                        if (cpu == curcpu)
                                drain_local_stock(&stock->work);
                        else
                                schedule_work_on(cpu, &stock->work);
                }
-               css_put(&memcg->css);
        }
        put_cpu();
        mutex_unlock(&percpu_charge_mutex);
@@ -2356,12 +2353,68 @@ static void high_work_func(struct work_struct *work)
        reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
 }
 
+/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * high enough to still cause a significant slowdown in most cases, while
+ * low enough to let diagnostics and tracing proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these on either side of the
+ * exponentiation to maintain precision and scale to a reasonable number of
+ * jiffies (see the table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ *   overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the proposed
+ *   penalty in order to reduce it to a reasonable number of jiffies, and to
+ *   produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ *  +-------+------------------------+
+ *  | usage | time to allocate in ms |
+ *  +-------+------------------------+
+ *  | 100M  |                      0 |
+ *  | 101M  |                      6 |
+ *  | 102M  |                     25 |
+ *  | 103M  |                     57 |
+ *  | 104M  |                    102 |
+ *  | 105M  |                    159 |
+ *  | 106M  |                    230 |
+ *  | 107M  |                    313 |
+ *  | 108M  |                    409 |
+ *  | 109M  |                    518 |
+ *  | 110M  |                    639 |
+ *  | 111M  |                    774 |
+ *  | 112M  |                    921 |
+ *  | 113M  |                   1081 |
+ *  | 114M  |                   1254 |
+ *  | 115M  |                   1439 |
+ *  | 116M  |                   1638 |
+ *  | 117M  |                   1849 |
+ *  | 118M  |                   2000 |
+ *  | 119M  |                   2000 |
+ *  | 120M  |                   2000 |
+ *  +-------+------------------------+
+ */
+#define MEMCG_DELAY_PRECISION_SHIFT 20
+#define MEMCG_DELAY_SCALING_SHIFT 14
+
 /*
  * Scheduled by try_charge() to be executed from the userland return path
  * and reclaims memory over the high limit.
  */
 void mem_cgroup_handle_over_high(void)
 {
+       unsigned long usage, high, clamped_high;
+       unsigned long pflags;
+       unsigned long penalty_jiffies, overage;
        unsigned int nr_pages = current->memcg_nr_pages_over_high;
        struct mem_cgroup *memcg;
 
@@ -2370,8 +2423,75 @@ void mem_cgroup_handle_over_high(void)
 
        memcg = get_mem_cgroup_from_mm(current->mm);
        reclaim_high(memcg, nr_pages, GFP_KERNEL);
-       css_put(&memcg->css);
        current->memcg_nr_pages_over_high = 0;
+
+       /*
+        * memory.high is breached and reclaim is unable to keep up. Throttle
+        * allocators proactively to slow down excessive growth.
+        *
+        * We use overage compared to memory.high to calculate the number of
+        * jiffies to sleep (penalty_jiffies). Ideally this value should be
+        * fairly lenient on small overages, and increasingly harsh when the
+        * memcg in question makes it clear that it has no intention of stopping
+        * its crazy behaviour, so we exponentially increase the delay based on
+        * overage amount.
+        */
+
+       usage = page_counter_read(&memcg->memory);
+       high = READ_ONCE(memcg->high);
+
+       if (usage <= high)
+               goto out;
+
+       /*
+        * Prevent division by 0 in the overage calculation by acting as if
+        * it were a threshold of 1 page.
+        */
+       clamped_high = max(high, 1UL);
+
+       overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+                         clamped_high);
+
+       penalty_jiffies = ((u64)overage * overage * HZ)
+               >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+       /*
+        * Factor in the task's own contribution to the overage, such that four
+        * N-sized allocations are throttled approximately the same as one
+        * 4N-sized allocation.
+        *
+        * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+        * larger the current charge batch is than that.
+        */
+       penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+       /*
+        * Clamp the max delay per usermode return so as to still keep the
+        * application moving forwards and also permit diagnostics, albeit
+        * extremely slowly.
+        */
+       penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+       /*
+        * Don't sleep if the amount of jiffies this memcg owes us is so low
+        * that it's not even worth doing, in an attempt to be nice to those who
+        * go only a small amount over their memory.high value and maybe haven't
+        * been aggressively reclaimed enough yet.
+        */
+       if (penalty_jiffies <= HZ / 100)
+               goto out;
+
+       /*
+        * If we exit early, we're guaranteed to die (since
+        * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+        * need to account for any ill-begotten jiffies to pay them off later.
+        */
+       psi_memstall_enter(&pflags);
+       schedule_timeout_killable(penalty_jiffies);
+       psi_memstall_leave(&pflags);
+
+out:
+       css_put(&memcg->css);
 }
 
 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
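
For reference, the delay table in the comment above can be reproduced outside
the kernel. The sketch below is a hypothetical userspace model of the same
arithmetic, not part of the patch: it assumes 4 KiB pages, HZ == 1000 and a
full MEMCG_CHARGE_BATCH charge (so the per-batch scaling step cancels out);
the name penalty_ms is made up.

#include <stdio.h>
#include <stdint.h>

#define HZ 1000UL
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL * HZ)
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14

/* usage and high in megabytes; returns the throttling delay in ms */
static unsigned long penalty_ms(unsigned long usage_mb, unsigned long high_mb)
{
	uint64_t usage = (uint64_t)usage_mb * 256;	/* 4 KiB pages */
	uint64_t high = (uint64_t)high_mb * 256;
	uint64_t overage, penalty;

	if (usage <= high)
		return 0;

	overage = ((usage - high) << MEMCG_DELAY_PRECISION_SHIFT) / high;
	penalty = (overage * overage * HZ)
		>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
	if (penalty > MEMCG_MAX_HIGH_DELAY_JIFFIES)
		penalty = MEMCG_MAX_HIGH_DELAY_JIFFIES;

	return (unsigned long)(penalty * 1000 / HZ);	/* jiffies -> ms */
}

int main(void)
{
	/* 110M of usage against a 100M high: prints 639, as in the table */
	printf("%lu\n", penalty_ms(110, 100));
	return 0;
}
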
@@ -2823,6 +2943,16 @@ int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
 
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
            !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+               /*
+                * Enforce __GFP_NOFAIL allocation because callers are not
+                * prepared to see failures and likely do not have any failure
+                * handling code.
+                */
+               if (gfp & __GFP_NOFAIL) {
+                       page_counter_charge(&memcg->kmem, nr_pages);
+                       return 0;
+               }
                cancel_charge(memcg, nr_pages);
                return -ENOMEM;
        }
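
A hypothetical caller, not taken from this patch, showing the case the hunk
above handles: a memcg-accounted allocation that is not allowed to fail.
struct demo_record and demo_alloc_nofail() are made-up names.

/*
 * A __GFP_ACCOUNT allocation is charged to the task's memcg; combined with
 * __GFP_NOFAIL it must not fail.  With the change above, a task whose memcg
 * sits at its kmem limit has the charge pushed through (temporarily
 * overrunning the limit) instead of kmalloc() failing behind a caller that
 * has no error path.
 */
#include <linux/slab.h>

struct demo_record {		/* made-up type, for illustration only */
	int payload;
};

static struct demo_record *demo_alloc_nofail(void)
{
	return kmalloc(sizeof(struct demo_record),
		       GFP_KERNEL | __GFP_ACCOUNT | __GFP_NOFAIL);
}
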
@@ -3260,6 +3390,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
        }
 }
 
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+       unsigned long stat[MEMCG_NR_STAT];
+       struct mem_cgroup *mi;
+       int node, cpu, i;
+       int min_idx, max_idx;
+
+       if (slab_only) {
+               min_idx = NR_SLAB_RECLAIMABLE;
+               max_idx = NR_SLAB_UNRECLAIMABLE;
+       } else {
+               min_idx = 0;
+               max_idx = MEMCG_NR_STAT;
+       }
+
+       for (i = min_idx; i < max_idx; i++)
+               stat[i] = 0;
+
+       for_each_online_cpu(cpu)
+               for (i = min_idx; i < max_idx; i++)
+                       stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+       for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+               for (i = min_idx; i < max_idx; i++)
+                       atomic_long_add(stat[i], &mi->vmstats[i]);
+
+       if (!slab_only)
+               max_idx = NR_VM_NODE_STAT_ITEMS;
+
+       for_each_node(node) {
+               struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+               struct mem_cgroup_per_node *pi;
+
+               for (i = min_idx; i < max_idx; i++)
+                       stat[i] = 0;
+
+               for_each_online_cpu(cpu)
+                       for (i = min_idx; i < max_idx; i++)
+                               stat[i] += per_cpu(
+                                       pn->lruvec_stat_cpu->count[i], cpu);
+
+               for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+                       for (i = min_idx; i < max_idx; i++)
+                               atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+       }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+       unsigned long events[NR_VM_EVENT_ITEMS];
+       struct mem_cgroup *mi;
+       int cpu, i;
+
+       for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+               events[i] = 0;
+
+       for_each_online_cpu(cpu)
+               for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                       events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+                                            cpu);
+
+       for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+               for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+                       atomic_long_add(events[i], &mi->vmevents[i]);
+}
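
The two flush helpers above can be read as the simplified userspace model
below; it is only an illustration with made-up names (demo_memcg, demo_flush):
the pending per-cpu deltas of a dying cgroup are summed once per counter and
the sum is added to the cgroup itself and to every ancestor, so the
hierarchical totals stay correct after the per-cpu data goes away with the
cgroup.

#include <stdio.h>

#define DEMO_NR_CPUS 4

struct demo_memcg {
	struct demo_memcg *parent;
	long percpu[DEMO_NR_CPUS];	/* pending per-cpu deltas */
	long vmstat;			/* flushed hierarchical total */
};

static void demo_flush(struct demo_memcg *memcg)
{
	struct demo_memcg *mi;
	long sum = 0;
	int cpu;

	for (cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		sum += memcg->percpu[cpu];

	for (mi = memcg; mi; mi = mi->parent)
		mi->vmstat += sum;
}

int main(void)
{
	struct demo_memcg root = { 0 };
	struct demo_memcg child = { .parent = &root,
				    .percpu = { 3, -1, 4, 2 } };

	demo_flush(&child);
	printf("child=%ld root=%ld\n", child.vmstat, root.vmstat);	/* 8 8 */
	return 0;
}
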
+
 #ifdef CONFIG_MEMCG_KMEM
 static int memcg_online_kmem(struct mem_cgroup *memcg)
 {
@@ -3309,7 +3505,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
        if (!parent)
                parent = root_mem_cgroup;
 
+       /*
+        * Deactivate and reparent kmem_caches. Then flush percpu
+        * slab statistics to have precise values at the parent and
+        * all ancestor levels. It's required to keep slab stats
+        * accurate after the reparenting of kmem_caches.
+        */
        memcg_deactivate_kmem_caches(memcg, parent);
+       memcg_flush_percpu_vmstats(memcg, true);
 
        kmemcg_id = memcg->kmemcg_id;
        BUG_ON(kmemcg_id < 0);
@@ -3437,6 +3640,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
                        ret = mem_cgroup_resize_max(memcg, nr_pages, true);
                        break;
                case _KMEM:
+                       pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+                                    "Please report your usecase to linux-mm@kvack.org if you "
+                                    "depend on this functionality.\n");
                        ret = memcg_update_kmem_max(memcg, nr_pages);
                        break;
                case _TCP:
@@ -4101,6 +4307,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+#include <trace/events/writeback.h>
+
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
 {
        return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -4184,6 +4392,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
        }
 }
 
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback.  The former
+ * trackes ownership per-page while the latter per-inode.  This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases.  For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode.  B owns the inode and
+ * A's memory is limited far below B's.  A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback.  A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repeatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underlying IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction.  However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism.  When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired.  Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+                                            struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = page->mem_cgroup;
+       struct memcg_cgwb_frn *frn;
+       u64 now = get_jiffies_64();
+       u64 oldest_at = now;
+       int oldest = -1;
+       int i;
+
+       trace_track_foreign_dirty(page, wb);
+
+       /*
+        * Pick the slot to use.  If there is already a slot for @wb, keep
+        * using it.  If not, replace the oldest one which isn't being
+        * written out.
+        */
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+               frn = &memcg->cgwb_frn[i];
+               if (frn->bdi_id == wb->bdi->id &&
+                   frn->memcg_id == wb->memcg_css->id)
+                       break;
+               if (time_before64(frn->at, oldest_at) &&
+                   atomic_read(&frn->done.cnt) == 1) {
+                       oldest = i;
+                       oldest_at = frn->at;
+               }
+       }
+
+       if (i < MEMCG_CGWB_FRN_CNT) {
+               /*
+                * Re-using an existing one.  Update timestamp lazily to
+                * avoid making the cacheline hot.  We want them to be
+                * reasonably up-to-date and significantly shorter than
+                * dirty_expire_interval as that's what expires the record.
+                * Use the shorter of 1s and dirty_expire_interval / 8.
+                */
+               unsigned long update_intv =
+                       min_t(unsigned long, HZ,
+                             msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+               if (time_before64(frn->at, now - update_intv))
+                       frn->at = now;
+       } else if (oldest >= 0) {
+               /* replace the oldest free one */
+               frn = &memcg->cgwb_frn[oldest];
+               frn->bdi_id = wb->bdi->id;
+               frn->memcg_id = wb->memcg_css->id;
+               frn->at = now;
+       }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+       u64 now = jiffies_64;
+       int i;
+
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+               struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+               /*
+                * If the record is older than dirty_expire_interval,
+                * writeback on it has already started.  No need to kick it
+                * off again.  Also, don't start a new one if there's
+                * already one in flight.
+                */
+               if (time_after64(frn->at, now - intv) &&
+                   atomic_read(&frn->done.cnt) == 1) {
+                       frn->at = 0;
+                       trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+                       cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+                                              WB_REASON_FOREIGN_FLUSH,
+                                              &frn->done);
+               }
+       }
+}
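
As a rough illustration, not part of the patch, the slot-picking policy of
mem_cgroup_track_foreign_dirty_slowpath() can be modelled as below with
made-up names (demo_frn, pick_slot): reuse the slot that already matches this
(bdi, memcg) pair, otherwise overwrite the oldest slot with no foreign
writeback in flight. For scale, with the default vm.dirty_expire_centisecs of
3000, the lazy-update interval above works out to min(HZ, 30s / 8) = 1 second,
and mem_cgroup_flush_foreign() treats records older than 30 seconds' worth of
jiffies as already expired.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define DEMO_FRN_CNT 4

struct demo_frn {
	int bdi_id;
	int memcg_id;
	uint64_t at;		/* "jiffies" of the last foreign dirtying */
	bool in_flight;		/* a flush for this record is still running */
};

static struct demo_frn *pick_slot(struct demo_frn frn[DEMO_FRN_CNT],
				  int bdi_id, int memcg_id, uint64_t now)
{
	uint64_t oldest_at = now;
	int oldest = -1;
	int i;

	for (i = 0; i < DEMO_FRN_CNT; i++) {
		if (frn[i].bdi_id == bdi_id && frn[i].memcg_id == memcg_id)
			return &frn[i];		/* reuse the existing record */
		if (!frn[i].in_flight && frn[i].at < oldest_at) {
			oldest = i;
			oldest_at = frn[i].at;
		}
	}
	/* all slots busy or newer than @now: the event is simply dropped */
	return oldest >= 0 ? &frn[oldest] : NULL;
}
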
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4604,11 +4936,6 @@ static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
        }
 }
 
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
-       mem_cgroup_id_get_many(memcg, 1);
-}
-
 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
 {
        mem_cgroup_id_put_many(memcg, 1);
@@ -4682,6 +5009,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
        int node;
 
+       /*
+        * Flush percpu vmstats and vmevents to guarantee the value correctness
+        * on parent's and all ancestor levels.
+        */
+       memcg_flush_percpu_vmstats(memcg, false);
+       memcg_flush_percpu_vmevents(memcg);
        for_each_node(node)
                free_mem_cgroup_per_node_info(memcg, node);
        free_percpu(memcg->vmstats_percpu);
@@ -4700,6 +5033,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
        struct mem_cgroup *memcg;
        unsigned int size;
        int node;
+       int __maybe_unused i;
 
        size = sizeof(struct mem_cgroup);
        size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4743,6 +5077,14 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 #endif
 #ifdef CONFIG_CGROUP_WRITEBACK
        INIT_LIST_HEAD(&memcg->cgwb_list);
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+               memcg->cgwb_frn[i].done =
+                       __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+       INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+       memcg->deferred_split_queue.split_queue_len = 0;
 #endif
        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
        return memcg;
@@ -4872,7 +5214,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
 {
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       int __maybe_unused i;
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+       for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+               wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                static_branch_dec(&memcg_sockets_enabled_key);
 
@@ -5117,6 +5464,14 @@ static int mem_cgroup_move_account(struct page *page,
                __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
        }
 
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && !list_empty(page_deferred_list(page))) {
+               spin_lock(&from->deferred_split_queue.split_queue_lock);
+               list_del_init(page_deferred_list(page));
+               from->deferred_split_queue.split_queue_len--;
+               spin_unlock(&from->deferred_split_queue.split_queue_lock);
+       }
+#endif
        /*
         * It is safe to change page->mem_cgroup here because the page
         * is referenced, charged, and isolated - we can't race with
@@ -5125,6 +5480,17 @@ static int mem_cgroup_move_account(struct page *page,
 
        /* caller should have done css_get */
        page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       if (compound && list_empty(page_deferred_list(page))) {
+               spin_lock(&to->deferred_split_queue.split_queue_lock);
+               list_add_tail(page_deferred_list(page),
+                             &to->deferred_split_queue.split_queue);
+               to->deferred_split_queue.split_queue_len++;
+               spin_unlock(&to->deferred_split_queue.split_queue_lock);
+       }
+#endif
+
        spin_unlock_irqrestore(&from->move_lock, flags);
 
        ret = 0;
@@ -5283,17 +5649,16 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
        return 0;
 }
 
+static const struct mm_walk_ops precharge_walk_ops = {
+       .pmd_entry      = mem_cgroup_count_precharge_pte_range,
+};
+
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
 {
        unsigned long precharge;
 
-       struct mm_walk mem_cgroup_count_precharge_walk = {
-               .pmd_entry = mem_cgroup_count_precharge_pte_range,
-               .mm = mm,
-       };
        down_read(&mm->mmap_sem);
-       walk_page_range(0, mm->highest_vm_end,
-                       &mem_cgroup_count_precharge_walk);
+       walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
        up_read(&mm->mmap_sem);
 
        precharge = mc.precharge;
@@ -5562,13 +5927,12 @@ put:                    /* get_mctgt_type() gets the page */
        return ret;
 }
 
+static const struct mm_walk_ops charge_walk_ops = {
+       .pmd_entry      = mem_cgroup_move_charge_pte_range,
+};
+
 static void mem_cgroup_move_charge(void)
 {
-       struct mm_walk mem_cgroup_move_charge_walk = {
-               .pmd_entry = mem_cgroup_move_charge_pte_range,
-               .mm = mc.mm,
-       };
-
        lru_add_drain_all();
        /*
         * Signal lock_page_memcg() to take the memcg's move_lock
@@ -5594,7 +5958,8 @@ retry:
         * When we have consumed all precharges and failed in doing
         * additional charge, the page walk just aborts.
         */
-       walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+       walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+                       NULL);
 
        up_read(&mc.mm->mmap_sem);
        atomic_dec(&mc.from->moving_account);
@@ -6296,7 +6661,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                unsigned int nr_pages = 1;
 
                if (PageTransHuge(page)) {
-                       nr_pages <<= compound_order(page);
+                       nr_pages = compound_nr(page);
                        ug->nr_huge += nr_pages;
                }
                if (PageAnon(page))
@@ -6308,7 +6673,7 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
                }
                ug->pgpgout++;
        } else {
-               ug->nr_kmem += 1 << compound_order(page);
+               ug->nr_kmem += compound_nr(page);
                __ClearPageKmemcg(page);
        }