mm/memcontrol.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /* memcontrol.c - Memory Controller
   3  *
   4  * Copyright IBM Corporation, 2007
   5  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   6  *
   7  * Copyright 2007 OpenVZ SWsoft Inc
   8  * Author: Pavel Emelianov <xemul@openvz.org>
   9  *
  10  * Memory thresholds
  11  * Copyright (C) 2009 Nokia Corporation
  12  * Author: Kirill A. Shutemov
  13  *
  14  * Kernel Memory Controller
  15  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  16  * Authors: Glauber Costa and Suleiman Souhlal
  17  *
  18  * Native page reclaim
  19  * Charge lifetime sanitation
  20  * Lockless page tracking & accounting
  21  * Unified hierarchy configuration model
  22  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  23  *
  24  * Per memcg lru locking
  25  * Copyright (C) 2020 Alibaba, Inc, Alex Shi
  26  */
  27
  28 #include <linux/page_counter.h>
  29 #include <linux/memcontrol.h>
  30 #include <linux/cgroup.h>
  31 #include <linux/pagewalk.h>
  32 #include <linux/sched/mm.h>
  33 #include <linux/shmem_fs.h>
  34 #include <linux/hugetlb.h>
  35 #include <linux/pagemap.h>
  36 #include <linux/vm_event_item.h>
  37 #include <linux/smp.h>
  38 #include <linux/page-flags.h>
  39 #include <linux/backing-dev.h>
  40 #include <linux/bit_spinlock.h>
  41 #include <linux/rcupdate.h>
  42 #include <linux/limits.h>
  43 #include <linux/export.h>
  44 #include <linux/mutex.h>
  45 #include <linux/rbtree.h>
  46 #include <linux/slab.h>
  47 #include <linux/swap.h>
  48 #include <linux/swapops.h>
  49 #include <linux/spinlock.h>
  50 #include <linux/eventfd.h>
  51 #include <linux/poll.h>
  52 #include <linux/sort.h>
  53 #include <linux/fs.h>
  54 #include <linux/seq_file.h>
  55 #include <linux/vmpressure.h>
  56 #include <linux/mm_inline.h>
  57 #include <linux/swap_cgroup.h>
  58 #include <linux/cpu.h>
  59 #include <linux/oom.h>
  60 #include <linux/lockdep.h>
  61 #include <linux/file.h>
  62 #include <linux/tracehook.h>
  63 #include <linux/psi.h>
  64 #include <linux/seq_buf.h>
  65 #include "internal.h"
  66 #include <net/sock.h>
  67 #include <net/ip.h>
  68 #include "slab.h"
  69
  70 #include <linux/uaccess.h>
  71
  72 #include <trace/events/vmscan.h>
  73
  74 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  75 EXPORT_SYMBOL(memory_cgrp_subsys);
  76
  77 struct mem_cgroup *root_mem_cgroup __read_mostly;
  78
  79 /* Active memory cgroup to use from an interrupt context */
  80 DEFINE_PER_CPU(struct mem_cgroup *, int_active_memcg);
  81 EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg);
  82
  83 /* Socket memory accounting disabled? */
  84 static bool cgroup_memory_nosocket __ro_after_init;
  85
  86 /* Kernel memory accounting disabled? */
  87 static bool cgroup_memory_nokmem __ro_after_init;
  88
  89 /* Whether the swap controller is active */
  90 #ifdef CONFIG_MEMCG_SWAP
  91 bool cgroup_memory_noswap __ro_after_init;
  92 #else
  93 #define cgroup_memory_noswap            1
  94 #endif
  95
  96 #ifdef CONFIG_CGROUP_WRITEBACK
  97 static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
  98 #endif
  99
 100 /* Whether legacy memory+swap accounting is active */
 101 static bool do_memsw_account(void)
 102 {
 103         return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
 104 }
 105
 106 #define THRESHOLDS_EVENTS_TARGET 128
 107 #define SOFTLIMIT_EVENTS_TARGET 1024
 108
 109 /*
 110  * Cgroups above their limits are maintained in a RB-Tree, independent of
 111  * their hierarchy representation
 112  */
 113
 114 struct mem_cgroup_tree_per_node {
 115         struct rb_root rb_root;
 116         struct rb_node *rb_rightmost;
 117         spinlock_t lock;
 118 };
 119
 120 struct mem_cgroup_tree {
 121         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 122 };
 123
 124 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 125
 126 /* for OOM */
 127 struct mem_cgroup_eventfd_list {
 128         struct list_head list;
 129         struct eventfd_ctx *eventfd;
 130 };
 131
 132 /*
 133  * cgroup_event represents events which userspace want to receive.
 134  */
 135 struct mem_cgroup_event {
 136         /*
 137          * memcg which the event belongs to.
 138          */
 139         struct mem_cgroup *memcg;
 140         /*
 141          * eventfd to signal userspace about the event.
 142          */
 143         struct eventfd_ctx *eventfd;
 144         /*
 145          * Each of these stored in a list by the cgroup.
 146          */
 147         struct list_head list;
 148         /*
 149          * register_event() callback will be used to add new userspace
 150          * waiter for changes related to this event.  Use eventfd_signal()
 151          * on eventfd to send notification to userspace.
 152          */
 153         int (*register_event)(struct mem_cgroup *memcg,
 154                               struct eventfd_ctx *eventfd, const char *args);
 155         /*
 156          * unregister_event() callback will be called when userspace closes
 157          * the eventfd or on cgroup removing.  This callback must be set,
 158          * if you want provide notification functionality.
 159          */
 160         void (*unregister_event)(struct mem_cgroup *memcg,
 161                                  struct eventfd_ctx *eventfd);
 162         /*
 163          * All fields below needed to unregister event when
 164          * userspace closes eventfd.
 165          */
 166         poll_table pt;
 167         wait_queue_head_t *wqh;
 168         wait_queue_entry_t wait;
 169         struct work_struct remove;
 170 };
 171
 172 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 173 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 174
 175 /* Stuffs for move charges at task migration. */
 176 /*
 177  * Types of charges to be moved.
 178  */
 179 #define MOVE_ANON       0x1U
 180 #define MOVE_FILE       0x2U
 181 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
 182
 183 /* "mc" and its members are protected by cgroup_mutex */
 184 static struct move_charge_struct {
 185         spinlock_t        lock; /* for from, to */
 186         struct mm_struct  *mm;
 187         struct mem_cgroup *from;
 188         struct mem_cgroup *to;
 189         unsigned long flags;
 190         unsigned long precharge;
 191         unsigned long moved_charge;
 192         unsigned long moved_swap;
 193         struct task_struct *moving_task;        /* a task moving charges */
 194         wait_queue_head_t waitq;                /* a waitq for other context */
 195 } mc = {
 196         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
 197         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 198 };
 199
 200 /*
 201  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
 202  * limit reclaim to prevent infinite loops, if they ever occur.
 203  */
 204 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 205 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 206
 207 /* for encoding cft->private value on file */
 208 enum res_type {
 209         _MEM,
 210         _MEMSWAP,
 211         _OOM_TYPE,
 212         _KMEM,
 213         _TCP,
 214 };
 215
 216 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
 217 #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
 218 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 219 /* Used for OOM notifier */
 220 #define OOM_CONTROL             (0)
 221
 222 /*
 223  * Iteration constructs for visiting all cgroups (under a tree).  If
 224  * loops are exited prematurely (break), mem_cgroup_iter_break() must
 225  * be used for reference counting.
 226  */
 227 #define for_each_mem_cgroup_tree(iter, root)            \
 228         for (iter = mem_cgroup_iter(root, NULL, NULL);  \
 229              iter != NULL;                              \
 230              iter = mem_cgroup_iter(root, iter, NULL))
 231
 232 #define for_each_mem_cgroup(iter)                       \
 233         for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
 234              iter != NULL;                              \
 235              iter = mem_cgroup_iter(NULL, iter, NULL))
 236
 237 static inline bool task_is_dying(void)
 238 {
 239         return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
 240                 (current->flags & PF_EXITING);
 241 }
 242
 243 /* Some nice accessors for the vmpressure. */
 244 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 245 {
 246         if (!memcg)
 247                 memcg = root_mem_cgroup;
 248         return &memcg->vmpressure;
 249 }
 250
 251 struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr)
 252 {
 253         return container_of(vmpr, struct mem_cgroup, vmpressure);
 254 }
 255
 256 #ifdef CONFIG_MEMCG_KMEM
 257 static DEFINE_SPINLOCK(objcg_lock);
 258
 259 bool mem_cgroup_kmem_disabled(void)
 260 {
 261         return cgroup_memory_nokmem;
 262 }
 263
 264 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
 265                                       unsigned int nr_pages);
 266
 267 static void obj_cgroup_release(struct percpu_ref *ref)
 268 {
 269         struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
 270         unsigned int nr_bytes;
 271         unsigned int nr_pages;
 272         unsigned long flags;
 273
 274         /*
 275          * At this point all allocated objects are freed, and
 276          * objcg->nr_charged_bytes can't have an arbitrary byte value.
 277          * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
 278          *
 279          * The following sequence can lead to it:
 280          * 1) CPU0: objcg == stock->cached_objcg
 281          * 2) CPU1: we do a small allocation (e.g. 92 bytes),
 282          *          PAGE_SIZE bytes are charged
 283          * 3) CPU1: a process from another memcg is allocating something,
 284          *          the stock if flushed,
 285          *          objcg->nr_charged_bytes = PAGE_SIZE - 92
 286          * 5) CPU0: we do release this object,
 287          *          92 bytes are added to stock->nr_bytes
 288          * 6) CPU0: stock is flushed,
 289          *          92 bytes are added to objcg->nr_charged_bytes
 290          *
 291          * In the result, nr_charged_bytes == PAGE_SIZE.
 292          * This page will be uncharged in obj_cgroup_release().
 293          */
 294         nr_bytes = atomic_read(&objcg->nr_charged_bytes);
 295         WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
 296         nr_pages = nr_bytes >> PAGE_SHIFT;
 297
 298         if (nr_pages)
 299                 obj_cgroup_uncharge_pages(objcg, nr_pages);
 300
 301         spin_lock_irqsave(&objcg_lock, flags);
 302         list_del(&objcg->list);
 303         spin_unlock_irqrestore(&objcg_lock, flags);
 304
 305         percpu_ref_exit(ref);
 306         kfree_rcu(objcg, rcu);
 307 }
 308
 309 static struct obj_cgroup *obj_cgroup_alloc(void)
 310 {
 311         struct obj_cgroup *objcg;
 312         int ret;
 313
 314         objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
 315         if (!objcg)
 316                 return NULL;
 317
 318         ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
 319                               GFP_KERNEL);
 320         if (ret) {
 321                 kfree(objcg);
 322                 return NULL;
 323         }
 324         INIT_LIST_HEAD(&objcg->list);
 325         return objcg;
 326 }
 327
 328 static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
 329                                   struct mem_cgroup *parent)
 330 {
 331         struct obj_cgroup *objcg, *iter;
 332
 333         objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
 334
 335         spin_lock_irq(&objcg_lock);
 336
 337         /* 1) Ready to reparent active objcg. */
 338         list_add(&objcg->list, &memcg->objcg_list);
 339         /* 2) Reparent active objcg and already reparented objcgs to parent. */
 340         list_for_each_entry(iter, &memcg->objcg_list, list)
 341                 WRITE_ONCE(iter->memcg, parent);
 342         /* 3) Move already reparented objcgs to the parent's list */
 343         list_splice(&memcg->objcg_list, &parent->objcg_list);
 344
 345         spin_unlock_irq(&objcg_lock);
 346
 347         percpu_ref_kill(&objcg->refcnt);
 348 }
 349
 350 /*
 351  * This will be used as a shrinker list's index.
 352  * The main reason for not using cgroup id for this:
 353  *  this works better in sparse environments, where we have a lot of memcgs,
 354  *  but only a few kmem-limited. Or also, if we have, for instance, 200
 355  *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
 356  *  200 entry array for that.
 357  *
 358  * The current size of the caches array is stored in memcg_nr_cache_ids. It
 359  * will double each time we have to increase it.
 360  */
 361 static DEFINE_IDA(memcg_cache_ida);
 362 int memcg_nr_cache_ids;
 363
 364 /* Protects memcg_nr_cache_ids */
 365 static DECLARE_RWSEM(memcg_cache_ids_sem);
 366
 367 void memcg_get_cache_ids(void)
 368 {
 369         down_read(&memcg_cache_ids_sem);
 370 }
 371
 372 void memcg_put_cache_ids(void)
 373 {
 374         up_read(&memcg_cache_ids_sem);
 375 }
 376
 377 /*
 378  * MIN_SIZE is different than 1, because we would like to avoid going through
 379  * the alloc/free process all the time. In a small machine, 4 kmem-limited
 380  * cgroups is a reasonable guess. In the future, it could be a parameter or
 381  * tunable, but that is strictly not necessary.
 382  *
 383  * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
 384  * this constant directly from cgroup, but it is understandable that this is
 385  * better kept as an internal representation in cgroup.c. In any case, the
 386  * cgrp_id space is not getting any smaller, and we don't have to necessarily
 387  * increase ours as well if it increases.
 388  */
 389 #define MEMCG_CACHES_MIN_SIZE 4
 390 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 391
 392 /*
 393  * A lot of the calls to the cache allocation functions are expected to be
 394  * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
 395  * conditional to this static branch, we'll have to allow modules that does
 396  * kmem_cache_alloc and the such to see this symbol as well
 397  */
 398 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 399 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 400 #endif
 401
 402 /**
 403  * mem_cgroup_css_from_page - css of the memcg associated with a page
 404  * @page: page of interest
 405  *
 406  * If memcg is bound to the default hierarchy, css of the memcg associated
 407  * with @page is returned.  The returned css remains associated with @page
 408  * until it is released.
 409  *
 410  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 411  * is returned.
 412  */
 413 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 414 {
 415         struct mem_cgroup *memcg;
 416
 417         memcg = page_memcg(page);
 418
 419         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 420                 memcg = root_mem_cgroup;
 421
 422         return &memcg->css;
 423 }
 424
 425 /**
 426  * page_cgroup_ino - return inode number of the memcg a page is charged to
 427  * @page: the page
 428  *
 429  * Look up the closest online ancestor of the memory cgroup @page is charged to
 430  * and return its inode number or 0 if @page is not charged to any cgroup. It
 431  * is safe to call this function without holding a reference to @page.
 432  *
 433  * Note, this function is inherently racy, because there is nothing to prevent
 434  * the cgroup inode from getting torn down and potentially reallocated a moment
 435  * after page_cgroup_ino() returns, so it only should be used by callers that
 436  * do not care (such as procfs interfaces).
 437  */
 438 ino_t page_cgroup_ino(struct page *page)
 439 {
 440         struct mem_cgroup *memcg;
 441         unsigned long ino = 0;
 442
 443         rcu_read_lock();
 444         memcg = page_memcg_check(page);
 445
 446         while (memcg && !(memcg->css.flags & CSS_ONLINE))
 447                 memcg = parent_mem_cgroup(memcg);
 448         if (memcg)
 449                 ino = cgroup_ino(memcg->css.cgroup);
 450         rcu_read_unlock();
 451         return ino;
 452 }
 453
 454 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 455                                          struct mem_cgroup_tree_per_node *mctz,
 456                                          unsigned long new_usage_in_excess)
 457 {
 458         struct rb_node **p = &mctz->rb_root.rb_node;
 459         struct rb_node *parent = NULL;
 460         struct mem_cgroup_per_node *mz_node;
 461         bool rightmost = true;
 462
 463         if (mz->on_tree)
 464                 return;
 465
 466         mz->usage_in_excess = new_usage_in_excess;
 467         if (!mz->usage_in_excess)
 468                 return;
 469         while (*p) {
 470                 parent = *p;
 471                 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 472                                         tree_node);
 473                 if (mz->usage_in_excess < mz_node->usage_in_excess) {
 474                         p = &(*p)->rb_left;
 475                         rightmost = false;
 476                 } else {
 477                         p = &(*p)->rb_right;
 478                 }
 479         }
 480
 481         if (rightmost)
 482                 mctz->rb_rightmost = &mz->tree_node;
 483
 484         rb_link_node(&mz->tree_node, parent, p);
 485         rb_insert_color(&mz->tree_node, &mctz->rb_root);
 486         mz->on_tree = true;
 487 }
 488
 489 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 490                                          struct mem_cgroup_tree_per_node *mctz)
 491 {
 492         if (!mz->on_tree)
 493                 return;
 494
 495         if (&mz->tree_node == mctz->rb_rightmost)
 496                 mctz->rb_rightmost = rb_prev(&mz->tree_node);
 497
 498         rb_erase(&mz->tree_node, &mctz->rb_root);
 499         mz->on_tree = false;
 500 }
 501
 502 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 503                                        struct mem_cgroup_tree_per_node *mctz)
 504 {
 505         unsigned long flags;
 506
 507         spin_lock_irqsave(&mctz->lock, flags);
 508         __mem_cgroup_remove_exceeded(mz, mctz);
 509         spin_unlock_irqrestore(&mctz->lock, flags);
 510 }
 511
 512 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 513 {
 514         unsigned long nr_pages = page_counter_read(&memcg->memory);
 515         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 516         unsigned long excess = 0;
 517
 518         if (nr_pages > soft_limit)
 519                 excess = nr_pages - soft_limit;
 520
 521         return excess;
 522 }
 523
 524 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, int nid)
 525 {
 526         unsigned long excess;
 527         struct mem_cgroup_per_node *mz;
 528         struct mem_cgroup_tree_per_node *mctz;
 529
 530         mctz = soft_limit_tree.rb_tree_per_node[nid];
 531         if (!mctz)
 532                 return;
 533         /*
 534          * Necessary to update all ancestors when hierarchy is used.
 535          * because their event counter is not touched.
 536          */
 537         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 538                 mz = memcg->nodeinfo[nid];
 539                 excess = soft_limit_excess(memcg);
 540                 /*
 541                  * We have to update the tree if mz is on RB-tree or
 542                  * mem is over its softlimit.
 543                  */
 544                 if (excess || mz->on_tree) {
 545                         unsigned long flags;
 546
 547                         spin_lock_irqsave(&mctz->lock, flags);
 548                         /* if on-tree, remove it */
 549                         if (mz->on_tree)
 550                                 __mem_cgroup_remove_exceeded(mz, mctz);
 551                         /*
 552                          * Insert again. mz->usage_in_excess will be updated.
 553                          * If excess is 0, no tree ops.
 554                          */
 555                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
 556                         spin_unlock_irqrestore(&mctz->lock, flags);
 557                 }
 558         }
 559 }
 560
 561 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 562 {
 563         struct mem_cgroup_tree_per_node *mctz;
 564         struct mem_cgroup_per_node *mz;
 565         int nid;
 566
 567         for_each_node(nid) {
 568                 mz = memcg->nodeinfo[nid];
 569                 mctz = soft_limit_tree.rb_tree_per_node[nid];
 570                 if (mctz)
 571                         mem_cgroup_remove_exceeded(mz, mctz);
 572         }
 573 }
 574
 575 static struct mem_cgroup_per_node *
 576 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 577 {
 578         struct mem_cgroup_per_node *mz;
 579
 580 retry:
 581         mz = NULL;
 582         if (!mctz->rb_rightmost)
 583                 goto done;              /* Nothing to reclaim from */
 584
 585         mz = rb_entry(mctz->rb_rightmost,
 586                       struct mem_cgroup_per_node, tree_node);
 587         /*
 588          * Remove the node now but someone else can add it back,
 589          * we will to add it back at the end of reclaim to its correct
 590          * position in the tree.
 591          */
 592         __mem_cgroup_remove_exceeded(mz, mctz);
 593         if (!soft_limit_excess(mz->memcg) ||
 594             !css_tryget(&mz->memcg->css))
 595                 goto retry;
 596 done:
 597         return mz;
 598 }
 599
 600 static struct mem_cgroup_per_node *
 601 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 602 {
 603         struct mem_cgroup_per_node *mz;
 604
 605         spin_lock_irq(&mctz->lock);
 606         mz = __mem_cgroup_largest_soft_limit_node(mctz);
 607         spin_unlock_irq(&mctz->lock);
 608         return mz;
 609 }
 610
 611 /*
 612  * memcg and lruvec stats flushing
 613  *
 614  * Many codepaths leading to stats update or read are performance sensitive and
 615  * adding stats flushing in such codepaths is not desirable. So, to optimize the
 616  * flushing the kernel does:
 617  *
 618  * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
 619  *    rstat update tree grow unbounded.
 620  *
 621  * 2) Flush the stats synchronously on reader side only when there are more than
 622  *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
 623  *    will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
 624  *    only for 2 seconds due to (1).
 625  */
 626 static void flush_memcg_stats_dwork(struct work_struct *w);
 627 static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
 628 static DEFINE_SPINLOCK(stats_flush_lock);
 629 static DEFINE_PER_CPU(unsigned int, stats_updates);
 630 static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
 631
 632 /*
 633  * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
 634  * not rely on this as part of an acquired spinlock_t lock. These functions are
 635  * never used in hardirq context on PREEMPT_RT and therefore disabling preemtion
 636  * is sufficient.
 637  */
 638 static void memcg_stats_lock(void)
 639 {
 640 #ifdef CONFIG_PREEMPT_RT
 641       preempt_disable();
 642 #else
 643       VM_BUG_ON(!irqs_disabled());
 644 #endif
 645 }
 646
 647 static void __memcg_stats_lock(void)
 648 {
 649 #ifdef CONFIG_PREEMPT_RT
 650       preempt_disable();
 651 #endif
 652 }
 653
 654 static void memcg_stats_unlock(void)
 655 {
 656 #ifdef CONFIG_PREEMPT_RT
 657       preempt_enable();
 658 #endif
 659 }
 660
 661 static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
 662 {
 663         unsigned int x;
 664
 665         cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
 666
 667         x = __this_cpu_add_return(stats_updates, abs(val));
 668         if (x > MEMCG_CHARGE_BATCH) {
 669                 atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
 670                 __this_cpu_write(stats_updates, 0);
 671         }
 672 }
 673
 674 static void __mem_cgroup_flush_stats(void)
 675 {
 676         unsigned long flag;
 677
 678         if (!spin_trylock_irqsave(&stats_flush_lock, flag))
 679                 return;
 680
 681         cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
 682         atomic_set(&stats_flush_threshold, 0);
 683         spin_unlock_irqrestore(&stats_flush_lock, flag);
 684 }
 685
 686 void mem_cgroup_flush_stats(void)
 687 {
 688         if (atomic_read(&stats_flush_threshold) > num_online_cpus())
 689                 __mem_cgroup_flush_stats();
 690 }
 691
 692 static void flush_memcg_stats_dwork(struct work_struct *w)
 693 {
 694         __mem_cgroup_flush_stats();
 695         queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
 696 }
 697
 698 /**
 699  * __mod_memcg_state - update cgroup memory statistics
 700  * @memcg: the memory cgroup
 701  * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
 702  * @val: delta to add to the counter, can be negative
 703  */
 704 void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
 705 {
 706         if (mem_cgroup_disabled())
 707                 return;
 708
 709         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 710         memcg_rstat_updated(memcg, val);
 711 }
 712
 713 /* idx can be of type enum memcg_stat_item or node_stat_item. */
 714 static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
 715 {
 716         long x = 0;
 717         int cpu;
 718
 719         for_each_possible_cpu(cpu)
 720                 x += per_cpu(memcg->vmstats_percpu->state[idx], cpu);
 721 #ifdef CONFIG_SMP
 722         if (x < 0)
 723                 x = 0;
 724 #endif
 725         return x;
 726 }
 727
 728 void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 729                               int val)
 730 {
 731         struct mem_cgroup_per_node *pn;
 732         struct mem_cgroup *memcg;
 733
 734         pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
 735         memcg = pn->memcg;
 736
 737         /*
 738          * The caller from rmap relay on disabled preemption becase they never
 739          * update their counter from in-interrupt context. For these two
 740          * counters we check that the update is never performed from an
 741          * interrupt context while other caller need to have disabled interrupt.
 742          */
 743         __memcg_stats_lock();
 744         if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
 745                 switch (idx) {
 746                 case NR_ANON_MAPPED:
 747                 case NR_FILE_MAPPED:
 748                 case NR_ANON_THPS:
 749                 case NR_SHMEM_PMDMAPPED:
 750                 case NR_FILE_PMDMAPPED:
 751                         WARN_ON_ONCE(!in_task());
 752                         break;
 753                 default:
 754                         WARN_ON_ONCE(!irqs_disabled());
 755                 }
 756         }
 757
 758         /* Update memcg */
 759         __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
 760
 761         /* Update lruvec */
 762         __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
 763
 764         memcg_rstat_updated(memcg, val);
 765         memcg_stats_unlock();
 766 }
 767
 768 /**
 769  * __mod_lruvec_state - update lruvec memory statistics
 770  * @lruvec: the lruvec
 771  * @idx: the stat item
 772  * @val: delta to add to the counter, can be negative
 773  *
 774  * The lruvec is the intersection of the NUMA node and a cgroup. This
 775  * function updates the all three counters that are affected by a
 776  * change of state at this level: per-node, per-cgroup, per-lruvec.
 777  */
 778 void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
 779                         int val)
 780 {
 781         /* Update node */
 782         __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
 783
 784         /* Update memcg and lruvec */
 785         if (!mem_cgroup_disabled())
 786                 __mod_memcg_lruvec_state(lruvec, idx, val);
 787 }
 788
 789 void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
 790                              int val)
 791 {
 792         struct page *head = compound_head(page); /* rmap on tail pages */
 793         struct mem_cgroup *memcg;
 794         pg_data_t *pgdat = page_pgdat(page);
 795         struct lruvec *lruvec;
 796
 797         rcu_read_lock();
 798         memcg = page_memcg(head);
 799         /* Untracked pages have no memcg, no lruvec. Update only the node */
 800         if (!memcg) {
 801                 rcu_read_unlock();
 802                 __mod_node_page_state(pgdat, idx, val);
 803                 return;
 804         }
 805
 806         lruvec = mem_cgroup_lruvec(memcg, pgdat);
 807         __mod_lruvec_state(lruvec, idx, val);
 808         rcu_read_unlock();
 809 }
 810 EXPORT_SYMBOL(__mod_lruvec_page_state);
 811
 812 void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
 813 {
 814         pg_data_t *pgdat = page_pgdat(virt_to_page(p));
 815         struct mem_cgroup *memcg;
 816         struct lruvec *lruvec;
 817
 818         rcu_read_lock();
 819         memcg = mem_cgroup_from_obj(p);
 820
 821         /*
 822          * Untracked pages have no memcg, no lruvec. Update only the
 823          * node. If we reparent the slab objects to the root memcg,
 824          * when we free the slab object, we need to update the per-memcg
 825          * vmstats to keep it correct for the root memcg.
 826          */
 827         if (!memcg) {
 828                 __mod_node_page_state(pgdat, idx, val);
 829         } else {
 830                 lruvec = mem_cgroup_lruvec(memcg, pgdat);
 831                 __mod_lruvec_state(lruvec, idx, val);
 832         }
 833         rcu_read_unlock();
 834 }
 835
 836 /**
 837  * __count_memcg_events - account VM events in a cgroup
 838  * @memcg: the memory cgroup
 839  * @idx: the event item
 840  * @count: the number of events that occurred
 841  */
 842 void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
 843                           unsigned long count)
 844 {
 845         if (mem_cgroup_disabled())
 846                 return;
 847
 848         memcg_stats_lock();
 849         __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
 850         memcg_rstat_updated(memcg, count);
 851         memcg_stats_unlock();
 852 }
 853
 854 static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
 855 {
 856         return READ_ONCE(memcg->vmstats.events[event]);
 857 }
 858
 859 static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
 860 {
 861         long x = 0;
 862         int cpu;
 863
 864         for_each_possible_cpu(cpu)
 865                 x += per_cpu(memcg->vmstats_percpu->events[event], cpu);
 866         return x;
 867 }
 868
 869 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 870                                          int nr_pages)
 871 {
 872         /* pagein of a big page is an event. So, ignore page size */
 873         if (nr_pages > 0)
 874                 __count_memcg_events(memcg, PGPGIN, 1);
 875         else {
 876                 __count_memcg_events(memcg, PGPGOUT, 1);
 877                 nr_pages = -nr_pages; /* for event */
 878         }
 879
 880         __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
 881 }
 882
 883 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 884                                        enum mem_cgroup_events_target target)
 885 {
 886         unsigned long val, next;
 887
 888         val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
 889         next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
 890         /* from time_after() in jiffies.h */
 891         if ((long)(next - val) < 0) {
 892                 switch (target) {
 893                 case MEM_CGROUP_TARGET_THRESH:
 894                         next = val + THRESHOLDS_EVENTS_TARGET;
 895                         break;
 896                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 897                         next = val + SOFTLIMIT_EVENTS_TARGET;
 898                         break;
 899                 default:
 900                         break;
 901                 }
 902                 __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
 903                 return true;
 904         }
 905         return false;
 906 }
 907
 908 /*
 909  * Check events in order.
 910  *
 911  */
 912 static void memcg_check_events(struct mem_cgroup *memcg, int nid)
 913 {
 914         if (IS_ENABLED(CONFIG_PREEMPT_RT))
 915                 return;
 916
 917         /* threshold event is triggered in finer grain than soft limit */
 918         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 919                                                 MEM_CGROUP_TARGET_THRESH))) {
 920                 bool do_softlimit;
 921
 922                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 923                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 924                 mem_cgroup_threshold(memcg);
 925                 if (unlikely(do_softlimit))
 926                         mem_cgroup_update_tree(memcg, nid);
 927         }
 928 }
 929
 930 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 931 {
 932         /*
 933          * mm_update_next_owner() may clear mm->owner to NULL
 934          * if it races with swapoff, page migration, etc.
 935          * So this can be called with p == NULL.
 936          */
 937         if (unlikely(!p))
 938                 return NULL;
 939
 940         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 941 }
 942 EXPORT_SYMBOL(mem_cgroup_from_task);
 943
 944 static __always_inline struct mem_cgroup *active_memcg(void)
 945 {
 946         if (!in_task())
 947                 return this_cpu_read(int_active_memcg);
 948         else
 949                 return current->active_memcg;
 950 }
 951
 952 /**
 953  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
 954  * @mm: mm from which memcg should be extracted. It can be NULL.
 955  *
 956  * Obtain a reference on mm->memcg and returns it if successful. If mm
 957  * is NULL, then the memcg is chosen as follows:
 958  * 1) The active memcg, if set.
 959  * 2) current->mm->memcg, if available
 960  * 3) root memcg
 961  * If mem_cgroup is disabled, NULL is returned.
 962  */
 963 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 964 {
 965         struct mem_cgroup *memcg;
 966
 967         if (mem_cgroup_disabled())
 968                 return NULL;
 969
 970         /*
 971          * Page cache insertions can happen without an
 972          * actual mm context, e.g. during disk probing
 973          * on boot, loopback IO, acct() writes etc.
 974          *
 975          * No need to css_get on root memcg as the reference
 976          * counting is disabled on the root level in the
 977          * cgroup core. See CSS_NO_REF.
 978          */
 979         if (unlikely(!mm)) {
 980                 memcg = active_memcg();
 981                 if (unlikely(memcg)) {
 982                         /* remote memcg must hold a ref */
 983                         css_get(&memcg->css);
 984                         return memcg;
 985                 }
 986                 mm = current->mm;
 987                 if (unlikely(!mm))
 988                         return root_mem_cgroup;
 989         }
 990
 991         rcu_read_lock();
 992         do {
 993                 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 994                 if (unlikely(!memcg))
 995                         memcg = root_mem_cgroup;
 996         } while (!css_tryget(&memcg->css));
 997         rcu_read_unlock();
 998         return memcg;
 999 }
1000 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
1001
1002 static __always_inline bool memcg_kmem_bypass(void)
1003 {
1004         /* Allow remote memcg charging from any context. */
1005         if (unlikely(active_memcg()))
1006                 return false;
1007
1008         /* Memcg to charge can't be determined. */
1009         if (!in_task() || !current->mm || (current->flags & PF_KTHREAD))
1010                 return true;
1011
1012         return false;
1013 }
1014
/**
 * mem_cgroup_iter - iterate over memory cgroup hierarchy
 * @root: hierarchy root
 * @prev: previously returned memcg, NULL on first invocation
 * @reclaim: cookie for shared reclaim walks, NULL for full walks
 *
 * Returns references to children of the hierarchy below @root, or
 * @root itself, or %NULL after a full round-trip.
 *
 * Caller must pass the return value in @prev on subsequent
 * invocations for reference counting, or use mem_cgroup_iter_break()
 * to cancel a hierarchy walk before the round-trip is complete.
 *
 * Reclaimers can specify a node in @reclaim to divide up the memcgs
 * in the hierarchy among all concurrent reclaimers operating on the
 * same node.
 *
 * A NULL @root means root_mem_cgroup. %NULL is also returned when the
 * memory controller is disabled. The reference on @prev is dropped
 * before returning, unless @prev is @root.
 */
struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
				   struct mem_cgroup *prev,
				   struct mem_cgroup_reclaim_cookie *reclaim)
{
	struct mem_cgroup_reclaim_iter *iter;
	struct cgroup_subsys_state *css = NULL;
	struct mem_cgroup *memcg = NULL;
	struct mem_cgroup *pos = NULL;

	if (mem_cgroup_disabled())
		return NULL;

	if (!root)
		root = root_mem_cgroup;

	/* Full walk: continue right after the caller's previous position. */
	if (prev && !reclaim)
		pos = prev;

	rcu_read_lock();

	if (reclaim) {
		struct mem_cgroup_per_node *mz;

		/* Shared walk: the cursor is the per-node iterator of @root. */
		mz = root->nodeinfo[reclaim->pgdat->node_id];
		iter = &mz->iter;

		/*
		 * The shared iterator is no longer in the generation that
		 * was recorded when this walk started: end the walk by
		 * returning NULL (memcg is still NULL here).
		 */
		if (prev && reclaim->generation != iter->generation)
			goto out_unlock;

		while (1) {
			pos = READ_ONCE(iter->position);
			if (!pos || css_tryget(&pos->css))
				break;
			/*
			 * css reference reached zero, so iter->position will
			 * be cleared by ->css_released. However, we should not
			 * rely on this happening soon, because ->css_released
			 * is called from a work queue, and by busy-waiting we
			 * might block it. So we clear iter->position right
			 * away.
			 */
			(void)cmpxchg(&iter->position, pos, NULL);
		}
	}

	if (pos)
		css = &pos->css;

	for (;;) {
		css = css_next_descendant_pre(css, &root->css);
		if (!css) {
			/*
			 * Reclaimers share the hierarchy walk, and a
			 * new one might jump in right at the end of
			 * the hierarchy - make sure they see at least
			 * one group and restart from the beginning.
			 */
			if (!prev)
				continue;
			break;
		}

		/*
		 * Verify the css and acquire a reference.  The root
		 * is provided by the caller, so we know it's alive
		 * and kicking, and don't take an extra reference.
		 */
		memcg = mem_cgroup_from_css(css);

		if (css == &root->css)
			break;

		if (css_tryget(css))
			break;

		/* Could not get a reference: skip this css and keep walking. */
		memcg = NULL;
	}

	if (reclaim) {
		/*
		 * The position could have already been updated by a competing
		 * thread, so check that the value hasn't changed since we read
		 * it to avoid reclaiming from the same cgroup twice.
		 */
		(void)cmpxchg(&iter->position, pos, memcg);

		/* Drop the reference taken on the old cursor position above. */
		if (pos)
			css_put(&pos->css);

		/*
		 * End of the hierarchy: start a new generation. On the first
		 * call of a walk, remember the generation it belongs to.
		 */
		if (!memcg)
			iter->generation++;
		else if (!prev)
			reclaim->generation = iter->generation;
	}

out_unlock:
	rcu_read_unlock();
	/* Release the caller's reference on @prev; @root holds none. */
	if (prev && prev != root)
		css_put(&prev->css);

	return memcg;
}
1134
1135 /**
1136  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1137  * @root: hierarchy root
1138  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1139  */
1140 void mem_cgroup_iter_break(struct mem_cgroup *root,
1141                            struct mem_cgroup *prev)
1142 {
1143         if (!root)
1144                 root = root_mem_cgroup;
1145         if (prev && prev != root)
1146                 css_put(&prev->css);
1147 }
1148
1149 static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
1150                                         struct mem_cgroup *dead_memcg)
1151 {
1152         struct mem_cgroup_reclaim_iter *iter;
1153         struct mem_cgroup_per_node *mz;
1154         int nid;
1155
1156         for_each_node(nid) {
1157                 mz = from->nodeinfo[nid];
1158                 iter = &mz->iter;
1159                 cmpxchg(&iter->position, dead_memcg, NULL);
1160         }
1161 }
1162
1163 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1164 {
1165         struct mem_cgroup *memcg = dead_memcg;
1166         struct mem_cgroup *last;
1167
1168         do {
1169                 __invalidate_reclaim_iterators(memcg, dead_memcg);
1170                 last = memcg;
1171         } while ((memcg = parent_mem_cgroup(memcg)));
1172
1173         /*
1174          * When cgruop1 non-hierarchy mode is used,
1175          * parent_mem_cgroup() does not walk all the way up to the
1176          * cgroup root (root_mem_cgroup). So we have to handle
1177          * dead_memcg from cgroup root separately.
1178          */
1179         if (last != root_mem_cgroup)
1180                 __invalidate_reclaim_iterators(root_mem_cgroup,
1181                                                 dead_memcg);
1182 }
1183
1184 /**
1185  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1186  * @memcg: hierarchy root
1187  * @fn: function to call for each task
1188  * @arg: argument passed to @fn
1189  *
1190  * This function iterates over tasks attached to @memcg or to any of its
1191  * descendants and calls @fn for each task. If @fn returns a non-zero
1192  * value, the function breaks the iteration loop and returns the value.
1193  * Otherwise, it will iterate over all tasks and return 0.
1194  *
1195  * This function must not be called for the root memory cgroup.
1196  */
1197 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1198                           int (*fn)(struct task_struct *, void *), void *arg)
1199 {
1200         struct mem_cgroup *iter;
1201         int ret = 0;
1202
1203         BUG_ON(memcg == root_mem_cgroup);
1204
1205         for_each_mem_cgroup_tree(iter, memcg) {
1206                 struct css_task_iter it;
1207                 struct task_struct *task;
1208
1209                 css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
1210                 while (!ret && (task = css_task_iter_next(&it)))
1211                         ret = fn(task, arg);
1212                 css_task_iter_end(&it);
1213                 if (ret) {
1214                         mem_cgroup_iter_break(memcg, iter);
1215                         break;
1216                 }
1217         }
1218         return ret;
1219 }
1220
#ifdef CONFIG_DEBUG_VM
/*
 * Debug check that @lruvec belongs to the memcg of @folio, or to the
 * root memcg when the folio has no memcg. Skipped when the memory
 * controller is disabled.
 */
void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
{
	struct mem_cgroup *memcg;

	if (mem_cgroup_disabled())
		return;

	memcg = folio_memcg(folio);
	if (memcg) {
		VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
		return;
	}

	VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
}
#endif
1237
1238 /**
1239  * folio_lruvec_lock - Lock the lruvec for a folio.
1240  * @folio: Pointer to the folio.
1241  *
1242  * These functions are safe to use under any of the following conditions:
1243  * - folio locked
1244  * - folio_test_lru false
1245  * - folio_memcg_lock()
1246  * - folio frozen (refcount of 0)
1247  *
1248  * Return: The lruvec this folio is on with its lock held.
1249  */
1250 struct lruvec *folio_lruvec_lock(struct folio *folio)
1251 {
1252         struct lruvec *lruvec = folio_lruvec(folio);
1253
1254         spin_lock(&lruvec->lru_lock);
1255         lruvec_memcg_debug(lruvec, folio);
1256
1257         return lruvec;
1258 }
1259
1260 /**
1261  * folio_lruvec_lock_irq - Lock the lruvec for a folio.
1262  * @folio: Pointer to the folio.
1263  *
1264  * These functions are safe to use under any of the following conditions:
1265  * - folio locked
1266  * - folio_test_lru false
1267  * - folio_memcg_lock()
1268  * - folio frozen (refcount of 0)
1269  *
1270  * Return: The lruvec this folio is on with its lock held and interrupts
1271  * disabled.
1272  */
1273 struct lruvec *folio_lruvec_lock_irq(struct folio *folio)
1274 {
1275         struct lruvec *lruvec = folio_lruvec(folio);
1276
1277         spin_lock_irq(&lruvec->lru_lock);
1278         lruvec_memcg_debug(lruvec, folio);
1279
1280         return lruvec;
1281 }
1282
1283 /**
1284  * folio_lruvec_lock_irqsave - Lock the lruvec for a folio.
1285  * @folio: Pointer to the folio.
1286  * @flags: Pointer to irqsave flags.
1287  *
1288  * These functions are safe to use under any of the following conditions:
1289  * - folio locked
1290  * - folio_test_lru false
1291  * - folio_memcg_lock()
1292  * - folio frozen (refcount of 0)
1293  *
1294  * Return: The lruvec this folio is on with its lock held and interrupts
1295  * disabled.
1296  */
1297 struct lruvec *folio_lruvec_lock_irqsave(struct folio *folio,
1298                 unsigned long *flags)
1299 {
1300         struct lruvec *lruvec = folio_lruvec(folio);
1301
1302         spin_lock_irqsave(&lruvec->lru_lock, *flags);
1303         lruvec_memcg_debug(lruvec, folio);
1304
1305         return lruvec;
1306 }
1307
1308 /**
1309  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1310  * @lruvec: mem_cgroup per zone lru vector
1311  * @lru: index of lru list the page is sitting on
1312  * @zid: zone id of the accounted pages
1313  * @nr_pages: positive when adding or negative when removing
1314  *
1315  * This function must be called under lru_lock, just before a page is added
1316  * to or just after a page is removed from an lru list (that ordering being
1317  * so as to allow it to check that lru_size 0 is consistent with list_empty).
1318  */
1319 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1320                                 int zid, int nr_pages)
1321 {
1322         struct mem_cgroup_per_node *mz;
1323         unsigned long *lru_size;
1324         long size;
1325
1326         if (mem_cgroup_disabled())
1327                 return;
1328
1329         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1330         lru_size = &mz->lru_zone_size[zid][lru];
1331
1332         if (nr_pages < 0)
1333                 *lru_size += nr_pages;
1334
1335         size = *lru_size;
1336         if (WARN_ONCE(size < 0,
1337                 "%s(%p, %d, %d): lru_size %ld\n",
1338                 __func__, lruvec, lru, nr_pages, size)) {
1339                 VM_BUG_ON(1);
1340                 *lru_size = 0;
1341         }
1342
1343         if (nr_pages > 0)
1344                 *lru_size += nr_pages;
1345 }
1346
1347 /**
1348  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1349  * @memcg: the memory cgroup
1350  *
1351  * Returns the maximum amount of memory @mem can be charged with, in
1352  * pages.
1353  */
1354 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1355 {
1356         unsigned long margin = 0;
1357         unsigned long count;
1358         unsigned long limit;
1359
1360         count = page_counter_read(&memcg->memory);
1361         limit = READ_ONCE(memcg->memory.max);
1362         if (count < limit)
1363                 margin = limit - count;
1364
1365         if (do_memsw_account()) {
1366                 count = page_counter_read(&memcg->memsw);
1367                 limit = READ_ONCE(memcg->memsw.max);
1368                 if (count < limit)
1369                         margin = min(margin, limit - count);
1370                 else
1371                         margin = 0;
1372         }
1373
1374         return margin;
1375 }
1376
1377 /*
1378  * A routine for checking "mem" is under move_account() or not.
1379  *
1380  * Checking a cgroup is mc.from or mc.to or under hierarchy of
1381  * moving cgroups. This is for waiting at high-memory pressure
1382  * caused by "move".
1383  */
1384 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1385 {
1386         struct mem_cgroup *from;
1387         struct mem_cgroup *to;
1388         bool ret = false;
1389         /*
1390          * Unlike task_move routines, we access mc.to, mc.from not under
1391          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1392          */
1393         spin_lock(&mc.lock);
1394         from = mc.from;
1395         to = mc.to;
1396         if (!from)
1397                 goto unlock;
1398
1399         ret = mem_cgroup_is_descendant(from, memcg) ||
1400                 mem_cgroup_is_descendant(to, memcg);
1401 unlock:
1402         spin_unlock(&mc.lock);
1403         return ret;
1404 }
1405
1406 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1407 {
1408         if (mc.moving_task && current != mc.moving_task) {
1409                 if (mem_cgroup_under_move(memcg)) {
1410                         DEFINE_WAIT(wait);
1411                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1412                         /* moving charge context might have finished. */
1413                         if (mc.moving_task)
1414                                 schedule();
1415                         finish_wait(&mc.waitq, &wait);
1416                         return true;
1417                 }
1418         }
1419         return false;
1420 }
1421
/* Pairs the label of a memory statistic with the stat item it reports. */
struct memory_stat {
	const char *name;	/* label printed in front of the value */
	unsigned int idx;	/* item passed to memcg_page_state_output() */
};
1426
/*
 * Statistics emitted by memory_stat_format(), in output order. Each
 * entry is printed as "<name> <value>", the value being scaled by
 * memcg_page_state_unit() for the entry's item.
 */
static const struct memory_stat memory_stats[] = {
	{ "anon",			NR_ANON_MAPPED			},
	{ "file",			NR_FILE_PAGES			},
	{ "kernel",			MEMCG_KMEM			},
	{ "kernel_stack",		NR_KERNEL_STACK_KB		},
	{ "pagetables",			NR_PAGETABLE			},
	{ "percpu",			MEMCG_PERCPU_B			},
	{ "sock",			MEMCG_SOCK			},
	{ "vmalloc",			MEMCG_VMALLOC			},
	{ "shmem",			NR_SHMEM			},
	{ "file_mapped",		NR_FILE_MAPPED			},
	{ "file_dirty",			NR_FILE_DIRTY			},
	{ "file_writeback",		NR_WRITEBACK			},
#ifdef CONFIG_SWAP
	{ "swapcached",			NR_SWAPCACHE			},
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	{ "anon_thp",			NR_ANON_THPS			},
	{ "file_thp",			NR_FILE_THPS			},
	{ "shmem_thp",			NR_SHMEM_THPS			},
#endif
	{ "inactive_anon",		NR_INACTIVE_ANON		},
	{ "active_anon",		NR_ACTIVE_ANON			},
	{ "inactive_file",		NR_INACTIVE_FILE		},
	{ "active_file",		NR_ACTIVE_FILE			},
	{ "unevictable",		NR_UNEVICTABLE			},
	{ "slab_reclaimable",		NR_SLAB_RECLAIMABLE_B		},
	/* memory_stat_format() prints the combined "slab" line after this one */
	{ "slab_unreclaimable",		NR_SLAB_UNRECLAIMABLE_B		},

	/* The memory events */
	{ "workingset_refault_anon",	WORKINGSET_REFAULT_ANON		},
	{ "workingset_refault_file",	WORKINGSET_REFAULT_FILE		},
	{ "workingset_activate_anon",	WORKINGSET_ACTIVATE_ANON	},
	{ "workingset_activate_file",	WORKINGSET_ACTIVATE_FILE	},
	{ "workingset_restore_anon",	WORKINGSET_RESTORE_ANON		},
	{ "workingset_restore_file",	WORKINGSET_RESTORE_FILE		},
	{ "workingset_nodereclaim",	WORKINGSET_NODERECLAIM		},
};
1465
1466 /* Translate stat items to the correct unit for memory.stat output */
1467 static int memcg_page_state_unit(int item)
1468 {
1469         switch (item) {
1470         case MEMCG_PERCPU_B:
1471         case NR_SLAB_RECLAIMABLE_B:
1472         case NR_SLAB_UNRECLAIMABLE_B:
1473         case WORKINGSET_REFAULT_ANON:
1474         case WORKINGSET_REFAULT_FILE:
1475         case WORKINGSET_ACTIVATE_ANON:
1476         case WORKINGSET_ACTIVATE_FILE:
1477         case WORKINGSET_RESTORE_ANON:
1478         case WORKINGSET_RESTORE_FILE:
1479         case WORKINGSET_NODERECLAIM:
1480                 return 1;
1481         case NR_KERNEL_STACK_KB:
1482                 return SZ_1K;
1483         default:
1484                 return PAGE_SIZE;
1485         }
1486 }
1487
/* Return the state of @item in @memcg, scaled to its output unit. */
static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg,
						    int item)
{
	unsigned long raw = memcg_page_state(memcg, item);

	return raw * memcg_page_state_unit(item);
}
1493
1494 static char *memory_stat_format(struct mem_cgroup *memcg)
1495 {
1496         struct seq_buf s;
1497         int i;
1498
1499         seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
1500         if (!s.buffer)
1501                 return NULL;
1502
1503         /*
1504          * Provide statistics on the state of the memory subsystem as
1505          * well as cumulative event counters that show past behavior.
1506          *
1507          * This list is ordered following a combination of these gradients:
1508          * 1) generic big picture -> specifics and details
1509          * 2) reflecting userspace activity -> reflecting kernel heuristics
1510          *
1511          * Current memory state:
1512          */
1513         mem_cgroup_flush_stats();
1514
1515         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
1516                 u64 size;
1517
1518                 size = memcg_page_state_output(memcg, memory_stats[i].idx);
1519                 seq_buf_printf(&s, "%s %llu\n", memory_stats[i].name, size);
1520
1521                 if (unlikely(memory_stats[i].idx == NR_SLAB_UNRECLAIMABLE_B)) {
1522                         size += memcg_page_state_output(memcg,
1523                                                         NR_SLAB_RECLAIMABLE_B);
1524                         seq_buf_printf(&s, "slab %llu\n", size);
1525                 }
1526         }
1527
1528         /* Accumulated memory events */
1529
1530         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGFAULT),
1531                        memcg_events(memcg, PGFAULT));
1532         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
1533                        memcg_events(memcg, PGMAJFAULT));
1534         seq_buf_printf(&s, "%s %lu\n",  vm_event_name(PGREFILL),
1535                        memcg_events(memcg, PGREFILL));
1536         seq_buf_printf(&s, "pgscan %lu\n",
1537                        memcg_events(memcg, PGSCAN_KSWAPD) +
1538                        memcg_events(memcg, PGSCAN_DIRECT));
1539         seq_buf_printf(&s, "pgsteal %lu\n",
1540                        memcg_events(memcg, PGSTEAL_KSWAPD) +
1541                        memcg_events(memcg, PGSTEAL_DIRECT));
1542         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGACTIVATE),
1543                        memcg_events(memcg, PGACTIVATE));
1544         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGDEACTIVATE),
1545                        memcg_events(memcg, PGDEACTIVATE));
1546         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREE),
1547                        memcg_events(memcg, PGLAZYFREE));
1548         seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGLAZYFREED),
1549                        memcg_events(memcg, PGLAZYFREED));
1550
1551 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1552         seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_FAULT_ALLOC),
1553                        memcg_events(memcg, THP_FAULT_ALLOC));
1554         seq_buf_printf(&s, "%s %lu\n", vm_event_name(THP_COLLAPSE_ALLOC),
1555                        memcg_events(memcg, THP_COLLAPSE_ALLOC));
1556 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1557
1558         /* The above should easily fit into one page */
1559         WARN_ON_ONCE(seq_buf_has_overflowed(&s));
1560
1561         return s.buffer;
1562 }
1563
/* Convert a page count to kilobytes. */
#define K(x) ((x) << (PAGE_SHIFT-10))
1565 /**
1566  * mem_cgroup_print_oom_context: Print OOM information relevant to
1567  * memory controller.
1568  * @memcg: The memory cgroup that went over limit
1569  * @p: Task that is going to be killed
1570  *
1571  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1572  * enabled
1573  */
1574 void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
1575 {
1576         rcu_read_lock();
1577
1578         if (memcg) {
1579                 pr_cont(",oom_memcg=");
1580                 pr_cont_cgroup_path(memcg->css.cgroup);
1581         } else
1582                 pr_cont(",global_oom");
1583         if (p) {
1584                 pr_cont(",task_memcg=");
1585                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1586         }
1587         rcu_read_unlock();
1588 }
1589
1590 /**
1591  * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
1592  * memory controller.
1593  * @memcg: The memory cgroup that went over limit
1594  */
1595 void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
1596 {
1597         char *buf;
1598
1599         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1600                 K((u64)page_counter_read(&memcg->memory)),
1601                 K((u64)READ_ONCE(memcg->memory.max)), memcg->memory.failcnt);
1602         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
1603                 pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
1604                         K((u64)page_counter_read(&memcg->swap)),
1605                         K((u64)READ_ONCE(memcg->swap.max)), memcg->swap.failcnt);
1606         else {
1607                 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1608                         K((u64)page_counter_read(&memcg->memsw)),
1609                         K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1610                 pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1611                         K((u64)page_counter_read(&memcg->kmem)),
1612                         K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1613         }
1614
1615         pr_info("Memory cgroup stats for ");
1616         pr_cont_cgroup_path(memcg->css.cgroup);
1617         pr_cont(":");
1618         buf = memory_stat_format(memcg);
1619         if (!buf)
1620                 return;
1621         pr_info("%s", buf);
1622         kfree(buf);
1623 }
1624
1625 /*
1626  * Return the memory (and swap, if configured) limit for a memcg.
1627  */
1628 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1629 {
1630         unsigned long max = READ_ONCE(memcg->memory.max);
1631
1632         if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
1633                 if (mem_cgroup_swappiness(memcg))
1634                         max += min(READ_ONCE(memcg->swap.max),
1635                                    (unsigned long)total_swap_pages);
1636         } else { /* v1 */
1637                 if (mem_cgroup_swappiness(memcg)) {
1638                         /* Calculate swap excess capacity from memsw limit */
1639                         unsigned long swap = READ_ONCE(memcg->memsw.max) - max;
1640
1641                         max += min(swap, (unsigned long)total_swap_pages);
1642                 }
1643         }
1644         return max;
1645 }
1646
1647 unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
1648 {
1649         return page_counter_read(&memcg->memory);
1650 }
1651
1652 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1653                                      int order)
1654 {
1655         struct oom_control oc = {
1656                 .zonelist = NULL,
1657                 .nodemask = NULL,
1658                 .memcg = memcg,
1659                 .gfp_mask = gfp_mask,
1660                 .order = order,
1661         };
1662         bool ret = true;
1663
1664         if (mutex_lock_killable(&oom_lock))
1665                 return true;
1666
1667         if (mem_cgroup_margin(memcg) >= (1 << order))
1668                 goto unlock;
1669
1670         /*
1671          * A few threads which were not waiting at mutex_lock_killable() can
1672          * fail to bail out. Therefore, check again after holding oom_lock.
1673          */
1674         ret = task_is_dying() || out_of_memory(&oc);
1675
1676 unlock:
1677         mutex_unlock(&oom_lock);
1678         return ret;
1679 }
1680
1681 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1682                                    pg_data_t *pgdat,
1683                                    gfp_t gfp_mask,
1684                                    unsigned long *total_scanned)
1685 {
1686         struct mem_cgroup *victim = NULL;
1687         int total = 0;
1688         int loop = 0;
1689         unsigned long excess;
1690         unsigned long nr_scanned;
1691         struct mem_cgroup_reclaim_cookie reclaim = {
1692                 .pgdat = pgdat,
1693         };
1694
1695         excess = soft_limit_excess(root_memcg);
1696
1697         while (1) {
1698                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1699                 if (!victim) {
1700                         loop++;
1701                         if (loop >= 2) {
1702                                 /*
1703                                  * If we have not been able to reclaim
1704                                  * anything, it might because there are
1705                                  * no reclaimable pages under this hierarchy
1706                                  */
1707                                 if (!total)
1708                                         break;
1709                                 /*
1710                                  * We want to do more targeted reclaim.
1711                                  * excess >> 2 is not to excessive so as to
1712                                  * reclaim too much, nor too less that we keep
1713                                  * coming back to reclaim from this cgroup
1714                                  */
1715                                 if (total >= (excess >> 2) ||
1716                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1717                                         break;
1718                         }
1719                         continue;
1720                 }
1721                 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1722                                         pgdat, &nr_scanned);
1723                 *total_scanned += nr_scanned;
1724                 if (!soft_limit_excess(root_memcg))
1725                         break;
1726         }
1727         mem_cgroup_iter_break(root_memcg, victim);
1728         return total;
1729 }
1730
#ifdef CONFIG_LOCKDEP
/*
 * Lockdep map handed to mutex_acquire()/mutex_release() by
 * mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock().
 */
static struct lockdep_map memcg_oom_lock_dep_map = {
        .name = "memcg_oom_lock",
};
#endif

/*
 * Held by the helpers below while they walk a hierarchy and update the
 * ->oom_lock and ->under_oom fields of each memcg.
 */
static DEFINE_SPINLOCK(memcg_oom_lock);
1738
1739 /*
1740  * Check OOM-Killer is already running under our hierarchy.
1741  * If someone is running, return false.
1742  */
1743 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1744 {
1745         struct mem_cgroup *iter, *failed = NULL;
1746
1747         spin_lock(&memcg_oom_lock);
1748
1749         for_each_mem_cgroup_tree(iter, memcg) {
1750                 if (iter->oom_lock) {
1751                         /*
1752                          * this subtree of our hierarchy is already locked
1753                          * so we cannot give a lock.
1754                          */
1755                         failed = iter;
1756                         mem_cgroup_iter_break(memcg, iter);
1757                         break;
1758                 } else
1759                         iter->oom_lock = true;
1760         }
1761
1762         if (failed) {
1763                 /*
1764                  * OK, we failed to lock the whole subtree so we have
1765                  * to clean up what we set up to the failing subtree
1766                  */
1767                 for_each_mem_cgroup_tree(iter, memcg) {
1768                         if (iter == failed) {
1769                                 mem_cgroup_iter_break(memcg, iter);
1770                                 break;
1771                         }
1772                         iter->oom_lock = false;
1773                 }
1774         } else
1775                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1776
1777         spin_unlock(&memcg_oom_lock);
1778
1779         return !failed;
1780 }
1781
1782 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1783 {
1784         struct mem_cgroup *iter;
1785
1786         spin_lock(&memcg_oom_lock);
1787         mutex_release(&memcg_oom_lock_dep_map, _RET_IP_);
1788         for_each_mem_cgroup_tree(iter, memcg)
1789                 iter->oom_lock = false;
1790         spin_unlock(&memcg_oom_lock);
1791 }
1792
1793 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1794 {
1795         struct mem_cgroup *iter;
1796
1797         spin_lock(&memcg_oom_lock);
1798         for_each_mem_cgroup_tree(iter, memcg)
1799                 iter->under_oom++;
1800         spin_unlock(&memcg_oom_lock);
1801 }
1802
1803 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1804 {
1805         struct mem_cgroup *iter;
1806
1807         /*
1808          * Be careful about under_oom underflows because a child memcg
1809          * could have been added after mem_cgroup_mark_under_oom.
1810          */
1811         spin_lock(&memcg_oom_lock);
1812         for_each_mem_cgroup_tree(iter, memcg)
1813                 if (iter->under_oom > 0)
1814                         iter->under_oom--;
1815         spin_unlock(&memcg_oom_lock);
1816 }
1817
/* Single waitqueue shared by all tasks sleeping on a memcg OOM. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Wait entry queued on memcg_oom_waitq; @memcg records which memcg the
 * sleeper is waiting on so memcg_oom_wake_function() can filter wakeups.
 */
struct oom_wait_info {
        struct mem_cgroup *memcg;
        wait_queue_entry_t      wait;
};
1824
1825 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1826         unsigned mode, int sync, void *arg)
1827 {
1828         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1829         struct mem_cgroup *oom_wait_memcg;
1830         struct oom_wait_info *oom_wait_info;
1831
1832         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1833         oom_wait_memcg = oom_wait_info->memcg;
1834
1835         if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1836             !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1837                 return 0;
1838         return autoremove_wake_function(wait, mode, sync, arg);
1839 }
1840
1841 static void memcg_oom_recover(struct mem_cgroup *memcg)
1842 {
1843         /*
1844          * For the following lockless ->under_oom test, the only required
1845          * guarantee is that it must see the state asserted by an OOM when
1846          * this function is called as a result of userland actions
1847          * triggered by the notification of the OOM.  This is trivially
1848          * achieved by invoking mem_cgroup_mark_under_oom() before
1849          * triggering notification.
1850          */
1851         if (memcg && memcg->under_oom)
1852                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1853 }
1854
1855 /*
1856  * Returns true if successfully killed one or more processes. Though in some
1857  * corner cases it can return true even without killing any process.
1858  */
1859 static bool mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1860 {
1861         bool locked, ret;
1862
1863         if (order > PAGE_ALLOC_COSTLY_ORDER)
1864                 return false;
1865
1866         memcg_memory_event(memcg, MEMCG_OOM);
1867
1868         /*
1869          * We are in the middle of the charge context here, so we
1870          * don't want to block when potentially sitting on a callstack
1871          * that holds all kinds of filesystem and mm locks.
1872          *
1873          * cgroup1 allows disabling the OOM killer and waiting for outside
1874          * handling until the charge can succeed; remember the context and put
1875          * the task to sleep at the end of the page fault when all locks are
1876          * released.
1877          *
1878          * On the other hand, in-kernel OOM killer allows for an async victim
1879          * memory reclaim (oom_reaper) and that means that we are not solely
1880          * relying on the oom victim to make a forward progress and we can
1881          * invoke the oom killer here.
1882          *
1883          * Please note that mem_cgroup_out_of_memory might fail to find a
1884          * victim and then we have to bail out from the charge path.
1885          */
1886         if (memcg->oom_kill_disable) {
1887                 if (current->in_user_fault) {
1888                         css_get(&memcg->css);
1889                         current->memcg_in_oom = memcg;
1890                         current->memcg_oom_gfp_mask = mask;
1891                         current->memcg_oom_order = order;
1892                 }
1893                 return false;
1894         }
1895
1896         mem_cgroup_mark_under_oom(memcg);
1897
1898         locked = mem_cgroup_oom_trylock(memcg);
1899
1900         if (locked)
1901                 mem_cgroup_oom_notify(memcg);
1902
1903         mem_cgroup_unmark_under_oom(memcg);
1904         ret = mem_cgroup_out_of_memory(memcg, mask, order);
1905
1906         if (locked)
1907                 mem_cgroup_oom_unlock(memcg);
1908
1909         return ret;
1910 }
1911
/**
 * mem_cgroup_oom_synchronize - complete memcg OOM handling
 * @handle: actually kill/wait or just clean up the OOM state
 *
 * This has to be called at the end of a page fault if the memcg OOM
 * handler was enabled.
 *
 * Memcg supports userspace OOM handling where failed allocations must
 * sleep on a waitqueue until the userspace task resolves the
 * situation.  Sleeping directly in the charge context with all kinds
 * of locks held is not a good idea; instead we remember an OOM state
 * in the task and mem_cgroup_oom_synchronize() has to be called at
 * the end of the page fault to complete the OOM handling.
 *
 * Returns %true if an ongoing memcg OOM situation was detected and
 * completed, %false otherwise.
 */
bool mem_cgroup_oom_synchronize(bool handle)
{
        struct mem_cgroup *memcg = current->memcg_in_oom;
        struct oom_wait_info owait;
        bool locked;

        /* No memcg OOM context recorded in this task: nothing to handle */
        if (!memcg)
                return false;

        /* Caller only wants the recorded OOM state dropped */
        if (!handle)
                goto cleanup;

        /* Build an on-stack wait entry filtered by memcg_oom_wake_function() */
        owait.memcg = memcg;
        owait.wait.flags = 0;
        owait.wait.func = memcg_oom_wake_function;
        owait.wait.private = current;
        INIT_LIST_HEAD(&owait.wait.entry);

        /* Get on the waitqueue before marking the hierarchy and notifying */
        prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
        mem_cgroup_mark_under_oom(memcg);

        locked = mem_cgroup_oom_trylock(memcg);

        if (locked)
                mem_cgroup_oom_notify(memcg);

        if (locked && !memcg->oom_kill_disable) {
                /* We own the OOM lock and killing is allowed: kill, don't sleep */
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
                mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
                                         current->memcg_oom_order);
        } else {
                /* Someone else handles the OOM, or killing is disabled: sleep */
                schedule();
                mem_cgroup_unmark_under_oom(memcg);
                finish_wait(&memcg_oom_waitq, &owait.wait);
        }

        if (locked) {
                mem_cgroup_oom_unlock(memcg);
                /*
                 * There is no guarantee that an OOM-lock contender
                 * sees the wakeups triggered by the OOM kill
                 * uncharges.  Wake any sleepers explicitly.
                 */
                memcg_oom_recover(memcg);
        }
cleanup:
        /*
         * Forget the OOM context and drop the css reference that
         * mem_cgroup_oom() took when it recorded it in the task.
         */
        current->memcg_in_oom = NULL;
        css_put(&memcg->css);
        return true;
}
1981
1982 /**
1983  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1984  * @victim: task to be killed by the OOM killer
1985  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1986  *
1987  * Returns a pointer to a memory cgroup, which has to be cleaned up
1988  * by killing all belonging OOM-killable tasks.
1989  *
1990  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1991  */
1992 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1993                                             struct mem_cgroup *oom_domain)
1994 {
1995         struct mem_cgroup *oom_group = NULL;
1996         struct mem_cgroup *memcg;
1997
1998         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1999                 return NULL;
2000
2001         if (!oom_domain)
2002                 oom_domain = root_mem_cgroup;
2003
2004         rcu_read_lock();
2005
2006         memcg = mem_cgroup_from_task(victim);
2007         if (memcg == root_mem_cgroup)
2008                 goto out;
2009
2010         /*
2011          * If the victim task has been asynchronously moved to a different
2012          * memory cgroup, we might end up killing tasks outside oom_domain.
2013          * In this case it's better to ignore memory.group.oom.
2014          */
2015         if (unlikely(!mem_cgroup_is_descendant(memcg, oom_domain)))
2016                 goto out;
2017
2018         /*
2019          * Traverse the memory cgroup hierarchy from the victim task's
2020          * cgroup up to the OOMing cgroup (or root) to find the
2021          * highest-level memory cgroup with oom.group set.
2022          */
2023         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
2024                 if (memcg->oom_group)
2025                         oom_group = memcg;
2026
2027                 if (memcg == oom_domain)
2028                         break;
2029         }
2030
2031         if (oom_group)
2032                 css_get(&oom_group->css);
2033 out:
2034         rcu_read_unlock();
2035
2036         return oom_group;
2037 }
2038
2039 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
2040 {
2041         pr_info("Tasks in ");
2042         pr_cont_cgroup_path(memcg->css.cgroup);
2043         pr_cont(" are going to be killed due to memory.oom.group set\n");
2044 }
2045
/**
 * folio_memcg_lock - Bind a folio to its memcg.
 * @folio: The folio.
 *
 * This function prevents unlocked LRU folios from being moved to
 * another cgroup.
 *
 * It ensures lifetime of the bound memcg.  The caller is responsible
 * for the lifetime of the folio.
 *
 * Every return path leaves the RCU read lock held; it is dropped by
 * __folio_memcg_unlock().
 */
void folio_memcg_lock(struct folio *folio)
{
        struct mem_cgroup *memcg;
        unsigned long flags;

        /*
         * The RCU lock is held throughout the transaction.  The fast
         * path can get away without acquiring the memcg->move_lock
         * because page moving starts with an RCU grace period.
         */
        rcu_read_lock();

        if (mem_cgroup_disabled())
                return;
again:
        memcg = folio_memcg(folio);
        if (unlikely(!memcg))
                return;

#ifdef CONFIG_PROVE_LOCKING
        /*
         * Annotate for lockdep that move_lock may be taken from here with
         * interrupts disabled, even on runs that take the fast path below.
         */
        local_irq_save(flags);
        might_lock(&memcg->move_lock);
        local_irq_restore(flags);
#endif

        /* Fast path: no accounting move in progress, RCU alone suffices */
        if (atomic_read(&memcg->moving_account) <= 0)
                return;

        spin_lock_irqsave(&memcg->move_lock, flags);
        /* The folio changed memcg while we were acquiring the lock: retry */
        if (memcg != folio_memcg(folio)) {
                spin_unlock_irqrestore(&memcg->move_lock, flags);
                goto again;
        }

        /*
         * When charge migration first begins, we can have multiple
         * critical sections holding the fast-path RCU lock and one
         * holding the slowpath move_lock. Track the task that holds the
         * move_lock so that __folio_memcg_unlock() knows to release it.
         */
        memcg->move_lock_task = current;
        memcg->move_lock_flags = flags;
}
2099
/* Page-based wrapper: run folio_memcg_lock() on the folio containing @page. */
void lock_page_memcg(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_memcg_lock(folio);
}
2104
2105 static void __folio_memcg_unlock(struct mem_cgroup *memcg)
2106 {
2107         if (memcg && memcg->move_lock_task == current) {
2108                 unsigned long flags = memcg->move_lock_flags;
2109
2110                 memcg->move_lock_task = NULL;
2111                 memcg->move_lock_flags = 0;
2112
2113                 spin_unlock_irqrestore(&memcg->move_lock, flags);
2114         }
2115
2116         rcu_read_unlock();
2117 }
2118
/**
 * folio_memcg_unlock - Release the binding between a folio and its memcg.
 * @folio: The folio.
 *
 * Counterpart of folio_memcg_lock().  The accounting of this folio to its
 * memcg is left untouched, but others are allowed to change it again.
 */
void folio_memcg_unlock(struct folio *folio)
{
        struct mem_cgroup *memcg = folio_memcg(folio);

        __folio_memcg_unlock(memcg);
}
2131
/* Page-based wrapper: run folio_memcg_unlock() on the folio containing @page. */
void unlock_page_memcg(struct page *page)
{
        struct folio *folio = page_folio(page);

        folio_memcg_unlock(folio);
}
2136
/*
 * Per-CPU charge cache ("stock"): a number of pages already charged to
 * one memcg that later charges on this CPU can consume without touching
 * the page counters (see consume_stock(), __refill_stock(), drain_stock()).
 */
struct memcg_stock_pcp {
        local_lock_t stock_lock;        /* taken with IRQs saved around local stock updates */
        struct mem_cgroup *cached; /* memcg owning the stock; never the root cgroup */
        unsigned int nr_pages;          /* pages stocked on behalf of @cached */

#ifdef CONFIG_MEMCG_KMEM
        /* Object-cgroup side of the stock; handled by drain_obj_stock() */
        struct obj_cgroup *cached_objcg;
        struct pglist_data *cached_pgdat;
        unsigned int nr_bytes;
        int nr_slab_reclaimable_b;
        int nr_slab_unreclaimable_b;
#endif

        struct work_struct work;        /* queued by drain_all_stock() on remote CPUs */
        unsigned long flags;            /* bit field, see FLUSHING_CACHED_CHARGE */
/* Bit in @flags: a drain of this stock has been requested and not yet run */
#define FLUSHING_CACHED_CHARGE  0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
        .stock_lock = INIT_LOCAL_LOCK(stock_lock),
};
/* Ensures only one drain_all_stock() runs at a time (taken with trylock) */
static DEFINE_MUTEX(percpu_charge_mutex);
2158
/*
 * Object-cgroup helpers used by the stock code below.  With
 * CONFIG_MEMCG_KMEM they are only declared here and defined elsewhere in
 * this file; without it they collapse into no-op stubs.
 */
#ifdef CONFIG_MEMCG_KMEM
static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);

#else
/* Stub: there is no object stock, so no obj_cgroup to hand back */
static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
        return NULL;
}
/* Stub: an object stock never needs flushing */
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
                                     struct mem_cgroup *root_memcg)
{
        return false;
}
/* Stub: nothing to account */
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
{
}
#endif
2179
2180 /**
2181  * consume_stock: Try to consume stocked charge on this cpu.
2182  * @memcg: memcg to consume from.
2183  * @nr_pages: how many pages to charge.
2184  *
2185  * The charges will only happen if @memcg matches the current cpu's memcg
2186  * stock, and at least @nr_pages are available in that stock.  Failure to
2187  * service an allocation will refill the stock.
2188  *
2189  * returns true if successful, false otherwise.
2190  */
2191 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2192 {
2193         struct memcg_stock_pcp *stock;
2194         unsigned long flags;
2195         bool ret = false;
2196
2197         if (nr_pages > MEMCG_CHARGE_BATCH)
2198                 return ret;
2199
2200         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2201
2202         stock = this_cpu_ptr(&memcg_stock);
2203         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
2204                 stock->nr_pages -= nr_pages;
2205                 ret = true;
2206         }
2207
2208         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2209
2210         return ret;
2211 }
2212
2213 /*
2214  * Returns stocks cached in percpu and reset cached information.
2215  */
2216 static void drain_stock(struct memcg_stock_pcp *stock)
2217 {
2218         struct mem_cgroup *old = stock->cached;
2219
2220         if (!old)
2221                 return;
2222
2223         if (stock->nr_pages) {
2224                 page_counter_uncharge(&old->memory, stock->nr_pages);
2225                 if (do_memsw_account())
2226                         page_counter_uncharge(&old->memsw, stock->nr_pages);
2227                 stock->nr_pages = 0;
2228         }
2229
2230         css_put(&old->css);
2231         stock->cached = NULL;
2232 }
2233
2234 static void drain_local_stock(struct work_struct *dummy)
2235 {
2236         struct memcg_stock_pcp *stock;
2237         struct obj_cgroup *old = NULL;
2238         unsigned long flags;
2239
2240         /*
2241          * The only protection from cpu hotplug (memcg_hotplug_cpu_dead) vs.
2242          * drain_stock races is that we always operate on local CPU stock
2243          * here with IRQ disabled
2244          */
2245         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2246
2247         stock = this_cpu_ptr(&memcg_stock);
2248         old = drain_obj_stock(stock);
2249         drain_stock(stock);
2250         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2251
2252         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2253         if (old)
2254                 obj_cgroup_put(old);
2255 }
2256
2257 /*
2258  * Cache charges(val) to local per_cpu area.
2259  * This will be consumed by consume_stock() function, later.
2260  */
2261 static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2262 {
2263         struct memcg_stock_pcp *stock;
2264
2265         stock = this_cpu_ptr(&memcg_stock);
2266         if (stock->cached != memcg) { /* reset if necessary */
2267                 drain_stock(stock);
2268                 css_get(&memcg->css);
2269                 stock->cached = memcg;
2270         }
2271         stock->nr_pages += nr_pages;
2272
2273         if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2274                 drain_stock(stock);
2275 }
2276
2277 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2278 {
2279         unsigned long flags;
2280
2281         local_lock_irqsave(&memcg_stock.stock_lock, flags);
2282         __refill_stock(memcg, nr_pages);
2283         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
2284 }
2285
/*
 * Drain the per-CPU charge caches of all online CPUs whose cached memcg
 * belongs to the hierarchy under @root_memcg (or whose object stock
 * requires a flush for it).
 */
static void drain_all_stock(struct mem_cgroup *root_memcg)
{
        int cpu, curcpu;

        /* If someone's already draining, avoid queueing more workers. */
        if (!mutex_trylock(&percpu_charge_mutex))
                return;
        /*
         * Notify other cpus that system-wide "drain" is running
         * We do not care about races with the cpu hotplug because cpu down
         * as well as workers from this path always operate on the local
         * per-cpu data. CPU up doesn't touch memcg_stock at all.
         */
        migrate_disable();
        curcpu = smp_processor_id();
        for_each_online_cpu(cpu) {
                struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
                struct mem_cgroup *memcg;
                bool flush = false;

                /* Peek at the (possibly remote) stock under RCU only */
                rcu_read_lock();
                memcg = stock->cached;
                if (memcg && stock->nr_pages &&
                    mem_cgroup_is_descendant(memcg, root_memcg))
                        flush = true;
                else if (obj_stock_flush_required(stock, root_memcg))
                        flush = true;
                rcu_read_unlock();

                /*
                 * The FLUSHING_CACHED_CHARGE bit keeps a drain from being
                 * requested twice; drain_local_stock() clears it again.
                 */
                if (flush &&
                    !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
                        /* Drain our own CPU directly, others via their work item */
                        if (cpu == curcpu)
                                drain_local_stock(&stock->work);
                        else
                                schedule_work_on(cpu, &stock->work);
                }
        }
        migrate_enable();
        mutex_unlock(&percpu_charge_mutex);
}
2330
2331 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2332 {
2333         struct memcg_stock_pcp *stock;
2334
2335         stock = &per_cpu(memcg_stock, cpu);
2336         drain_stock(stock);
2337
2338         return 0;
2339 }
2340
2341 static unsigned long reclaim_high(struct mem_cgroup *memcg,
2342                                   unsigned int nr_pages,
2343                                   gfp_t gfp_mask)
2344 {
2345         unsigned long nr_reclaimed = 0;
2346
2347         do {
2348                 unsigned long pflags;
2349
2350                 if (page_counter_read(&memcg->memory) <=
2351                     READ_ONCE(memcg->memory.high))
2352                         continue;
2353
2354                 memcg_memory_event(memcg, MEMCG_HIGH);
2355
2356                 psi_memstall_enter(&pflags);
2357                 nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
2358                                                              gfp_mask, true);
2359                 psi_memstall_leave(&pflags);
2360         } while ((memcg = parent_mem_cgroup(memcg)) &&
2361                  !mem_cgroup_is_root(memcg));
2362
2363         return nr_reclaimed;
2364 }
2365
2366 static void high_work_func(struct work_struct *work)
2367 {
2368         struct mem_cgroup *memcg;
2369
2370         memcg = container_of(work, struct mem_cgroup, high_work);
2371         reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2372 }
2373
/*
 * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
 * enough to still cause a significant slowdown in most cases, while still
 * allowing diagnostics and tracing to proceed without becoming stuck.
 *
 * Applied by mem_cgroup_handle_over_high() to the summed memory and swap
 * penalties.
 */
#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
2380
/*
 * When calculating the delay, we use these on either side of the
 * exponentiation to maintain precision and scale to a reasonable number of
 * jiffies (see the table below).
 *
 * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
 *   overage ratio to a delay.
 * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
 *   proposed penalty in order to reduce to a reasonable number of jiffies, and
 *   to produce a reasonable delay curve.
 *
 * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
 * reasonable delay curve compared to precision-adjusted overage, not
 * penalising heavily at first, but still making sure that growth beyond the
 * limit penalises misbehaving cgroups by slowing them down exponentially. For
 * example, with a high of 100 megabytes:
 *
 *  +-------+------------------------+
 *  | usage | time to allocate in ms |
 *  +-------+------------------------+
 *  | 100M  |                      0 |
 *  | 101M  |                      6 |
 *  | 102M  |                     25 |
 *  | 103M  |                     57 |
 *  | 104M  |                    102 |
 *  | 105M  |                    159 |
 *  | 106M  |                    230 |
 *  | 107M  |                    313 |
 *  | 108M  |                    409 |
 *  | 109M  |                    518 |
 *  | 110M  |                    639 |
 *  | 111M  |                    774 |
 *  | 112M  |                    921 |
 *  | 113M  |                   1081 |
 *  | 114M  |                   1254 |
 *  | 115M  |                   1439 |
 *  | 116M  |                   1638 |
 *  | 117M  |                   1849 |
 *  | 118M  |                   2000 |
 *  | 119M  |                   2000 |
 *  | 120M  |                   2000 |
 *  +-------+------------------------+
 */
 #define MEMCG_DELAY_PRECISION_SHIFT 20
 #define MEMCG_DELAY_SCALING_SHIFT 14
2426
2427 static u64 calculate_overage(unsigned long usage, unsigned long high)
2428 {
2429         u64 overage;
2430
2431         if (usage <= high)
2432                 return 0;
2433
2434         /*
2435          * Prevent division by 0 in overage calculation by acting as if
2436          * it was a threshold of 1 page
2437          */
2438         high = max(high, 1UL);
2439
2440         overage = usage - high;
2441         overage <<= MEMCG_DELAY_PRECISION_SHIFT;
2442         return div64_u64(overage, high);
2443 }
2444
2445 static u64 mem_find_max_overage(struct mem_cgroup *memcg)
2446 {
2447         u64 overage, max_overage = 0;
2448
2449         do {
2450                 overage = calculate_overage(page_counter_read(&memcg->memory),
2451                                             READ_ONCE(memcg->memory.high));
2452                 max_overage = max(overage, max_overage);
2453         } while ((memcg = parent_mem_cgroup(memcg)) &&
2454                  !mem_cgroup_is_root(memcg));
2455
2456         return max_overage;
2457 }
2458
2459 static u64 swap_find_max_overage(struct mem_cgroup *memcg)
2460 {
2461         u64 overage, max_overage = 0;
2462
2463         do {
2464                 overage = calculate_overage(page_counter_read(&memcg->swap),
2465                                             READ_ONCE(memcg->swap.high));
2466                 if (overage)
2467                         memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
2468                 max_overage = max(overage, max_overage);
2469         } while ((memcg = parent_mem_cgroup(memcg)) &&
2470                  !mem_cgroup_is_root(memcg));
2471
2472         return max_overage;
2473 }
2474
2475 /*
2476  * Get the number of jiffies that we should penalise a mischievous cgroup which
2477  * is exceeding its memory.high by checking both it and its ancestors.
2478  */
2479 static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
2480                                           unsigned int nr_pages,
2481                                           u64 max_overage)
2482 {
2483         unsigned long penalty_jiffies;
2484
2485         if (!max_overage)
2486                 return 0;
2487
2488         /*
2489          * We use overage compared to memory.high to calculate the number of
2490          * jiffies to sleep (penalty_jiffies). Ideally this value should be
2491          * fairly lenient on small overages, and increasingly harsh when the
2492          * memcg in question makes it clear that it has no intention of stopping
2493          * its crazy behaviour, so we exponentially increase the delay based on
2494          * overage amount.
2495          */
2496         penalty_jiffies = max_overage * max_overage * HZ;
2497         penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
2498         penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
2499
2500         /*
2501          * Factor in the task's own contribution to the overage, such that four
2502          * N-sized allocations are throttled approximately the same as one
2503          * 4N-sized allocation.
2504          *
2505          * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
2506          * larger the current charge patch is than that.
2507          */
2508         return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
2509 }
2510
2511 /*
2512  * Scheduled by try_charge() to be executed from the userland return path
2513  * and reclaims memory over the high limit.
2514  */
2515 void mem_cgroup_handle_over_high(void)
2516 {
2517         unsigned long penalty_jiffies;
2518         unsigned long pflags;
2519         unsigned long nr_reclaimed;
2520         unsigned int nr_pages = current->memcg_nr_pages_over_high;
2521         int nr_retries = MAX_RECLAIM_RETRIES;
2522         struct mem_cgroup *memcg;
2523         bool in_retry = false;
2524
2525         if (likely(!nr_pages))
2526                 return;
2527
2528         memcg = get_mem_cgroup_from_mm(current->mm);
2529         current->memcg_nr_pages_over_high = 0;
2530
2531 retry_reclaim:
2532         /*
2533          * The allocating task should reclaim at least the batch size, but for
2534          * subsequent retries we only want to do what's necessary to prevent oom
2535          * or breaching resource isolation.
2536          *
2537          * This is distinct from memory.max or page allocator behaviour because
2538          * memory.high is currently batched, whereas memory.max and the page
2539          * allocator run every time an allocation is made.
2540          */
2541         nr_reclaimed = reclaim_high(memcg,
2542                                     in_retry ? SWAP_CLUSTER_MAX : nr_pages,
2543                                     GFP_KERNEL);
2544
2545         /*
2546          * memory.high is breached and reclaim is unable to keep up. Throttle
2547          * allocators proactively to slow down excessive growth.
2548          */
2549         penalty_jiffies = calculate_high_delay(memcg, nr_pages,
2550                                                mem_find_max_overage(memcg));
2551
2552         penalty_jiffies += calculate_high_delay(memcg, nr_pages,
2553                                                 swap_find_max_overage(memcg));
2554
2555         /*
2556          * Clamp the max delay per usermode return so as to still keep the
2557          * application moving forwards and also permit diagnostics, albeit
2558          * extremely slowly.
2559          */
2560         penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
2561
2562         /*
2563          * Don't sleep if the amount of jiffies this memcg owes us is so low
2564          * that it's not even worth doing, in an attempt to be nice to those who
2565          * go only a small amount over their memory.high value and maybe haven't
2566          * been aggressively reclaimed enough yet.
2567          */
2568         if (penalty_jiffies <= HZ / 100)
2569                 goto out;
2570
2571         /*
2572          * If reclaim is making forward progress but we're still over
2573          * memory.high, we want to encourage that rather than doing allocator
2574          * throttling.
2575          */
2576         if (nr_reclaimed || nr_retries--) {
2577                 in_retry = true;
2578                 goto retry_reclaim;
2579         }
2580
2581         /*
2582          * If we exit early, we're guaranteed to die (since
2583          * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
2584          * need to account for any ill-begotten jiffies to pay them off later.
2585          */
2586         psi_memstall_enter(&pflags);
2587         schedule_timeout_killable(penalty_jiffies);
2588         psi_memstall_leave(&pflags);
2589
2590 out:
2591         css_put(&memcg->css);
2592 }
2593
2594 static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
2595                         unsigned int nr_pages)
2596 {
2597         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2598         int nr_retries = MAX_RECLAIM_RETRIES;
2599         struct mem_cgroup *mem_over_limit;
2600         struct page_counter *counter;
2601         unsigned long nr_reclaimed;
2602         bool passed_oom = false;
2603         bool may_swap = true;
2604         bool drained = false;
2605         unsigned long pflags;
2606
2607 retry:
2608         if (consume_stock(memcg, nr_pages))
2609                 return 0;
2610
2611         if (!do_memsw_account() ||
2612             page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2613                 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2614                         goto done_restock;
2615                 if (do_memsw_account())
2616                         page_counter_uncharge(&memcg->memsw, batch);
2617                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2618         } else {
2619                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2620                 may_swap = false;
2621         }
2622
2623         if (batch > nr_pages) {
2624                 batch = nr_pages;
2625                 goto retry;
2626         }
2627
2628         /*
2629          * Prevent unbounded recursion when reclaim operations need to
2630          * allocate memory. This might exceed the limits temporarily,
2631          * but we prefer facilitating memory reclaim and getting back
2632          * under the limit over triggering OOM kills in these cases.
2633          */
2634         if (unlikely(current->flags & PF_MEMALLOC))
2635                 goto force;
2636
2637         if (unlikely(task_in_memcg_oom(current)))
2638                 goto nomem;
2639
2640         if (!gfpflags_allow_blocking(gfp_mask))
2641                 goto nomem;
2642
2643         memcg_memory_event(mem_over_limit, MEMCG_MAX);
2644
2645         psi_memstall_enter(&pflags);
2646         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2647                                                     gfp_mask, may_swap);
2648         psi_memstall_leave(&pflags);
2649
2650         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2651                 goto retry;
2652
2653         if (!drained) {
2654                 drain_all_stock(mem_over_limit);
2655                 drained = true;
2656                 goto retry;
2657         }
2658
2659         if (gfp_mask & __GFP_NORETRY)
2660                 goto nomem;
2661         /*
2662          * Even though the limit is exceeded at this point, reclaim
2663          * may have been able to free some pages.  Retry the charge
2664          * before killing the task.
2665          *
2666          * Only for regular pages, though: huge pages are rather
2667          * unlikely to succeed so close to the limit, and we fall back
2668          * to regular pages anyway in case of failure.
2669          */
2670         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2671                 goto retry;
2672         /*
2673          * At task move, charge accounts can be doubly counted. So, it's
2674          * better to wait until the end of task_move if something is going on.
2675          */
2676         if (mem_cgroup_wait_acct_move(mem_over_limit))
2677                 goto retry;
2678
2679         if (nr_retries--)
2680                 goto retry;
2681
2682         if (gfp_mask & __GFP_RETRY_MAYFAIL)
2683                 goto nomem;
2684
2685         /* Avoid endless loop for tasks bypassed by the oom killer */
2686         if (passed_oom && task_is_dying())
2687                 goto nomem;
2688
2689         /*
2690          * keep retrying as long as the memcg oom killer is able to make
2691          * a forward progress or bypass the charge if the oom killer
2692          * couldn't make any progress.
2693          */
2694         if (mem_cgroup_oom(mem_over_limit, gfp_mask,
2695                            get_order(nr_pages * PAGE_SIZE))) {
2696                 passed_oom = true;
2697                 nr_retries = MAX_RECLAIM_RETRIES;
2698                 goto retry;
2699         }
2700 nomem:
2701         /*
2702          * Memcg doesn't have a dedicated reserve for atomic
2703          * allocations. But like the global atomic pool, we need to
2704          * put the burden of reclaim on regular allocation requests
2705          * and let these go through as privileged allocations.
2706          */
2707         if (!(gfp_mask & (__GFP_NOFAIL | __GFP_HIGH)))
2708                 return -ENOMEM;
2709 force:
2710         /*
2711          * The allocation either can't fail or will lead to more memory
2712          * being freed very soon.  Allow memory usage go over the limit
2713          * temporarily by force charging it.
2714          */
2715         page_counter_charge(&memcg->memory, nr_pages);
2716         if (do_memsw_account())
2717                 page_counter_charge(&memcg->memsw, nr_pages);
2718
2719         return 0;
2720
2721 done_restock:
2722         if (batch > nr_pages)
2723                 refill_stock(memcg, batch - nr_pages);
2724
2725         /*
2726          * If the hierarchy is above the normal consumption range, schedule
2727          * reclaim on returning to userland.  We can perform reclaim here
2728          * if __GFP_RECLAIM but let's always punt for simplicity and so that
2729          * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2730          * not recorded as it most likely matches current's and won't
2731          * change in the meantime.  As high limit is checked again before
2732          * reclaim, the cost of mismatch is negligible.
2733          */
2734         do {
2735                 bool mem_high, swap_high;
2736
2737                 mem_high = page_counter_read(&memcg->memory) >
2738                         READ_ONCE(memcg->memory.high);
2739                 swap_high = page_counter_read(&memcg->swap) >
2740                         READ_ONCE(memcg->swap.high);
2741
2742                 /* Don't bother a random interrupted task */
2743                 if (!in_task()) {
2744                         if (mem_high) {
2745                                 schedule_work(&memcg->high_work);
2746                                 break;
2747                         }
2748                         continue;
2749                 }
2750
2751                 if (mem_high || swap_high) {
2752                         /*
2753                          * The allocating tasks in this cgroup will need to do
2754                          * reclaim or be throttled to prevent further growth
2755                          * of the memory or swap footprints.
2756                          *
2757                          * Target some best-effort fairness between the tasks,
2758                          * and distribute reclaim work and delay penalties
2759                          * based on how much each task is actually allocating.
2760                          */
2761                         current->memcg_nr_pages_over_high += batch;
2762                         set_notify_resume(current);
2763                         break;
2764                 }
2765         } while ((memcg = parent_mem_cgroup(memcg)));
2766
2767         if (current->memcg_nr_pages_over_high > MEMCG_CHARGE_BATCH &&
2768             !(current->flags & PF_MEMALLOC) &&
2769             gfpflags_allow_blocking(gfp_mask)) {
2770                 mem_cgroup_handle_over_high();
2771         }
2772         return 0;
2773 }
2774
2775 static inline int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2776                              unsigned int nr_pages)
2777 {
2778         if (mem_cgroup_is_root(memcg))
2779                 return 0;
2780
2781         return try_charge_memcg(memcg, gfp_mask, nr_pages);
2782 }
2783
2784 static inline void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2785 {
2786         if (mem_cgroup_is_root(memcg))
2787                 return;
2788
2789         page_counter_uncharge(&memcg->memory, nr_pages);
2790         if (do_memsw_account())
2791                 page_counter_uncharge(&memcg->memsw, nr_pages);
2792 }
2793
2794 static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
2795 {
2796         VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
2797         /*
2798          * Any of the following ensures page's memcg stability:
2799          *
2800          * - the page lock
2801          * - LRU isolation
2802          * - lock_page_memcg()
2803          * - exclusive reference
2804          */
2805         folio->memcg_data = (unsigned long)memcg;
2806 }
2807
2808 #ifdef CONFIG_MEMCG_KMEM
/*
 * GFP bits that are stripped before the array of objcg pointers is
 * allocated: the array is not accounted directly, it should not come from
 * a DMA buffer, and it is not readily reclaimable.
 */
#define OBJCGS_CLEAR_MASK	(__GFP_ACCOUNT | __GFP_RECLAIMABLE | __GFP_DMA)
2815
2816 /*
2817  * mod_objcg_mlstate() may be called with irq enabled, so
2818  * mod_memcg_lruvec_state() should be used.
2819  */
2820 static inline void mod_objcg_mlstate(struct obj_cgroup *objcg,
2821                                      struct pglist_data *pgdat,
2822                                      enum node_stat_item idx, int nr)
2823 {
2824         struct mem_cgroup *memcg;
2825         struct lruvec *lruvec;
2826
2827         rcu_read_lock();
2828         memcg = obj_cgroup_memcg(objcg);
2829         lruvec = mem_cgroup_lruvec(memcg, pgdat);
2830         mod_memcg_lruvec_state(lruvec, idx, nr);
2831         rcu_read_unlock();
2832 }
2833
2834 int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
2835                                  gfp_t gfp, bool new_slab)
2836 {
2837         unsigned int objects = objs_per_slab(s, slab);
2838         unsigned long memcg_data;
2839         void *vec;
2840
2841         gfp &= ~OBJCGS_CLEAR_MASK;
2842         vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
2843                            slab_nid(slab));
2844         if (!vec)
2845                 return -ENOMEM;
2846
2847         memcg_data = (unsigned long) vec | MEMCG_DATA_OBJCGS;
2848         if (new_slab) {
2849                 /*
2850                  * If the slab is brand new and nobody can yet access its
2851                  * memcg_data, no synchronization is required and memcg_data can
2852                  * be simply assigned.
2853                  */
2854                 slab->memcg_data = memcg_data;
2855         } else if (cmpxchg(&slab->memcg_data, 0, memcg_data)) {
2856                 /*
2857                  * If the slab is already in use, somebody can allocate and
2858                  * assign obj_cgroups in parallel. In this case the existing
2859                  * objcg vector should be reused.
2860                  */
2861                 kfree(vec);
2862                 return 0;
2863         }
2864
2865         kmemleak_not_leak(vec);
2866         return 0;
2867 }
2868
2869 /*
2870  * Returns a pointer to the memory cgroup to which the kernel object is charged.
2871  *
2872  * A passed kernel object can be a slab object or a generic kernel page, so
2873  * different mechanisms for getting the memory cgroup pointer should be used.
2874  * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
2875  * can not know for sure how the kernel object is implemented.
2876  * mem_cgroup_from_obj() can be safely used in such cases.
2877  *
2878  * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
2879  * cgroup_mutex, etc.
2880  */
2881 struct mem_cgroup *mem_cgroup_from_obj(void *p)
2882 {
2883         struct folio *folio;
2884
2885         if (mem_cgroup_disabled())
2886                 return NULL;
2887
2888         folio = virt_to_folio(p);
2889
2890         /*
2891          * Slab objects are accounted individually, not per-page.
2892          * Memcg membership data for each individual object is saved in
2893          * slab->memcg_data.
2894          */
2895         if (folio_test_slab(folio)) {
2896                 struct obj_cgroup **objcgs;
2897                 struct slab *slab;
2898                 unsigned int off;
2899
2900                 slab = folio_slab(folio);
2901                 objcgs = slab_objcgs(slab);
2902                 if (!objcgs)
2903                         return NULL;
2904
2905                 off = obj_to_index(slab->slab_cache, slab, p);
2906                 if (objcgs[off])
2907                         return obj_cgroup_memcg(objcgs[off]);
2908
2909                 return NULL;
2910         }
2911
2912         /*
2913          * page_memcg_check() is used here, because in theory we can encounter
2914          * a folio where the slab flag has been cleared already, but
2915          * slab->memcg_data has not been freed yet
2916          * page_memcg_check(page) will guarantee that a proper memory
2917          * cgroup pointer or NULL will be returned.
2918          */
2919         return page_memcg_check(folio_page(folio, 0));
2920 }
2921
2922 __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
2923 {
2924         struct obj_cgroup *objcg = NULL;
2925         struct mem_cgroup *memcg;
2926
2927         if (memcg_kmem_bypass())
2928                 return NULL;
2929
2930         rcu_read_lock();
2931         if (unlikely(active_memcg()))
2932                 memcg = active_memcg();
2933         else
2934                 memcg = mem_cgroup_from_task(current);
2935
2936         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
2937                 objcg = rcu_dereference(memcg->objcg);
2938                 if (objcg && obj_cgroup_tryget(objcg))
2939                         break;
2940                 objcg = NULL;
2941         }
2942         rcu_read_unlock();
2943
2944         return objcg;
2945 }
2946
2947 static int memcg_alloc_cache_id(void)
2948 {
2949         int id, size;
2950         int err;
2951
2952         id = ida_simple_get(&memcg_cache_ida,
2953                             0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2954         if (id < 0)
2955                 return id;
2956
2957         if (id < memcg_nr_cache_ids)
2958                 return id;
2959
2960         /*
2961          * There's no space for the new id in memcg_caches arrays,
2962          * so we have to grow them.
2963          */
2964         down_write(&memcg_cache_ids_sem);
2965
2966         size = 2 * (id + 1);
2967         if (size < MEMCG_CACHES_MIN_SIZE)
2968                 size = MEMCG_CACHES_MIN_SIZE;
2969         else if (size > MEMCG_CACHES_MAX_SIZE)
2970                 size = MEMCG_CACHES_MAX_SIZE;
2971
2972         err = memcg_update_all_list_lrus(size);
2973         if (!err)
2974                 memcg_nr_cache_ids = size;
2975
2976         up_write(&memcg_cache_ids_sem);
2977
2978         if (err) {
2979                 ida_simple_remove(&memcg_cache_ida, id);
2980                 return err;
2981         }
2982         return id;
2983 }
2984
2985 static void memcg_free_cache_id(int id)
2986 {
2987         ida_simple_remove(&memcg_cache_ida, id);
2988 }
2989
2990 static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages)
2991 {
2992         mod_memcg_state(memcg, MEMCG_KMEM, nr_pages);
2993         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
2994                 if (nr_pages > 0)
2995                         page_counter_charge(&memcg->kmem, nr_pages);
2996                 else
2997                         page_counter_uncharge(&memcg->kmem, -nr_pages);
2998         }
2999 }
3000
3001
3002 /*
3003  * obj_cgroup_uncharge_pages: uncharge a number of kernel pages from a objcg
3004  * @objcg: object cgroup to uncharge
3005  * @nr_pages: number of pages to uncharge
3006  */
3007 static void obj_cgroup_uncharge_pages(struct obj_cgroup *objcg,
3008                                       unsigned int nr_pages)
3009 {
3010         struct mem_cgroup *memcg;
3011
3012         memcg = get_mem_cgroup_from_objcg(objcg);
3013
3014         memcg_account_kmem(memcg, -nr_pages);
3015         refill_stock(memcg, nr_pages);
3016
3017         css_put(&memcg->css);
3018 }
3019
3020 /*
3021  * obj_cgroup_charge_pages: charge a number of kernel pages to a objcg
3022  * @objcg: object cgroup to charge
3023  * @gfp: reclaim mode
3024  * @nr_pages: number of pages to charge
3025  *
3026  * Returns 0 on success, an error code on failure.
3027  */
3028 static int obj_cgroup_charge_pages(struct obj_cgroup *objcg, gfp_t gfp,
3029                                    unsigned int nr_pages)
3030 {
3031         struct mem_cgroup *memcg;
3032         int ret;
3033
3034         memcg = get_mem_cgroup_from_objcg(objcg);
3035
3036         ret = try_charge_memcg(memcg, gfp, nr_pages);
3037         if (ret)
3038                 goto out;
3039
3040         memcg_account_kmem(memcg, nr_pages);
3041 out:
3042         css_put(&memcg->css);
3043
3044         return ret;
3045 }
3046
3047 /**
3048  * __memcg_kmem_charge_page: charge a kmem page to the current memory cgroup
3049  * @page: page to charge
3050  * @gfp: reclaim mode
3051  * @order: allocation order
3052  *
3053  * Returns 0 on success, an error code on failure.
3054  */
3055 int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
3056 {
3057         struct obj_cgroup *objcg;
3058         int ret = 0;
3059
3060         objcg = get_obj_cgroup_from_current();
3061         if (objcg) {
3062                 ret = obj_cgroup_charge_pages(objcg, gfp, 1 << order);
3063                 if (!ret) {
3064                         page->memcg_data = (unsigned long)objcg |
3065                                 MEMCG_DATA_KMEM;
3066                         return 0;
3067                 }
3068                 obj_cgroup_put(objcg);
3069         }
3070         return ret;
3071 }
3072
3073 /**
3074  * __memcg_kmem_uncharge_page: uncharge a kmem page
3075  * @page: page to uncharge
3076  * @order: allocation order
3077  */
3078 void __memcg_kmem_uncharge_page(struct page *page, int order)
3079 {
3080         struct folio *folio = page_folio(page);
3081         struct obj_cgroup *objcg;
3082         unsigned int nr_pages = 1 << order;
3083
3084         if (!folio_memcg_kmem(folio))
3085                 return;
3086
3087         objcg = __folio_objcg(folio);
3088         obj_cgroup_uncharge_pages(objcg, nr_pages);
3089         folio->memcg_data = 0;
3090         obj_cgroup_put(objcg);
3091 }
3092
3093 void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
3094                      enum node_stat_item idx, int nr)
3095 {
3096         struct memcg_stock_pcp *stock;
3097         struct obj_cgroup *old = NULL;
3098         unsigned long flags;
3099         int *bytes;
3100
3101         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3102         stock = this_cpu_ptr(&memcg_stock);
3103
3104         /*
3105          * Save vmstat data in stock and skip vmstat array update unless
3106          * accumulating over a page of vmstat data or when pgdat or idx
3107          * changes.
3108          */
3109         if (stock->cached_objcg != objcg) {
3110                 old = drain_obj_stock(stock);
3111                 obj_cgroup_get(objcg);
3112                 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3113                                 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3114                 stock->cached_objcg = objcg;
3115                 stock->cached_pgdat = pgdat;
3116         } else if (stock->cached_pgdat != pgdat) {
3117                 /* Flush the existing cached vmstat data */
3118                 struct pglist_data *oldpg = stock->cached_pgdat;
3119
3120                 if (stock->nr_slab_reclaimable_b) {
3121                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_RECLAIMABLE_B,
3122                                           stock->nr_slab_reclaimable_b);
3123                         stock->nr_slab_reclaimable_b = 0;
3124                 }
3125                 if (stock->nr_slab_unreclaimable_b) {
3126                         mod_objcg_mlstate(objcg, oldpg, NR_SLAB_UNRECLAIMABLE_B,
3127                                           stock->nr_slab_unreclaimable_b);
3128                         stock->nr_slab_unreclaimable_b = 0;
3129                 }
3130                 stock->cached_pgdat = pgdat;
3131         }
3132
3133         bytes = (idx == NR_SLAB_RECLAIMABLE_B) ? &stock->nr_slab_reclaimable_b
3134                                                : &stock->nr_slab_unreclaimable_b;
3135         /*
3136          * Even for large object >= PAGE_SIZE, the vmstat data will still be
3137          * cached locally at least once before pushing it out.
3138          */
3139         if (!*bytes) {
3140                 *bytes = nr;
3141                 nr = 0;
3142         } else {
3143                 *bytes += nr;
3144                 if (abs(*bytes) > PAGE_SIZE) {
3145                         nr = *bytes;
3146                         *bytes = 0;
3147                 } else {
3148                         nr = 0;
3149                 }
3150         }
3151         if (nr)
3152                 mod_objcg_mlstate(objcg, pgdat, idx, nr);
3153
3154         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3155         if (old)
3156                 obj_cgroup_put(old);
3157 }
3158
3159 static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
3160 {
3161         struct memcg_stock_pcp *stock;
3162         unsigned long flags;
3163         bool ret = false;
3164
3165         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3166
3167         stock = this_cpu_ptr(&memcg_stock);
3168         if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
3169                 stock->nr_bytes -= nr_bytes;
3170                 ret = true;
3171         }
3172
3173         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3174
3175         return ret;
3176 }
3177
3178 static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
3179 {
3180         struct obj_cgroup *old = stock->cached_objcg;
3181
3182         if (!old)
3183                 return NULL;
3184
3185         if (stock->nr_bytes) {
3186                 unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3187                 unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
3188
3189                 if (nr_pages) {
3190                         struct mem_cgroup *memcg;
3191
3192                         memcg = get_mem_cgroup_from_objcg(old);
3193
3194                         memcg_account_kmem(memcg, -nr_pages);
3195                         __refill_stock(memcg, nr_pages);
3196
3197                         css_put(&memcg->css);
3198                 }
3199
3200                 /*
3201                  * The leftover is flushed to the centralized per-memcg value.
3202                  * On the next attempt to refill obj stock it will be moved
3203                  * to a per-cpu stock (probably, on an other CPU), see
3204                  * refill_obj_stock().
3205                  *
3206                  * How often it's flushed is a trade-off between the memory
3207                  * limit enforcement accuracy and potential CPU contention,
3208                  * so it might be changed in the future.
3209                  */
3210                 atomic_add(nr_bytes, &old->nr_charged_bytes);
3211                 stock->nr_bytes = 0;
3212         }
3213
3214         /*
3215          * Flush the vmstat data in current stock
3216          */
3217         if (stock->nr_slab_reclaimable_b || stock->nr_slab_unreclaimable_b) {
3218                 if (stock->nr_slab_reclaimable_b) {
3219                         mod_objcg_mlstate(old, stock->cached_pgdat,
3220                                           NR_SLAB_RECLAIMABLE_B,
3221                                           stock->nr_slab_reclaimable_b);
3222                         stock->nr_slab_reclaimable_b = 0;
3223                 }
3224                 if (stock->nr_slab_unreclaimable_b) {
3225                         mod_objcg_mlstate(old, stock->cached_pgdat,
3226                                           NR_SLAB_UNRECLAIMABLE_B,
3227                                           stock->nr_slab_unreclaimable_b);
3228                         stock->nr_slab_unreclaimable_b = 0;
3229                 }
3230                 stock->cached_pgdat = NULL;
3231         }
3232
3233         stock->cached_objcg = NULL;
3234         /*
3235          * The `old' objects needs to be released by the caller via
3236          * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
3237          */
3238         return old;
3239 }
3240
3241 static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
3242                                      struct mem_cgroup *root_memcg)
3243 {
3244         struct mem_cgroup *memcg;
3245
3246         if (stock->cached_objcg) {
3247                 memcg = obj_cgroup_memcg(stock->cached_objcg);
3248                 if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
3249                         return true;
3250         }
3251
3252         return false;
3253 }
3254
3255 static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
3256                              bool allow_uncharge)
3257 {
3258         struct memcg_stock_pcp *stock;
3259         struct obj_cgroup *old = NULL;
3260         unsigned long flags;
3261         unsigned int nr_pages = 0;
3262
3263         local_lock_irqsave(&memcg_stock.stock_lock, flags);
3264
3265         stock = this_cpu_ptr(&memcg_stock);
3266         if (stock->cached_objcg != objcg) { /* reset if necessary */
3267                 old = drain_obj_stock(stock);
3268                 obj_cgroup_get(objcg);
3269                 stock->cached_objcg = objcg;
3270                 stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
3271                                 ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
3272                 allow_uncharge = true;  /* Allow uncharge when objcg changes */
3273         }
3274         stock->nr_bytes += nr_bytes;
3275
3276         if (allow_uncharge && (stock->nr_bytes > PAGE_SIZE)) {
3277                 nr_pages = stock->nr_bytes >> PAGE_SHIFT;
3278                 stock->nr_bytes &= (PAGE_SIZE - 1);
3279         }
3280
3281         local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
3282         if (old)
3283                 obj_cgroup_put(old);
3284
3285         if (nr_pages)
3286                 obj_cgroup_uncharge_pages(objcg, nr_pages);
3287 }
3288
3289 int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
3290 {
3291         unsigned int nr_pages, nr_bytes;
3292         int ret;
3293
3294         if (consume_obj_stock(objcg, size))
3295                 return 0;
3296
3297         /*
3298          * In theory, objcg->nr_charged_bytes can have enough
3299          * pre-charged bytes to satisfy the allocation. However,
3300          * flushing objcg->nr_charged_bytes requires two atomic
3301          * operations, and objcg->nr_charged_bytes can't be big.
3302          * The shared objcg->nr_charged_bytes can also become a
3303          * performance bottleneck if all tasks of the same memcg are
3304          * trying to update it. So it's better to ignore it and try
3305          * grab some new pages. The stock's nr_bytes will be flushed to
3306          * objcg->nr_charged_bytes later on when objcg changes.
3307          *
3308          * The stock's nr_bytes may contain enough pre-charged bytes
3309          * to allow one less page from being charged, but we can't rely
3310          * on the pre-charged bytes not being changed outside of
3311          * consume_obj_stock() or refill_obj_stock(). So ignore those
3312          * pre-charged bytes as well when charging pages. To avoid a
3313          * page uncharge right after a page charge, we set the
3314          * allow_uncharge flag to false when calling refill_obj_stock()
3315          * to temporarily allow the pre-charged bytes to exceed the page
3316          * size limit. The maximum reachable value of the pre-charged
3317          * bytes is (sizeof(object) + PAGE_SIZE - 2) if there is no data
3318          * race.
3319          */
3320         nr_pages = size >> PAGE_SHIFT;
3321         nr_bytes = size & (PAGE_SIZE - 1);
3322
3323         if (nr_bytes)
3324                 nr_pages += 1;
3325
3326         ret = obj_cgroup_charge_pages(objcg, gfp, nr_pages);
3327         if (!ret && nr_bytes)
3328                 refill_obj_stock(objcg, PAGE_SIZE - nr_bytes, false);
3329
3330         return ret;
3331 }
3332
3333 void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
3334 {
3335         refill_obj_stock(objcg, size, true);
3336 }
3337
3338 #endif /* CONFIG_MEMCG_KMEM */
3339
3340 /*
3341  * Because page_memcg(head) is not set on tails, set it now.
3342  */
3343 void split_page_memcg(struct page *head, unsigned int nr)
3344 {
3345         struct folio *folio = page_folio(head);
3346         struct mem_cgroup *memcg = folio_memcg(folio);
3347         int i;
3348
3349         if (mem_cgroup_disabled() || !memcg)
3350                 return;
3351
3352         for (i = 1; i < nr; i++)
3353                 folio_page(folio, i)->memcg_data = folio->memcg_data;
3354
3355         if (folio_memcg_kmem(folio))
3356                 obj_cgroup_get_many(__folio_objcg(folio), nr - 1);
3357         else
3358                 css_get_many(&memcg->css, nr - 1);
3359 }
3360
3361 #ifdef CONFIG_MEMCG_SWAP
3362 /**
3363  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
3364  * @entry: swap entry to be moved
3365  * @from:  mem_cgroup which the entry is moved from
3366  * @to:  mem_cgroup which the entry is moved to
3367  *
3368  * It succeeds only when the swap_cgroup's record for this entry is the same
3369  * as the mem_cgroup's id of @from.
3370  *
3371  * Returns 0 on success, -EINVAL on failure.
3372  *
3373  * The caller must have charged to @to, IOW, called page_counter_charge() about
3374  * both res and memsw, and called css_get().
3375  */
3376 static int mem_cgroup_move_swap_account(swp_entry_t entry,
3377                                 struct mem_cgroup *from, struct mem_cgroup *to)
3378 {
3379         unsigned short old_id, new_id;
3380
3381         old_id = mem_cgroup_id(from);
3382         new_id = mem_cgroup_id(to);
3383
3384         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
3385                 mod_memcg_state(from, MEMCG_SWAP, -1);
3386                 mod_memcg_state(to, MEMCG_SWAP, 1);
3387                 return 0;
3388         }
3389         return -EINVAL;
3390 }
3391 #else
3392 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
3393                                 struct mem_cgroup *from, struct mem_cgroup *to)
3394 {
3395         return -EINVAL;
3396 }
3397 #endif
3398
/*
 * Held around the limit updates performed by mem_cgroup_resize_max() and
 * memcg_update_tcp_max().
 */
static DEFINE_MUTEX(memcg_max_mutex);

/*
 * Set a new hard limit on one of @memcg's page counters.
 *
 * @memcg: group whose limit is changed
 * @max:   new limit, handed unchanged to page_counter_set_max()
 * @memsw: true to update the memory+swap counter, false for the memory
 *         counter
 *
 * When page_counter_set_max() refuses the new limit, the stocks are drained
 * once via drain_all_stock() and the update is retried; every later failure
 * is followed by a reclaim attempt before the next retry.
 *
 * Returns 0 on success, -EINTR if a signal is pending for the current task,
 * -EINVAL if the new value would break memory.max <= memsw.max, and -EBUSY
 * if a reclaim attempt made no progress.
 */
static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
                                 unsigned long max, bool memsw)
{
        bool enlarge = false;
        bool drained = false;
        int ret;
        bool limits_invariant;
        struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;

        do {
                /* Checked on every iteration so the retry loop stays interruptible. */
                if (signal_pending(current)) {
                        ret = -EINTR;
                        break;
                }

                mutex_lock(&memcg_max_mutex);
                /*
                 * Make sure that the new limit (memsw or memory limit) doesn't
                 * break our basic invariant rule memory.max <= memsw.max.
                 */
                limits_invariant = memsw ? max >= READ_ONCE(memcg->memory.max) :
                                           max <= memcg->memsw.max;
                if (!limits_invariant) {
                        mutex_unlock(&memcg_max_mutex);
                        ret = -EINVAL;
                        break;
                }
                /* Remember that the limit was raised; acted upon after the loop. */
                if (max > counter->max)
                        enlarge = true;
                ret = page_counter_set_max(counter, max);
                mutex_unlock(&memcg_max_mutex);

                if (!ret)
                        break;

                /* First failure: drain the stocks once and try again. */
                if (!drained) {
                        drain_all_stock(memcg);
                        drained = true;
                        continue;
                }

                /* Later failures: reclaim, and give up once nothing is freed. */
                if (!try_to_free_mem_cgroup_pages(memcg, 1,
                                        GFP_KERNEL, !memsw)) {
                        ret = -EBUSY;
                        break;
                }
        } while (true);

        /* A successfully raised limit triggers memcg_oom_recover(). */
        if (!ret && enlarge)
                memcg_oom_recover(memcg);

        return ret;
}
3454
/*
 * Soft limit reclaim for the node described by @pgdat.
 *
 * @pgdat:         node whose soft limit tree is consulted
 * @order:         allocation order; anything above 0 is not handled here
 * @gfp_mask:      passed through to mem_cgroup_soft_reclaim()
 * @total_scanned: incremented by the number of pages scanned
 *
 * Repeatedly picks a candidate from the node's soft limit tree, reclaims
 * from it, and re-inserts it according to its remaining excess. The loop
 * ends as soon as something was reclaimed, when no candidate is left, or
 * after more than MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS fruitless rounds.
 *
 * Returns the sum of the values returned by mem_cgroup_soft_reclaim();
 * 0 when @order > 0 or the tree is missing or empty.
 */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        unsigned long nr_reclaimed = 0;
        struct mem_cgroup_per_node *mz, *next_mz = NULL;
        unsigned long reclaimed;
        int loop = 0;
        struct mem_cgroup_tree_per_node *mctz;
        unsigned long excess;
        unsigned long nr_scanned;

        if (order > 0)
                return 0;

        mctz = soft_limit_tree.rb_tree_per_node[pgdat->node_id];

        /*
         * Do not even bother to check the largest node if the root
         * is empty. Do it lockless to prevent lock bouncing. Races
         * are acceptable as soft limit is best effort anyway.
         */
        if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
                return 0;

        /*
         * This loop can run a while, especially if mem_cgroups continuously
         * keep exceeding their soft limit and putting the system under
         * pressure.
         */
        do {
                /* Reuse the candidate chosen at the end of the previous round. */
                if (next_mz)
                        mz = next_mz;
                else
                        mz = mem_cgroup_largest_soft_limit_node(mctz);
                if (!mz)
                        break;

                nr_scanned = 0;
                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
                                                    gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
                spin_lock_irq(&mctz->lock);
                __mem_cgroup_remove_exceeded(mz, mctz);

                /*
                 * If we failed to reclaim anything from this memory cgroup
                 * it is time to move on to the next cgroup
                 */
                next_mz = NULL;
                if (!reclaimed)
                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

                excess = soft_limit_excess(mz->memcg);
                /*
                 * One school of thought says that we should not add
                 * back the node to the tree if reclaim returns 0.
                 * But our reclaim could return 0, simply because due
                 * to priority we are exposing a smaller subset of
                 * memory to reclaim from. Consider this as a longer
                 * term TODO.
                 */
                /* If excess == 0, no tree ops */
                __mem_cgroup_insert_exceeded(mz, mctz, excess);
                spin_unlock_irq(&mctz->lock);
                /*
                 * NOTE(review): this drops a css reference that is presumably
                 * taken by the *_largest_soft_limit_node() helpers - verify.
                 */
                css_put(&mz->memcg->css);
                loop++;
                /*
                 * Could not reclaim anything and there are no more
                 * mem cgroups to try or we seem to be looping without
                 * reclaiming anything.
                 */
                if (!nr_reclaimed &&
                        (next_mz == NULL ||
                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
                        break;
        } while (!nr_reclaimed);
        /* A candidate that was picked but never visited still holds its reference. */
        if (next_mz)
                css_put(&next_mz->memcg->css);
        return nr_reclaimed;
}
3537
3538 /*
3539  * Reclaims as many pages from the given memcg as possible.
3540  *
3541  * Caller is responsible for holding css reference for memcg.
3542  */
3543 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
3544 {
3545         int nr_retries = MAX_RECLAIM_RETRIES;
3546
3547         /* we call try-to-free pages for make this cgroup empty */
3548         lru_add_drain_all();
3549
3550         drain_all_stock(memcg);
3551
3552         /* try to free all pages in this cgroup */
3553         while (nr_retries && page_counter_read(&memcg->memory)) {
3554                 if (signal_pending(current))
3555                         return -EINTR;
3556
3557                 if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true))
3558                         nr_retries--;
3559         }
3560
3561         return 0;
3562 }
3563
3564 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
3565                                             char *buf, size_t nbytes,
3566                                             loff_t off)
3567 {
3568         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3569
3570         if (mem_cgroup_is_root(memcg))
3571                 return -EINVAL;
3572         return mem_cgroup_force_empty(memcg) ?: nbytes;
3573 }
3574
/*
 * Read handler for the hierarchy setting: always reports 1, the only value
 * mem_cgroup_hierarchy_write() accepts. @css and @cft are unused.
 */
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
                                     struct cftype *cft)
{
        return 1;
}
3580
3581 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
3582                                       struct cftype *cft, u64 val)
3583 {
3584         if (val == 1)
3585                 return 0;
3586
3587         pr_warn_once("Non-hierarchical mode is deprecated. "
3588                      "Please report your usecase to linux-mm@kvack.org if you "
3589                      "depend on this functionality.\n");
3590
3591         return -EINVAL;
3592 }
3593
3594 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
3595 {
3596         unsigned long val;
3597
3598         if (mem_cgroup_is_root(memcg)) {
3599                 mem_cgroup_flush_stats();
3600                 val = memcg_page_state(memcg, NR_FILE_PAGES) +
3601                         memcg_page_state(memcg, NR_ANON_MAPPED);
3602                 if (swap)
3603                         val += memcg_page_state(memcg, MEMCG_SWAP);
3604         } else {
3605                 if (!swap)
3606                         val = page_counter_read(&memcg->memory);
3607                 else
3608                         val = page_counter_read(&memcg->memsw);
3609         }
3610         return val;
3611 }
3612
/*
 * Attribute selectors for the resource files, extracted from
 * cftype->private with MEMFILE_ATTR().
 */
enum {
        RES_USAGE,              /* current usage of the counter */
        RES_LIMIT,              /* counter->max */
        RES_MAX_USAGE,          /* counter->watermark */
        RES_FAILCNT,            /* counter->failcnt */
        RES_SOFT_LIMIT,         /* memcg->soft_limit */
};
3620
3621 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3622                                struct cftype *cft)
3623 {
3624         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3625         struct page_counter *counter;
3626
3627         switch (MEMFILE_TYPE(cft->private)) {
3628         case _MEM:
3629                 counter = &memcg->memory;
3630                 break;
3631         case _MEMSWAP:
3632                 counter = &memcg->memsw;
3633                 break;
3634         case _KMEM:
3635                 counter = &memcg->kmem;
3636                 break;
3637         case _TCP:
3638                 counter = &memcg->tcpmem;
3639                 break;
3640         default:
3641                 BUG();
3642         }
3643
3644         switch (MEMFILE_ATTR(cft->private)) {
3645         case RES_USAGE:
3646                 if (counter == &memcg->memory)
3647                         return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3648                 if (counter == &memcg->memsw)
3649                         return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3650                 return (u64)page_counter_read(counter) * PAGE_SIZE;
3651         case RES_LIMIT:
3652                 return (u64)counter->max * PAGE_SIZE;
3653         case RES_MAX_USAGE:
3654                 return (u64)counter->watermark * PAGE_SIZE;
3655         case RES_FAILCNT:
3656                 return counter->failcnt;
3657         case RES_SOFT_LIMIT:
3658                 return (u64)memcg->soft_limit * PAGE_SIZE;
3659         default:
3660                 BUG();
3661         }
3662 }
3663
3664 #ifdef CONFIG_MEMCG_KMEM
3665 static int memcg_online_kmem(struct mem_cgroup *memcg)
3666 {
3667         struct obj_cgroup *objcg;
3668         int memcg_id;
3669
3670         if (cgroup_memory_nokmem)
3671                 return 0;
3672
3673         if (unlikely(mem_cgroup_is_root(memcg)))
3674                 return 0;
3675
3676         memcg_id = memcg_alloc_cache_id();
3677         if (memcg_id < 0)
3678                 return memcg_id;
3679
3680         objcg = obj_cgroup_alloc();
3681         if (!objcg) {
3682                 memcg_free_cache_id(memcg_id);
3683                 return -ENOMEM;
3684         }
3685         objcg->memcg = memcg;
3686         rcu_assign_pointer(memcg->objcg, objcg);
3687
3688         static_branch_enable(&memcg_kmem_enabled_key);
3689
3690         memcg->kmemcg_id = memcg_id;
3691
3692         return 0;
3693 }
3694
3695 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3696 {
3697         struct mem_cgroup *parent;
3698         int kmemcg_id;
3699
3700         if (cgroup_memory_nokmem)
3701                 return;
3702
3703         if (unlikely(mem_cgroup_is_root(memcg)))
3704                 return;
3705
3706         parent = parent_mem_cgroup(memcg);
3707         if (!parent)
3708                 parent = root_mem_cgroup;
3709
3710         memcg_reparent_objcgs(memcg, parent);
3711
3712         kmemcg_id = memcg->kmemcg_id;
3713
3714         /*
3715          * After we have finished memcg_reparent_objcgs(), all list_lrus
3716          * corresponding to this cgroup are guaranteed to remain empty.
3717          * The ordering is imposed by list_lru_node->lock taken by
3718          * memcg_drain_all_list_lrus().
3719          */
3720         memcg_drain_all_list_lrus(kmemcg_id, parent);
3721
3722         memcg_free_cache_id(kmemcg_id);
3723 }
3724 #else
/* !CONFIG_MEMCG_KMEM: there is no kmem state to set up; always succeeds. */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
        return 0;
}
/* !CONFIG_MEMCG_KMEM: there is no kmem state to tear down. */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
3732 #endif /* CONFIG_MEMCG_KMEM */
3733
3734 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3735 {
3736         int ret;
3737
3738         mutex_lock(&memcg_max_mutex);
3739
3740         ret = page_counter_set_max(&memcg->tcpmem, max);
3741         if (ret)
3742                 goto out;
3743
3744         if (!memcg->tcpmem_active) {
3745                 /*
3746                  * The active flag needs to be written after the static_key
3747                  * update. This is what guarantees that the socket activation
3748                  * function is the last one to run. See mem_cgroup_sk_alloc()
3749                  * for details, and note that we don't mark any socket as
3750                  * belonging to this memcg until that flag is up.
3751                  *
3752                  * We need to do this, because static_keys will span multiple
3753                  * sites, but we can't control their order. If we mark a socket
3754                  * as accounted, but the accounting functions are not patched in
3755                  * yet, we'll lose accounting.
3756                  *
3757                  * We never race with the readers in mem_cgroup_sk_alloc(),
3758                  * because when this value change, the code to process it is not
3759                  * patched in yet.
3760                  */
3761                 static_branch_inc(&memcg_sockets_enabled_key);
3762                 memcg->tcpmem_active = true;
3763         }
3764 out:
3765         mutex_unlock(&memcg_max_mutex);
3766         return ret;
3767 }
3768
3769 /*
3770  * The user of this function is...
3771  * RES_LIMIT.
3772  */
3773 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3774                                 char *buf, size_t nbytes, loff_t off)
3775 {
3776         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3777         unsigned long nr_pages;
3778         int ret;
3779
3780         buf = strstrip(buf);
3781         ret = page_counter_memparse(buf, "-1", &nr_pages);
3782         if (ret)
3783                 return ret;
3784
3785         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3786         case RES_LIMIT:
3787                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3788                         ret = -EINVAL;
3789                         break;
3790                 }
3791                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3792                 case _MEM:
3793                         ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3794                         break;
3795                 case _MEMSWAP:
3796                         ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3797                         break;
3798                 case _KMEM:
3799                         /* kmem.limit_in_bytes is deprecated. */
3800                         ret = -EOPNOTSUPP;
3801                         break;
3802                 case _TCP:
3803                         ret = memcg_update_tcp_max(memcg, nr_pages);
3804                         break;
3805                 }
3806                 break;
3807         case RES_SOFT_LIMIT:
3808                 if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
3809                         ret = -EOPNOTSUPP;
3810                 } else {
3811                         memcg->soft_limit = nr_pages;
3812                         ret = 0;
3813                 }
3814                 break;
3815         }
3816         return ret ?: nbytes;
3817 }
3818
3819 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3820                                 size_t nbytes, loff_t off)
3821 {
3822         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3823         struct page_counter *counter;
3824
3825         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3826         case _MEM:
3827                 counter = &memcg->memory;
3828                 break;
3829         case _MEMSWAP:
3830                 counter = &memcg->memsw;
3831                 break;
3832         case _KMEM:
3833                 counter = &memcg->kmem;
3834                 break;
3835         case _TCP:
3836                 counter = &memcg->tcpmem;
3837                 break;
3838         default:
3839                 BUG();
3840         }
3841
3842         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3843         case RES_MAX_USAGE:
3844                 page_counter_reset_watermark(counter);
3845                 break;
3846         case RES_FAILCNT:
3847                 counter->failcnt = 0;
3848                 break;
3849         default:
3850                 BUG();
3851         }
3852
3853         return nbytes;
3854 }
3855
3856 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3857                                         struct cftype *cft)
3858 {
3859         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3860 }
3861
3862 #ifdef CONFIG_MMU
3863 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3864                                         struct cftype *cft, u64 val)
3865 {
3866         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3867
3868         if (val & ~MOVE_MASK)
3869                 return -EINVAL;
3870
3871         /*
3872          * No kind of locking is needed in here, because ->can_attach() will
3873          * check this value once in the beginning of the process, and then carry
3874          * on with stale data. This means that changes to this value will only
3875          * affect task migrations starting after the change.
3876          */
3877         memcg->move_charge_at_immigrate = val;
3878         return 0;
3879 }
3880 #else
3881 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3882                                         struct cftype *cft, u64 val)
3883 {
3884         return -ENOSYS;
3885 }
3886 #endif
3887
3888 #ifdef CONFIG_NUMA
3889
/*
 * LRU list bitmasks: bit N selects the list with enum lru_list value N.
 * Users test a list with BIT(lru) & lru_mask.
 */
#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
/* One bit for each of the NR_LRU_LISTS lists. */
#define LRU_ALL      ((1 << NR_LRU_LISTS) - 1)
3893
3894 static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
3895                                 int nid, unsigned int lru_mask, bool tree)
3896 {
3897         struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
3898         unsigned long nr = 0;
3899         enum lru_list lru;
3900
3901         VM_BUG_ON((unsigned)nid >= nr_node_ids);
3902
3903         for_each_lru(lru) {
3904                 if (!(BIT(lru) & lru_mask))
3905                         continue;
3906                 if (tree)
3907                         nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
3908                 else
3909                         nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
3910         }
3911         return nr;
3912 }
3913
3914 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
3915                                              unsigned int lru_mask,
3916                                              bool tree)
3917 {
3918         unsigned long nr = 0;
3919         enum lru_list lru;
3920
3921         for_each_lru(lru) {
3922                 if (!(BIT(lru) & lru_mask))
3923                         continue;
3924                 if (tree)
3925                         nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
3926                 else
3927                         nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
3928         }
3929         return nr;
3930 }
3931
3932 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3933 {
3934         struct numa_stat {
3935                 const char *name;
3936                 unsigned int lru_mask;
3937         };
3938
3939         static const struct numa_stat stats[] = {
3940                 { "total", LRU_ALL },
3941                 { "file", LRU_ALL_FILE },
3942                 { "anon", LRU_ALL_ANON },
3943                 { "unevictable", BIT(LRU_UNEVICTABLE) },
3944         };
3945         const struct numa_stat *stat;
3946         int nid;
3947         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
3948
3949         mem_cgroup_flush_stats();
3950
3951         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3952                 seq_printf(m, "%s=%lu", stat->name,
3953                            mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3954                                                    false));
3955                 for_each_node_state(nid, N_MEMORY)
3956                         seq_printf(m, " N%d=%lu", nid,
3957                                    mem_cgroup_node_nr_lru_pages(memcg, nid,
3958                                                         stat->lru_mask, false));
3959                 seq_putc(m, '\n');
3960         }
3961
3962         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3963
3964                 seq_printf(m, "hierarchical_%s=%lu", stat->name,
3965                            mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
3966                                                    true));
3967                 for_each_node_state(nid, N_MEMORY)
3968                         seq_printf(m, " N%d=%lu", nid,
3969                                    mem_cgroup_node_nr_lru_pages(memcg, nid,
3970                                                         stat->lru_mask, true));
3971                 seq_putc(m, '\n');
3972         }
3973
3974         return 0;
3975 }
3976 #endif /* CONFIG_NUMA */
3977
/*
 * Counters reported by memcg_stat_show(). Entry i is printed under the name
 * memcg1_stat_names[i], so both arrays must keep the same order and the
 * same CONFIG_TRANSPARENT_HUGEPAGE conditionals.
 */
static const unsigned int memcg1_stats[] = {
        NR_FILE_PAGES,
        NR_ANON_MAPPED,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        NR_ANON_THPS,
#endif
        NR_SHMEM,
        NR_FILE_MAPPED,
        NR_FILE_DIRTY,
        NR_WRITEBACK,
        MEMCG_SWAP,
};
3990
/*
 * Names printed by memcg_stat_show() for the counters in memcg1_stats[],
 * index for index. A BUILD_BUG_ON() in memcg_stat_show() checks that both
 * arrays have the same number of entries.
 */
static const char *const memcg1_stat_names[] = {
        "cache",
        "rss",
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
        "rss_huge",
#endif
        "shmem",
        "mapped_file",
        "dirty",
        "writeback",
        "swap",
};
4003
/*
 * Universal VM events that cgroup1 shows, in their original sort order.
 * memcg_stat_show() prints them under the name given by vm_event_name().
 */
static const unsigned int memcg1_events[] = {
        PGPGIN,
        PGPGOUT,
        PGFAULT,
        PGMAJFAULT,
};
4011
4012 static int memcg_stat_show(struct seq_file *m, void *v)
4013 {
4014         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
4015         unsigned long memory, memsw;
4016         struct mem_cgroup *mi;
4017         unsigned int i;
4018
4019         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
4020
4021         mem_cgroup_flush_stats();
4022
4023         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4024                 unsigned long nr;
4025
4026                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4027                         continue;
4028                 nr = memcg_page_state_local(memcg, memcg1_stats[i]);
4029                 seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
4030         }
4031
4032         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4033                 seq_printf(m, "%s %lu\n", vm_event_name(memcg1_events[i]),
4034                            memcg_events_local(memcg, memcg1_events[i]));
4035
4036         for (i = 0; i < NR_LRU_LISTS; i++)
4037                 seq_printf(m, "%s %lu\n", lru_list_name(i),
4038                            memcg_page_state_local(memcg, NR_LRU_BASE + i) *
4039                            PAGE_SIZE);
4040
4041         /* Hierarchical information */
4042         memory = memsw = PAGE_COUNTER_MAX;
4043         for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
4044                 memory = min(memory, READ_ONCE(mi->memory.max));
4045                 memsw = min(memsw, READ_ONCE(mi->memsw.max));
4046         }
4047         seq_printf(m, "hierarchical_memory_limit %llu\n",
4048                    (u64)memory * PAGE_SIZE);
4049         if (do_memsw_account())
4050                 seq_printf(m, "hierarchical_memsw_limit %llu\n",
4051                            (u64)memsw * PAGE_SIZE);
4052
4053         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
4054                 unsigned long nr;
4055
4056                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
4057                         continue;
4058                 nr = memcg_page_state(memcg, memcg1_stats[i]);
4059                 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
4060                                                 (u64)nr * PAGE_SIZE);
4061         }
4062
4063         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
4064                 seq_printf(m, "total_%s %llu\n",
4065                            vm_event_name(memcg1_events[i]),
4066                            (u64)memcg_events(memcg, memcg1_events[i]));
4067
4068         for (i = 0; i < NR_LRU_LISTS; i++)
4069                 seq_printf(m, "total_%s %llu\n", lru_list_name(i),
4070                            (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
4071                            PAGE_SIZE);
4072
4073 #ifdef CONFIG_DEBUG_VM
4074         {
4075                 pg_data_t *pgdat;
4076                 struct mem_cgroup_per_node *mz;
4077                 unsigned long anon_cost = 0;
4078                 unsigned long file_cost = 0;
4079
4080                 for_each_online_pgdat(pgdat) {
4081                         mz = memcg->nodeinfo[pgdat->node_id];
4082
4083                         anon_cost += mz->lruvec.anon_cost;
4084                         file_cost += mz->lruvec.file_cost;
4085                 }
4086                 seq_printf(m, "anon_cost %lu\n", anon_cost);
4087                 seq_printf(m, "file_cost %lu\n", file_cost);
4088         }
4089 #endif
4090
4091         return 0;
4092 }
4093
4094 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
4095                                       struct cftype *cft)
4096 {
4097         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4098
4099         return mem_cgroup_swappiness(memcg);
4100 }
4101
4102 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
4103                                        struct cftype *cft, u64 val)
4104 {
4105         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4106
4107         if (val > 200)
4108                 return -EINVAL;
4109
4110         if (!mem_cgroup_is_root(memcg))
4111                 memcg->swappiness = val;
4112         else
4113                 vm_swappiness = val;
4114
4115         return 0;
4116 }
4117
/*
 * Signal the eventfds of all thresholds of @memcg that the usage has
 * crossed since the previous call, and record the new position.
 *
 * @swap selects the memory+swap thresholds and usage instead of the memory
 * ones. The primary threshold array is read under rcu_read_lock(); nothing
 * happens when no array is installed.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
        struct mem_cgroup_threshold_ary *t;
        unsigned long usage;
        int i;

        rcu_read_lock();
        if (!swap)
                t = rcu_dereference(memcg->thresholds.primary);
        else
                t = rcu_dereference(memcg->memsw_thresholds.primary);

        if (!t)
                goto unlock;

        usage = mem_cgroup_usage(memcg, swap);

        /*
         * current_threshold points to threshold just below or equal to usage.
         * If it's not true, a threshold was crossed after last
         * call of __mem_cgroup_threshold().
         */
        i = t->current_threshold;

        /*
         * Iterate backward over array of thresholds starting from
         * current_threshold and check if a threshold is crossed.
         * If none of thresholds below usage is crossed, we read
         * only one element of the array here.
         */
        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
                eventfd_signal(t->entries[i].eventfd, 1);

        /* Step to the first entry above the position reached so far. */
        i++;

        /*
         * Iterate forward over array of thresholds starting from
         * current_threshold+1 and check if a threshold is crossed.
         * If none of thresholds above usage is crossed, we read
         * only one element of the array here.
         */
        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
                eventfd_signal(t->entries[i].eventfd, 1);

        /* Update current_threshold */
        t->current_threshold = i - 1;
unlock:
        rcu_read_unlock();
}
4168
4169 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
4170 {
4171         while (memcg) {
4172                 __mem_cgroup_threshold(memcg, false);
4173                 if (do_memsw_account())
4174                         __mem_cgroup_threshold(memcg, true);
4175
4176                 memcg = parent_mem_cgroup(memcg);
4177         }
4178 }
4179
4180 static int compare_thresholds(const void *a, const void *b)
4181 {
4182         const struct mem_cgroup_threshold *_a = a;
4183         const struct mem_cgroup_threshold *_b = b;
4184
4185         if (_a->threshold > _b->threshold)
4186                 return 1;
4187
4188         if (_a->threshold < _b->threshold)
4189                 return -1;
4190
4191         return 0;
4192 }
4193
4194 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
4195 {
4196         struct mem_cgroup_eventfd_list *ev;
4197
4198         spin_lock(&memcg_oom_lock);
4199
4200         list_for_each_entry(ev, &memcg->oom_notify, list)
4201                 eventfd_signal(ev->eventfd, 1);
4202
4203         spin_unlock(&memcg_oom_lock);
4204         return 0;
4205 }
4206
4207 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
4208 {
4209         struct mem_cgroup *iter;
4210
4211         for_each_mem_cgroup_tree(iter, memcg)
4212                 mem_cgroup_oom_notify_cb(iter);
4213 }
4214
4215 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4216         struct eventfd_ctx *eventfd, const char *args, enum res_type type)
4217 {
4218         struct mem_cgroup_thresholds *thresholds;
4219         struct mem_cgroup_threshold_ary *new;
4220         unsigned long threshold;
4221         unsigned long usage;
4222         int i, size, ret;
4223
4224         ret = page_counter_memparse(args, "-1", &threshold);
4225         if (ret)
4226                 return ret;
4227
4228         mutex_lock(&memcg->thresholds_lock);
4229
4230         if (type == _MEM) {
4231                 thresholds = &memcg->thresholds;
4232                 usage = mem_cgroup_usage(memcg, false);
4233         } else if (type == _MEMSWAP) {
4234                 thresholds = &memcg->memsw_thresholds;
4235                 usage = mem_cgroup_usage(memcg, true);
4236         } else
4237                 BUG();
4238
4239         /* Check if a threshold crossed before adding a new one */
4240         if (thresholds->primary)
4241                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4242
4243         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
4244
4245         /* Allocate memory for new array of thresholds */
4246         new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
4247         if (!new) {
4248                 ret = -ENOMEM;
4249                 goto unlock;
4250         }
4251         new->size = size;
4252
4253         /* Copy thresholds (if any) to new array */
4254         if (thresholds->primary)
4255                 memcpy(new->entries, thresholds->primary->entries,
4256                        flex_array_size(new, entries, size - 1));
4257
4258         /* Add new threshold */
4259         new->entries[size - 1].eventfd = eventfd;
4260         new->entries[size - 1].threshold = threshold;
4261
4262         /* Sort thresholds. Registering of new threshold isn't time-critical */
4263         sort(new->entries, size, sizeof(*new->entries),
4264                         compare_thresholds, NULL);
4265
4266         /* Find current threshold */
4267         new->current_threshold = -1;
4268         for (i = 0; i < size; i++) {
4269                 if (new->entries[i].threshold <= usage) {
4270                         /*
4271                          * new->current_threshold will not be used until
4272                          * rcu_assign_pointer(), so it's safe to increment
4273                          * it here.
4274                          */
4275                         ++new->current_threshold;
4276                 } else
4277                         break;
4278         }
4279
4280         /* Free old spare buffer and save old primary buffer as spare */
4281         kfree(thresholds->spare);
4282         thresholds->spare = thresholds->primary;
4283
4284         rcu_assign_pointer(thresholds->primary, new);
4285
4286         /* To be sure that nobody uses thresholds */
4287         synchronize_rcu();
4288
4289 unlock:
4290         mutex_unlock(&memcg->thresholds_lock);
4291
4292         return ret;
4293 }
4294
4295 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
4296         struct eventfd_ctx *eventfd, const char *args)
4297 {
4298         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
4299 }
4300
4301 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
4302         struct eventfd_ctx *eventfd, const char *args)
4303 {
4304         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
4305 }
4306
4307 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4308         struct eventfd_ctx *eventfd, enum res_type type)
4309 {
4310         struct mem_cgroup_thresholds *thresholds;
4311         struct mem_cgroup_threshold_ary *new;
4312         unsigned long usage;
4313         int i, j, size, entries;
4314
4315         mutex_lock(&memcg->thresholds_lock);
4316
4317         if (type == _MEM) {
4318                 thresholds = &memcg->thresholds;
4319                 usage = mem_cgroup_usage(memcg, false);
4320         } else if (type == _MEMSWAP) {
4321                 thresholds = &memcg->memsw_thresholds;
4322                 usage = mem_cgroup_usage(memcg, true);
4323         } else
4324                 BUG();
4325
4326         if (!thresholds->primary)
4327                 goto unlock;
4328
4329         /* Check if a threshold crossed before removing */
4330         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
4331
4332         /* Calculate new number of threshold */
4333         size = entries = 0;
4334         for (i = 0; i < thresholds->primary->size; i++) {
4335                 if (thresholds->primary->entries[i].eventfd != eventfd)
4336                         size++;
4337                 else
4338                         entries++;
4339         }
4340
4341         new = thresholds->spare;
4342
4343         /* If no items related to eventfd have been cleared, nothing to do */
4344         if (!entries)
4345                 goto unlock;
4346
4347         /* Set thresholds array to NULL if we don't have thresholds */
4348         if (!size) {
4349                 kfree(new);
4350                 new = NULL;
4351                 goto swap_buffers;
4352         }
4353
4354         new->size = size;
4355
4356         /* Copy thresholds and find current threshold */
4357         new->current_threshold = -1;
4358         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
4359                 if (thresholds->primary->entries[i].eventfd == eventfd)
4360                         continue;
4361
4362                 new->entries[j] = thresholds->primary->entries[i];
4363                 if (new->entries[j].threshold <= usage) {
4364                         /*
4365                          * new->current_threshold will not be used
4366                          * until rcu_assign_pointer(), so it's safe to increment
4367                          * it here.
4368                          */
4369                         ++new->current_threshold;
4370                 }
4371                 j++;
4372         }
4373
4374 swap_buffers:
4375         /* Swap primary and spare array */
4376         thresholds->spare = thresholds->primary;
4377
4378         rcu_assign_pointer(thresholds->primary, new);
4379
4380         /* To be sure that nobody uses thresholds */
4381         synchronize_rcu();
4382
4383         /* If all events are unregistered, free the spare array */
4384         if (!new) {
4385                 kfree(thresholds->spare);
4386                 thresholds->spare = NULL;
4387         }
4388 unlock:
4389         mutex_unlock(&memcg->thresholds_lock);
4390 }
4391
4392 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4393         struct eventfd_ctx *eventfd)
4394 {
4395         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
4396 }
4397
4398 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
4399         struct eventfd_ctx *eventfd)
4400 {
4401         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
4402 }
4403
4404 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
4405         struct eventfd_ctx *eventfd, const char *args)
4406 {
4407         struct mem_cgroup_eventfd_list *event;
4408
4409         event = kmalloc(sizeof(*event), GFP_KERNEL);
4410         if (!event)
4411                 return -ENOMEM;
4412
4413         spin_lock(&memcg_oom_lock);
4414
4415         event->eventfd = eventfd;
4416         list_add(&event->list, &memcg->oom_notify);
4417
4418         /* already in OOM ? */
4419         if (memcg->under_oom)
4420                 eventfd_signal(eventfd, 1);
4421         spin_unlock(&memcg_oom_lock);
4422
4423         return 0;
4424 }
4425
4426 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
4427         struct eventfd_ctx *eventfd)
4428 {
4429         struct mem_cgroup_eventfd_list *ev, *tmp;
4430
4431         spin_lock(&memcg_oom_lock);
4432
4433         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
4434                 if (ev->eventfd == eventfd) {
4435                         list_del(&ev->list);
4436                         kfree(ev);
4437                 }
4438         }
4439
4440         spin_unlock(&memcg_oom_lock);
4441 }
4442
4443 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
4444 {
4445         struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
4446
4447         seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
4448         seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
4449         seq_printf(sf, "oom_kill %lu\n",
4450                    atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
4451         return 0;
4452 }
4453
4454 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
4455         struct cftype *cft, u64 val)
4456 {
4457         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4458
4459         /* cannot set to root cgroup and only 0 and 1 are allowed */
4460         if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
4461                 return -EINVAL;
4462
4463         memcg->oom_kill_disable = val;
4464         if (!val)
4465                 memcg_oom_recover(memcg);
4466
4467         return 0;
4468 }
4469
4470 #ifdef CONFIG_CGROUP_WRITEBACK
4471
4472 #include <trace/events/writeback.h>
4473
4474 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
4475 {
4476         return wb_domain_init(&memcg->cgwb_domain, gfp);
4477 }
4478
4479 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
4480 {
4481         wb_domain_exit(&memcg->cgwb_domain);
4482 }
4483
4484 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
4485 {
4486         wb_domain_size_changed(&memcg->cgwb_domain);
4487 }
4488
4489 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
4490 {
4491         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4492
4493         if (!memcg->css.parent)
4494                 return NULL;
4495
4496         return &memcg->cgwb_domain;
4497 }
4498
4499 /**
4500  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
4501  * @wb: bdi_writeback in question
4502  * @pfilepages: out parameter for number of file pages
4503  * @pheadroom: out parameter for number of allocatable pages according to memcg
4504  * @pdirty: out parameter for number of dirty pages
4505  * @pwriteback: out parameter for number of pages under writeback
4506  *
4507  * Determine the numbers of file, headroom, dirty, and writeback pages in
4508  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
4509  * is a bit more involved.
4510  *
4511  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
4512  * headroom is calculated as the lowest headroom of itself and the
4513  * ancestors.  Note that this doesn't consider the actual amount of
4514  * available memory in the system.  The caller should further cap
4515  * *@pheadroom accordingly.
4516  */
4517 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
4518                          unsigned long *pheadroom, unsigned long *pdirty,
4519                          unsigned long *pwriteback)
4520 {
4521         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4522         struct mem_cgroup *parent;
4523
4524         mem_cgroup_flush_stats();
4525
4526         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
4527         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
4528         *pfilepages = memcg_page_state(memcg, NR_INACTIVE_FILE) +
4529                         memcg_page_state(memcg, NR_ACTIVE_FILE);
4530
4531         *pheadroom = PAGE_COUNTER_MAX;
4532         while ((parent = parent_mem_cgroup(memcg))) {
4533                 unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
4534                                             READ_ONCE(memcg->memory.high));
4535                 unsigned long used = page_counter_read(&memcg->memory);
4536
4537                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
4538                 memcg = parent;
4539         }
4540 }
4541
4542 /*
4543  * Foreign dirty flushing
4544  *
4545  * There's an inherent mismatch between memcg and writeback.  The former
4546  * tracks ownership per-page while the latter per-inode.  This was a
4547  * deliberate design decision because honoring per-page ownership in the
4548  * writeback path is complicated, may lead to higher CPU and IO overheads
4549  * and deemed unnecessary given that write-sharing an inode across
4550  * different cgroups isn't a common use-case.
4551  *
4552  * Combined with inode majority-writer ownership switching, this works well
4553  * enough in most cases but there are some pathological cases.  For
4554  * example, let's say there are two cgroups A and B which keep writing to
4555  * different but confined parts of the same inode.  B owns the inode and
4556  * A's memory is limited far below B's.  A's dirty ratio can rise enough to
4557  * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
4558  * triggering background writeback.  A will be slowed down without a way to
4559  * make writeback of the dirty pages happen.
4560  *
4561  * Conditions like the above can lead to a cgroup getting repeatedly and
4562  * severely throttled after making some progress after each
4563  * dirty_expire_interval while the underlying IO device is almost
4564  * completely idle.
4565  *
4566  * Solving this problem completely requires matching the ownership tracking
4567  * granularities between memcg and writeback in either direction.  However,
4568  * the more egregious behaviors can be avoided by simply remembering the
4569  * most recent foreign dirtying events and initiating remote flushes on
4570  * them when local writeback isn't enough to keep the memory clean enough.
4571  *
4572  * The following two functions implement such mechanism.  When a foreign
4573  * page - a page whose memcg and writeback ownerships don't match - is
4574  * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
4575  * bdi_writeback on the page owning memcg.  When balance_dirty_pages()
4576  * decides that the memcg needs to sleep due to high dirty ratio, it calls
4577  * mem_cgroup_flush_foreign() which queues writeback on the recorded
4578  * foreign bdi_writebacks which haven't expired.  Both the numbers of
4579  * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
4580  * limited to MEMCG_CGWB_FRN_CNT.
4581  *
4582  * The mechanism only remembers IDs and doesn't hold any object references.
4583  * As being wrong occasionally doesn't matter, updates and accesses to the
4584  * records are lockless and racy.
4585  */
4586 void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
4587                                              struct bdi_writeback *wb)
4588 {
4589         struct mem_cgroup *memcg = folio_memcg(folio);
4590         struct memcg_cgwb_frn *frn;
4591         u64 now = get_jiffies_64();
4592         u64 oldest_at = now;
4593         int oldest = -1;
4594         int i;
4595
4596         trace_track_foreign_dirty(folio, wb);
4597
4598         /*
4599          * Pick the slot to use.  If there is already a slot for @wb, keep
4600          * using it.  If not replace the oldest one which isn't being
4601          * written out.
4602          */
4603         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4604                 frn = &memcg->cgwb_frn[i];
4605                 if (frn->bdi_id == wb->bdi->id &&
4606                     frn->memcg_id == wb->memcg_css->id)
4607                         break;
4608                 if (time_before64(frn->at, oldest_at) &&
4609                     atomic_read(&frn->done.cnt) == 1) {
4610                         oldest = i;
4611                         oldest_at = frn->at;
4612                 }
4613         }
4614
4615         if (i < MEMCG_CGWB_FRN_CNT) {
4616                 /*
4617                  * Re-using an existing one.  Update timestamp lazily to
4618                  * avoid making the cacheline hot.  We want them to be
4619                  * reasonably up-to-date and significantly shorter than
4620                  * dirty_expire_interval as that's what expires the record.
4621                  * Use the shorter of 1s and dirty_expire_interval / 8.
4622                  */
4623                 unsigned long update_intv =
4624                         min_t(unsigned long, HZ,
4625                               msecs_to_jiffies(dirty_expire_interval * 10) / 8);
4626
4627                 if (time_before64(frn->at, now - update_intv))
4628                         frn->at = now;
4629         } else if (oldest >= 0) {
4630                 /* replace the oldest free one */
4631                 frn = &memcg->cgwb_frn[oldest];
4632                 frn->bdi_id = wb->bdi->id;
4633                 frn->memcg_id = wb->memcg_css->id;
4634                 frn->at = now;
4635         }
4636 }
4637
4638 /* issue foreign writeback flushes for recorded foreign dirtying events */
4639 void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
4640 {
4641         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
4642         unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
4643         u64 now = jiffies_64;
4644         int i;
4645
4646         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
4647                 struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
4648
4649                 /*
4650                  * If the record is older than dirty_expire_interval,
4651                  * writeback on it has already started.  No need to kick it
4652                  * off again.  Also, don't start a new one if there's
4653                  * already one in flight.
4654                  */
4655                 if (time_after64(frn->at, now - intv) &&
4656                     atomic_read(&frn->done.cnt) == 1) {
4657                         frn->at = 0;
4658                         trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
4659                         cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id,
4660                                                WB_REASON_FOREIGN_FLUSH,
4661                                                &frn->done);
4662                 }
4663         }
4664 }
4665
4666 #else   /* CONFIG_CGROUP_WRITEBACK */
4667
/* !CONFIG_CGROUP_WRITEBACK stub: nothing to set up, always returns 0. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
	return 0;
}
4672
/* !CONFIG_CGROUP_WRITEBACK stub: nothing to tear down. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}
4676
/* !CONFIG_CGROUP_WRITEBACK stub: there is no domain to notify. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
4680
4681 #endif  /* CONFIG_CGROUP_WRITEBACK */
4682
4683 /*
4684  * DO NOT USE IN NEW FILES.
4685  *
4686  * "cgroup.event_control" implementation.
4687  *
4688  * This is way over-engineered.  It tries to support fully configurable
4689  * events for each user.  Such level of flexibility is completely
4690  * unnecessary especially in the light of the planned unified hierarchy.
4691  *
4692  * Please deprecate this and replace with something simpler if at all
4693  * possible.
4694  */
4695
4696 /*
4697  * Unregister event and free resources.
4698  *
4699  * Gets called from workqueue.
4700  */
4701 static void memcg_event_remove(struct work_struct *work)
4702 {
4703         struct mem_cgroup_event *event =
4704                 container_of(work, struct mem_cgroup_event, remove);
4705         struct mem_cgroup *memcg = event->memcg;
4706
4707         remove_wait_queue(event->wqh, &event->wait);
4708
4709         event->unregister_event(memcg, event->eventfd);
4710
4711         /* Notify userspace the event is going away. */
4712         eventfd_signal(event->eventfd, 1);
4713
4714         eventfd_ctx_put(event->eventfd);
4715         kfree(event);
4716         css_put(&memcg->css);
4717 }
4718
4719 /*
4720  * Gets called on EPOLLHUP on eventfd when user closes it.
4721  *
4722  * Called with wqh->lock held and interrupts disabled.
4723  */
4724 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
4725                             int sync, void *key)
4726 {
4727         struct mem_cgroup_event *event =
4728                 container_of(wait, struct mem_cgroup_event, wait);
4729         struct mem_cgroup *memcg = event->memcg;
4730         __poll_t flags = key_to_poll(key);
4731
4732         if (flags & EPOLLHUP) {
4733                 /*
4734                  * If the event has been detached at cgroup removal, we
4735                  * can simply return knowing the other side will cleanup
4736                  * for us.
4737                  *
4738                  * We can't race against event freeing since the other
4739                  * side will require wqh->lock via remove_wait_queue(),
4740                  * which we hold.
4741                  */
4742                 spin_lock(&memcg->event_list_lock);
4743                 if (!list_empty(&event->list)) {
4744                         list_del_init(&event->list);
4745                         /*
4746                          * We are in atomic context, but cgroup_event_remove()
4747                          * may sleep, so we have to call it in workqueue.
4748                          */
4749                         schedule_work(&event->remove);
4750                 }
4751                 spin_unlock(&memcg->event_list_lock);
4752         }
4753
4754         return 0;
4755 }
4756
4757 static void memcg_event_ptable_queue_proc(struct file *file,
4758                 wait_queue_head_t *wqh, poll_table *pt)
4759 {
4760         struct mem_cgroup_event *event =
4761                 container_of(pt, struct mem_cgroup_event, pt);
4762
4763         event->wqh = wqh;
4764         add_wait_queue(wqh, &event->wait);
4765 }
4766
4767 /*
4768  * DO NOT USE IN NEW FILES.
4769  *
4770  * Parse input and register new cgroup event handler.
4771  *
4772  * Input must be in format '<event_fd> <control_fd> <args>'.
4773  * Interpretation of args is defined by control file implementation.
4774  */
4775 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4776                                          char *buf, size_t nbytes, loff_t off)
4777 {
4778         struct cgroup_subsys_state *css = of_css(of);
4779         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4780         struct mem_cgroup_event *event;
4781         struct cgroup_subsys_state *cfile_css;
4782         unsigned int efd, cfd;
4783         struct fd efile;
4784         struct fd cfile;
4785         const char *name;
4786         char *endp;
4787         int ret;
4788
4789         if (IS_ENABLED(CONFIG_PREEMPT_RT))
4790                 return -EOPNOTSUPP;
4791
4792         buf = strstrip(buf);
4793
4794         efd = simple_strtoul(buf, &endp, 10);
4795         if (*endp != ' ')
4796                 return -EINVAL;
4797         buf = endp + 1;
4798
4799         cfd = simple_strtoul(buf, &endp, 10);
4800         if ((*endp != ' ') && (*endp != '\0'))
4801                 return -EINVAL;
4802         buf = endp + 1;
4803
4804         event = kzalloc(sizeof(*event), GFP_KERNEL);
4805         if (!event)
4806                 return -ENOMEM;
4807
4808         event->memcg = memcg;
4809         INIT_LIST_HEAD(&event->list);
4810         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4811         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4812         INIT_WORK(&event->remove, memcg_event_remove);
4813
4814         efile = fdget(efd);
4815         if (!efile.file) {
4816                 ret = -EBADF;
4817                 goto out_kfree;
4818         }
4819
4820         event->eventfd = eventfd_ctx_fileget(efile.file);
4821         if (IS_ERR(event->eventfd)) {
4822                 ret = PTR_ERR(event->eventfd);
4823                 goto out_put_efile;
4824         }
4825
4826         cfile = fdget(cfd);
4827         if (!cfile.file) {
4828                 ret = -EBADF;
4829                 goto out_put_eventfd;
4830         }
4831
4832         /* the process need read permission on control file */
4833         /* AV: shouldn't we check that it's been opened for read instead? */
4834         ret = file_permission(cfile.file, MAY_READ);
4835         if (ret < 0)
4836                 goto out_put_cfile;
4837
4838         /*
4839          * Determine the event callbacks and set them in @event.  This used
4840          * to be done via struct cftype but cgroup core no longer knows
4841          * about these events.  The following is crude but the whole thing
4842          * is for compatibility anyway.
4843          *
4844          * DO NOT ADD NEW FILES.
4845          */
4846         name = cfile.file->f_path.dentry->d_name.name;
4847
4848         if (!strcmp(name, "memory.usage_in_bytes")) {
4849                 event->register_event = mem_cgroup_usage_register_event;
4850                 event->unregister_event = mem_cgroup_usage_unregister_event;
4851         } else if (!strcmp(name, "memory.oom_control")) {
4852                 event->register_event = mem_cgroup_oom_register_event;
4853                 event->unregister_event = mem_cgroup_oom_unregister_event;
4854         } else if (!strcmp(name, "memory.pressure_level")) {
4855                 event->register_event = vmpressure_register_event;
4856                 event->unregister_event = vmpressure_unregister_event;
4857         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4858                 event->register_event = memsw_cgroup_usage_register_event;
4859                 event->unregister_event = memsw_cgroup_usage_unregister_event;
4860         } else {
4861                 ret = -EINVAL;
4862                 goto out_put_cfile;
4863         }
4864
4865         /*
4866          * Verify @cfile should belong to @css.  Also, remaining events are
4867          * automatically removed on cgroup destruction but the removal is
4868          * asynchronous, so take an extra ref on @css.
4869          */
4870         cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4871                                                &memory_cgrp_subsys);
4872         ret = -EINVAL;
4873         if (IS_ERR(cfile_css))
4874                 goto out_put_cfile;
4875         if (cfile_css != css) {
4876                 css_put(cfile_css);
4877                 goto out_put_cfile;
4878         }
4879
4880         ret = event->register_event(memcg, event->eventfd, buf);
4881         if (ret)
4882                 goto out_put_css;
4883
4884         vfs_poll(efile.file, &event->pt);
4885
4886         spin_lock_irq(&memcg->event_list_lock);
4887         list_add(&event->list, &memcg->event_list);
4888         spin_unlock_irq(&memcg->event_list_lock);
4889
4890         fdput(cfile);
4891         fdput(efile);
4892
4893         return nbytes;
4894
4895 out_put_css:
4896         css_put(css);
4897 out_put_cfile:
4898         fdput(cfile);
4899 out_put_eventfd:
4900         eventfd_ctx_put(event->eventfd);
4901 out_put_efile:
4902         fdput(efile);
4903 out_kfree:
4904         kfree(event);
4905
4906         return ret;
4907 }
4908
4909 #if defined(CONFIG_MEMCG_KMEM) && (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
/*
 * seq_show handler wired to the "kmem.slabinfo" legacy file.  Emits
 * nothing and reports success.
 */
static int mem_cgroup_slab_show(struct seq_file *m, void *p)
{
	/*
	 * Deprecated.
	 * Please, take a look at tools/cgroup/slabinfo.py .
	 */
	return 0;
}
4918 #endif
4919
4920 static struct cftype mem_cgroup_legacy_files[] = {
4921         {
4922                 .name = "usage_in_bytes",
4923                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4924                 .read_u64 = mem_cgroup_read_u64,
4925         },
4926         {
4927                 .name = "max_usage_in_bytes",
4928                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4929                 .write = mem_cgroup_reset,
4930                 .read_u64 = mem_cgroup_read_u64,
4931         },
4932         {
4933                 .name = "limit_in_bytes",
4934                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4935                 .write = mem_cgroup_write,
4936                 .read_u64 = mem_cgroup_read_u64,
4937         },
4938         {
4939                 .name = "soft_limit_in_bytes",
4940                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4941                 .write = mem_cgroup_write,
4942                 .read_u64 = mem_cgroup_read_u64,
4943         },
4944         {
4945                 .name = "failcnt",
4946                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4947                 .write = mem_cgroup_reset,
4948                 .read_u64 = mem_cgroup_read_u64,
4949         },
4950         {
4951                 .name = "stat",
4952                 .seq_show = memcg_stat_show,
4953         },
4954         {
4955                 .name = "force_empty",
4956                 .write = mem_cgroup_force_empty_write,
4957         },
4958         {
4959                 .name = "use_hierarchy",
4960                 .write_u64 = mem_cgroup_hierarchy_write,
4961                 .read_u64 = mem_cgroup_hierarchy_read,
4962         },
4963         {
4964                 .name = "cgroup.event_control",         /* XXX: for compat */
4965                 .write = memcg_write_event_control,
4966                 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4967         },
4968         {
4969                 .name = "swappiness",
4970                 .read_u64 = mem_cgroup_swappiness_read,
4971                 .write_u64 = mem_cgroup_swappiness_write,
4972         },
4973         {
4974                 .name = "move_charge_at_immigrate",
4975                 .read_u64 = mem_cgroup_move_charge_read,
4976                 .write_u64 = mem_cgroup_move_charge_write,
4977         },
4978         {
4979                 .name = "oom_control",
4980                 .seq_show = mem_cgroup_oom_control_read,
4981                 .write_u64 = mem_cgroup_oom_control_write,
4982                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4983         },
4984         {
4985                 .name = "pressure_level",
4986         },
4987 #ifdef CONFIG_NUMA
4988         {
4989                 .name = "numa_stat",
4990                 .seq_show = memcg_numa_stat_show,
4991         },
4992 #endif
4993         {
4994                 .name = "kmem.limit_in_bytes",
4995                 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4996                 .write = mem_cgroup_write,
4997                 .read_u64 = mem_cgroup_read_u64,
4998         },
4999         {
5000                 .name = "kmem.usage_in_bytes",
5001                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
5002                 .read_u64 = mem_cgroup_read_u64,
5003         },
5004         {
5005                 .name = "kmem.failcnt",
5006                 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
5007                 .write = mem_cgroup_reset,
5008                 .read_u64 = mem_cgroup_read_u64,
5009         },
5010         {
5011                 .name = "kmem.max_usage_in_bytes",
5012                 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
5013                 .write = mem_cgroup_reset,
5014                 .read_u64 = mem_cgroup_read_u64,
5015         },
5016 #if defined(CONFIG_MEMCG_KMEM) && \
5017         (defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
5018         {
5019                 .name = "kmem.slabinfo",
5020                 .seq_show = mem_cgroup_slab_show,
5021         },
5022 #endif
5023         {
5024                 .name = "kmem.tcp.limit_in_bytes",
5025                 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
5026                 .write = mem_cgroup_write,
5027                 .read_u64 = mem_cgroup_read_u64,
5028         },
5029         {
5030                 .name = "kmem.tcp.usage_in_bytes",
5031                 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
5032                 .read_u64 = mem_cgroup_read_u64,
5033         },
5034         {
5035                 .name = "kmem.tcp.failcnt",
5036                 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
5037                 .write = mem_cgroup_reset,
5038                 .read_u64 = mem_cgroup_read_u64,
5039         },
5040         {
5041                 .name = "kmem.tcp.max_usage_in_bytes",
5042                 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
5043                 .write = mem_cgroup_reset,
5044                 .read_u64 = mem_cgroup_read_u64,
5045         },
5046         { },    /* terminate */
5047 };
5048
/*
 * Private memory cgroup IDR
 *
 * Swap-out records and page cache shadow entries need to store memcg
 * references in constrained space, so we maintain an ID space that is
 * limited to 16 bits (MEM_CGROUP_ID_MAX), limiting the total number of
 * memory-controlled cgroups to 64k.
 *
 * However, there usually are many references to the offline CSS after
 * the cgroup has been destroyed, such as page cache or reclaimable
 * slab objects, that don't need to hang on to the ID. We want to keep
 * those dead CSS from occupying IDs, or we might quickly exhaust the
 * relatively small ID space and prevent the creation of new cgroups
 * even when there are far fewer than 64k cgroups - possibly none.
 *
 * Maintain a private 16-bit ID space for memcg, and allow the ID to
 * be freed and recycled when it's no longer needed, which is usually
 * when the CSS is offlined.
 *
 * The only exception to that are records of swapped out tmpfs/shmem
 * pages that need to be attributed to live ancestors on swapin. But
 * those references are manageable from userspace.
 *
 * IDs are reserved in mem_cgroup_alloc(), looked up through
 * mem_cgroup_from_id() and released by mem_cgroup_id_remove().
 */

static DEFINE_IDR(mem_cgroup_idr);
5074
5075 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
5076 {
5077         if (memcg->id.id > 0) {
5078                 idr_remove(&mem_cgroup_idr, memcg->id.id);
5079                 memcg->id.id = 0;
5080         }
5081 }
5082
/*
 * Take @n additional references on the private ID of @memcg.
 *
 * Marked __maybe_unused, so some configurations presumably have no
 * caller for it.
 */
static void __maybe_unused mem_cgroup_id_get_many(struct mem_cgroup *memcg,
                                                  unsigned int n)
{
        refcount_add(n, &memcg->id.ref);
}
5088
5089 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
5090 {
5091         if (refcount_sub_and_test(n, &memcg->id.ref)) {
5092                 mem_cgroup_id_remove(memcg);
5093
5094                 /* Memcg ID pins CSS */
5095                 css_put(&memcg->css);
5096         }
5097 }
5098
/* Drop a single reference on the private ID of @memcg. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
        mem_cgroup_id_put_many(memcg, 1);
}
5103
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock(); a one-time warning is emitted if it
 * does not.
 *
 * Return: the entry stored for @id in mem_cgroup_idr, as reported by
 * idr_find().
 */
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return idr_find(&mem_cgroup_idr, id);
}
5115
5116 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5117 {
5118         struct mem_cgroup_per_node *pn;
5119         int tmp = node;
5120         /*
5121          * This routine is called against possible nodes.
5122          * But it's BUG to call kmalloc() against offline node.
5123          *
5124          * TODO: this routine can waste much memory for nodes which will
5125          *       never be onlined. It's better to use memory hotplug callback
5126          *       function.
5127          */
5128         if (!node_state(node, N_NORMAL_MEMORY))
5129                 tmp = -1;
5130         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
5131         if (!pn)
5132                 return 1;
5133
5134         pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5135                                                    GFP_KERNEL_ACCOUNT);
5136         if (!pn->lruvec_stats_percpu) {
5137                 kfree(pn);
5138                 return 1;
5139         }
5140
5141         lruvec_init(&pn->lruvec);
5142         pn->memcg = memcg;
5143
5144         memcg->nodeinfo[node] = pn;
5145         return 0;
5146 }
5147
5148 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
5149 {
5150         struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
5151
5152         if (!pn)
5153                 return;
5154
5155         free_percpu(pn->lruvec_stats_percpu);
5156         kfree(pn);
5157 }
5158
5159 static void __mem_cgroup_free(struct mem_cgroup *memcg)
5160 {
5161         int node;
5162
5163         for_each_node(node)
5164                 free_mem_cgroup_per_node_info(memcg, node);
5165         free_percpu(memcg->vmstats_percpu);
5166         kfree(memcg);
5167 }
5168
/*
 * Tear down the writeback domain of @memcg and then free the memcg
 * itself, including its per-node and per-cpu state.
 */
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
        memcg_wb_domain_exit(memcg);
        __mem_cgroup_free(memcg);
}
5174
5175 static struct mem_cgroup *mem_cgroup_alloc(void)
5176 {
5177         struct mem_cgroup *memcg;
5178         int node;
5179         int __maybe_unused i;
5180         long error = -ENOMEM;
5181
5182         memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
5183         if (!memcg)
5184                 return ERR_PTR(error);
5185
5186         memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
5187                                  1, MEM_CGROUP_ID_MAX,
5188                                  GFP_KERNEL);
5189         if (memcg->id.id < 0) {
5190                 error = memcg->id.id;
5191                 goto fail;
5192         }
5193
5194         memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
5195                                                  GFP_KERNEL_ACCOUNT);
5196         if (!memcg->vmstats_percpu)
5197                 goto fail;
5198
5199         for_each_node(node)
5200                 if (alloc_mem_cgroup_per_node_info(memcg, node))
5201                         goto fail;
5202
5203         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
5204                 goto fail;
5205
5206         INIT_WORK(&memcg->high_work, high_work_func);
5207         INIT_LIST_HEAD(&memcg->oom_notify);
5208         mutex_init(&memcg->thresholds_lock);
5209         spin_lock_init(&memcg->move_lock);
5210         vmpressure_init(&memcg->vmpressure);
5211         INIT_LIST_HEAD(&memcg->event_list);
5212         spin_lock_init(&memcg->event_list_lock);
5213         memcg->socket_pressure = jiffies;
5214 #ifdef CONFIG_MEMCG_KMEM
5215         memcg->kmemcg_id = -1;
5216         INIT_LIST_HEAD(&memcg->objcg_list);
5217 #endif
5218 #ifdef CONFIG_CGROUP_WRITEBACK
5219         INIT_LIST_HEAD(&memcg->cgwb_list);
5220         for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
5221                 memcg->cgwb_frn[i].done =
5222                         __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
5223 #endif
5224 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5225         spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
5226         INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
5227         memcg->deferred_split_queue.split_queue_len = 0;
5228 #endif
5229         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
5230         return memcg;
5231 fail:
5232         mem_cgroup_id_remove(memcg);
5233         __mem_cgroup_free(memcg);
5234         return ERR_PTR(error);
5235 }
5236
5237 static struct cgroup_subsys_state * __ref
5238 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
5239 {
5240         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
5241         struct mem_cgroup *memcg, *old_memcg;
5242
5243         old_memcg = set_active_memcg(parent);
5244         memcg = mem_cgroup_alloc();
5245         set_active_memcg(old_memcg);
5246         if (IS_ERR(memcg))
5247                 return ERR_CAST(memcg);
5248
5249         page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
5250         memcg->soft_limit = PAGE_COUNTER_MAX;
5251         page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
5252         if (parent) {
5253                 memcg->swappiness = mem_cgroup_swappiness(parent);
5254                 memcg->oom_kill_disable = parent->oom_kill_disable;
5255
5256                 page_counter_init(&memcg->memory, &parent->memory);
5257                 page_counter_init(&memcg->swap, &parent->swap);
5258                 page_counter_init(&memcg->kmem, &parent->kmem);
5259                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
5260         } else {
5261                 page_counter_init(&memcg->memory, NULL);
5262                 page_counter_init(&memcg->swap, NULL);
5263                 page_counter_init(&memcg->kmem, NULL);
5264                 page_counter_init(&memcg->tcpmem, NULL);
5265
5266                 root_mem_cgroup = memcg;
5267                 return &memcg->css;
5268         }
5269
5270         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
5271                 static_branch_inc(&memcg_sockets_enabled_key);
5272
5273         return &memcg->css;
5274 }
5275
/*
 * Bring the memcg behind @css online: set up kmem accounting and the
 * shrinker info, then take the initial ID reference and the css
 * reference that the ID holds.
 *
 * Returns 0 on success. On failure the steps already taken are undone,
 * the memcg ID is removed and -ENOMEM is returned.
 */
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        if (memcg_online_kmem(memcg))
                goto remove_id;

        /*
         * A memcg must be visible for expand_shrinker_info()
         * by the time the maps are allocated. So, we allocate maps
         * here, when for_each_mem_cgroup() can't skip it.
         */
        if (alloc_shrinker_info(memcg))
                goto offline_kmem;

        /* Online state pins memcg ID, memcg ID pins CSS */
        refcount_set(&memcg->id.ref, 1);
        css_get(css);

        /* Only the root memcg queues the delayed stats flush work. */
        if (unlikely(mem_cgroup_is_root(memcg)))
                queue_delayed_work(system_unbound_wq, &stats_flush_dwork,
                                   2UL*HZ);
        return 0;
offline_kmem:
        memcg_offline_kmem(memcg);
remove_id:
        mem_cgroup_id_remove(memcg);
        return -ENOMEM;
}
5305
/*
 * Take the memcg behind @css offline: schedule removal of its
 * registered events, clear its min/low protection, offline kmem,
 * shrinker and writeback state, drain the charge stocks, and drop the
 * ID reference that the online state was holding.
 */
static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup_event *event, *tmp;

        /*
         * Unregister events and notify userspace.
         * Notify userspace about cgroup removing only after rmdir of cgroup
         * directory to avoid race between userspace and kernelspace.
         */
        spin_lock_irq(&memcg->event_list_lock);
        list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
                list_del_init(&event->list);
                schedule_work(&event->remove);
        }
        spin_unlock_irq(&memcg->event_list_lock);

        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);

        memcg_offline_kmem(memcg);
        reparent_shrinker_deferred(memcg);
        wb_memcg_offline(memcg);

        drain_all_stock(memcg);

        /* Pairs with the refcount_set() in mem_cgroup_css_online(). */
        mem_cgroup_id_put(memcg);
}
5334
/* Invalidate the reclaim iterators for the memcg behind @css. */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        invalidate_reclaim_iterators(memcg);
}
5341
/*
 * Final teardown of the memcg behind @css: wait for pending foreign
 * writeback completions, drop the socket static-branch reference(s),
 * stop vmpressure and high_work activity, and free the memcg.
 */
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        int __maybe_unused i;

#ifdef CONFIG_CGROUP_WRITEBACK
        for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
                wb_wait_for_completion(&memcg->cgwb_frn[i].done);
#endif
        /* Same condition as the static_branch_inc() in mem_cgroup_css_alloc(). */
        if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
                static_branch_dec(&memcg_sockets_enabled_key);

        /* Off the default hierarchy the key is only held while tcpmem is active. */
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
                static_branch_dec(&memcg_sockets_enabled_key);

        vmpressure_cleanup(&memcg->vmpressure);
        cancel_work_sync(&memcg->high_work);
        mem_cgroup_remove_from_trees(memcg);
        free_shrinker_info(memcg);
        mem_cgroup_free(memcg);
}
5363
/**
 * mem_cgroup_css_reset - reset the states of a mem_cgroup
 * @css: the target css
 *
 * Reset the states of the mem_cgroup associated with @css.  This is
 * invoked when the userland requests disabling on the default hierarchy
 * but the memcg is pinned through dependency.  The memcg should stop
 * applying policies and should revert to the vanilla state as it may be
 * made visible again.
 *
 * The current implementation only resets the essential configurations.
 * This needs to be expanded to cover all the visible parts.
 */
static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);

        /* Lift the hard limits on every counter. */
        page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
        page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
        /* Drop memory protection and restore the high/soft defaults. */
        page_counter_set_min(&memcg->memory, 0);
        page_counter_set_low(&memcg->memory, 0);
        page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
        memcg->soft_limit = PAGE_COUNTER_MAX;
        page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
        /* Tell the writeback domain that the limits changed. */
        memcg_wb_domain_size_changed(memcg);
}
5392
/*
 * Fold the statistics of one CPU into the memcg behind @css and queue
 * the resulting deltas for its parent.
 *
 * For each counter the delta is the sum of what child groups left in
 * the *_pending slot and the change of @cpu's per-cpu value since it
 * was last recorded in the matching *_prev slot. The delta is added to
 * this memcg's aggregate and to the parent's *_pending slot, if there
 * is a parent. The same scheme is applied to memcg state, memcg events
 * and the per-node lruvec state.
 */
static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(css);
        struct mem_cgroup *parent = parent_mem_cgroup(memcg);
        struct memcg_vmstats_percpu *statc;
        long delta, v;
        int i, nid;

        statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);

        for (i = 0; i < MEMCG_NR_STAT; i++) {
                /*
                 * Collect the aggregated propagation counts of groups
                 * below us. We're in a per-cpu loop here and this is
                 * a global counter, so the first cycle will get them.
                 */
                delta = memcg->vmstats.state_pending[i];
                if (delta)
                        memcg->vmstats.state_pending[i] = 0;

                /* Add CPU changes on this level since the last flush */
                v = READ_ONCE(statc->state[i]);
                if (v != statc->state_prev[i]) {
                        delta += v - statc->state_prev[i];
                        statc->state_prev[i] = v;
                }

                if (!delta)
                        continue;

                /* Aggregate counts on this level and propagate upwards */
                memcg->vmstats.state[i] += delta;
                if (parent)
                        parent->vmstats.state_pending[i] += delta;
        }

        /* Same pending + per-cpu delta scheme for the event counters. */
        for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
                delta = memcg->vmstats.events_pending[i];
                if (delta)
                        memcg->vmstats.events_pending[i] = 0;

                v = READ_ONCE(statc->events[i]);
                if (v != statc->events_prev[i]) {
                        delta += v - statc->events_prev[i];
                        statc->events_prev[i] = v;
                }

                if (!delta)
                        continue;

                memcg->vmstats.events[i] += delta;
                if (parent)
                        parent->vmstats.events_pending[i] += delta;
        }

        /* And once more for the per-node lruvec state of each memory node. */
        for_each_node_state(nid, N_MEMORY) {
                struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
                struct mem_cgroup_per_node *ppn = NULL;
                struct lruvec_stats_percpu *lstatc;

                if (parent)
                        ppn = parent->nodeinfo[nid];

                lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);

                for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
                        delta = pn->lruvec_stats.state_pending[i];
                        if (delta)
                                pn->lruvec_stats.state_pending[i] = 0;

                        v = READ_ONCE(lstatc->state[i]);
                        if (v != lstatc->state_prev[i]) {
                                delta += v - lstatc->state_prev[i];
                                lstatc->state_prev[i] = v;
                        }

                        if (!delta)
                                continue;

                        pn->lruvec_stats.state[i] += delta;
                        if (ppn)
                                ppn->lruvec_stats.state_pending[i] += delta;
                }
        }
}
5478
5479 #ifdef CONFIG_MMU
5480 /* Handlers for move charge at task migration. */
5481 static int mem_cgroup_do_precharge(unsigned long count)
5482 {
5483         int ret;
5484
5485         /* Try a single bulk charge without reclaim first, kswapd may wake */
5486         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
5487         if (!ret) {
5488                 mc.precharge += count;
5489                 return ret;
5490         }
5491
5492         /* Try charges one by one with reclaim, but do not retry */
5493         while (count--) {
5494                 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
5495                 if (ret)
5496                         return ret;
5497                 mc.precharge++;
5498                 cond_resched();
5499         }
5500         return 0;
5501 }
5502
/*
 * What get_mctgt_type()/get_mctgt_type_thp() hand back to the caller:
 * a page for the page-type targets, or a swap entry for MC_TARGET_SWAP.
 */
union mc_target {
        struct page     *page;  /* target page, returned with a reference held */
        swp_entry_t     ent;    /* target swap entry */
};
5507
/* Classification of a pte/pmd for move charge; see get_mctgt_type(). */
enum mc_target_type {
        MC_TARGET_NONE = 0,     /* not a target for move charge */
        MC_TARGET_PAGE,         /* the mapped page is a target */
        MC_TARGET_SWAP,         /* the swap entry is a target */
        MC_TARGET_DEVICE,       /* like MC_TARGET_PAGE, but a device private page */
};
5514
5515 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
5516                                                 unsigned long addr, pte_t ptent)
5517 {
5518         struct page *page = vm_normal_page(vma, addr, ptent);
5519
5520         if (!page || !page_mapped(page))
5521                 return NULL;
5522         if (PageAnon(page)) {
5523                 if (!(mc.flags & MOVE_ANON))
5524                         return NULL;
5525         } else {
5526                 if (!(mc.flags & MOVE_FILE))
5527                         return NULL;
5528         }
5529         if (!get_page_unless_zero(page))
5530                 return NULL;
5531
5532         return page;
5533 }
5534
5535 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
5536 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5537                         pte_t ptent, swp_entry_t *entry)
5538 {
5539         struct page *page = NULL;
5540         swp_entry_t ent = pte_to_swp_entry(ptent);
5541
5542         if (!(mc.flags & MOVE_ANON))
5543                 return NULL;
5544
5545         /*
5546          * Handle MEMORY_DEVICE_PRIVATE which are ZONE_DEVICE page belonging to
5547          * a device and because they are not accessible by CPU they are store
5548          * as special swap entry in the CPU page table.
5549          */
5550         if (is_device_private_entry(ent)) {
5551                 page = pfn_swap_entry_to_page(ent);
5552                 /*
5553                  * MEMORY_DEVICE_PRIVATE means ZONE_DEVICE page and which have
5554                  * a refcount of 1 when free (unlike normal page)
5555                  */
5556                 if (!page_ref_add_unless(page, 1, 1))
5557                         return NULL;
5558                 return page;
5559         }
5560
5561         if (non_swap_entry(ent))
5562                 return NULL;
5563
5564         /*
5565          * Because lookup_swap_cache() updates some statistics counter,
5566          * we call find_get_page() with swapper_space directly.
5567          */
5568         page = find_get_page(swap_address_space(ent), swp_offset(ent));
5569         entry->val = ent.val;
5570
5571         return page;
5572 }
5573 #else
5574 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
5575                         pte_t ptent, swp_entry_t *entry)
5576 {
5577         return NULL;
5578 }
5579 #endif
5580
5581 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
5582                         unsigned long addr, pte_t ptent)
5583 {
5584         if (!vma->vm_file) /* anonymous vma */
5585                 return NULL;
5586         if (!(mc.flags & MOVE_FILE))
5587                 return NULL;
5588
5589         /* page is moved even if it's not RSS of this task(page-faulted). */
5590         /* shmem/tmpfs may report page out on swap: account for that too. */
5591         return find_get_incore_page(vma->vm_file->f_mapping,
5592                         linear_page_index(vma, addr));
5593 }
5594
/**
 * mem_cgroup_move_account - move account of the page
 * @page: the page
 * @compound: charge the page as compound or small page
 * @from: mem_cgroup which the page is moved from.
 * @to: mem_cgroup which the page is moved to. @from != @to.
 *
 * The caller must make sure the page is not on LRU (isolate_page() is useful.)
 *
 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
 * from old cgroup.
 *
 * Return: 0 on success, -EBUSY if the folio lock could not be taken, or
 * -EINVAL if the folio is not charged to @from.
 */
static int mem_cgroup_move_account(struct page *page,
                                   bool compound,
                                   struct mem_cgroup *from,
                                   struct mem_cgroup *to)
{
        struct folio *folio = page_folio(page);
        struct lruvec *from_vec, *to_vec;
        struct pglist_data *pgdat;
        unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1;
        int nid, ret;

        VM_BUG_ON(from == to);
        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
        VM_BUG_ON(compound && !folio_test_large(folio));

        /*
         * Prevent mem_cgroup_migrate() from looking at
         * page's memory cgroup of its source page while we change it.
         */
        ret = -EBUSY;
        if (!folio_trylock(folio))
                goto out;

        ret = -EINVAL;
        if (folio_memcg(folio) != from)
                goto out_unlock;

        pgdat = folio_pgdat(folio);
        from_vec = mem_cgroup_lruvec(from, pgdat);
        to_vec = mem_cgroup_lruvec(to, pgdat);

        folio_memcg_lock(folio);

        /* Transfer the per-lruvec state counters that apply to this folio. */
        if (folio_test_anon(folio)) {
                if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
                        if (folio_test_transhuge(folio)) {
                                __mod_lruvec_state(from_vec, NR_ANON_THPS,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_ANON_THPS,
                                                   nr_pages);
                        }
                }
        } else {
                __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
                __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);

                if (folio_test_swapbacked(folio)) {
                        __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
                }

                if (folio_mapped(folio)) {
                        __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
                        __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
                }

                if (folio_test_dirty(folio)) {
                        struct address_space *mapping = folio_mapping(folio);

                        if (mapping_can_writeback(mapping)) {
                                __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
                                                   -nr_pages);
                                __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
                                                   nr_pages);
                        }
                }
        }

        if (folio_test_writeback(folio)) {
                __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
                __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
        }

        /*
         * All state has been migrated, let's switch to the new memcg.
         *
         * It is safe to change page's memcg here because the page
         * is referenced, charged, isolated, and locked: we can't race
         * with (un)charging, migration, LRU putback, or anything else
         * that would rely on a stable page's memory cgroup.
         *
         * Note that folio_memcg_lock() is a memcg lock, not a page lock,
         * to save space. As soon as we switch page's memory cgroup to a
         * new memcg that isn't locked, the above state can change
         * concurrently again. Make sure we're truly done with it.
         */
        smp_mb();

        /* The folio's css reference moves from @from to @to. */
        css_get(&to->css);
        css_put(&from->css);

        folio->memcg_data = (unsigned long)to;

        /* Unlock via @from: that is the memcg the lock was taken on. */
        __folio_memcg_unlock(from);

        ret = 0;
        nid = folio_nid(folio);

        local_irq_disable();
        mem_cgroup_charge_statistics(to, nr_pages);
        memcg_check_events(to, nid);
        mem_cgroup_charge_statistics(from, -nr_pages);
        memcg_check_events(from, nid);
        local_irq_enable();
out_unlock:
        folio_unlock(folio);
out:
        return ret;
}
5718
/**
 * get_mctgt_type - get target type of moving charge
 * @vma: the vma the pte to be checked belongs to
 * @addr: the address corresponding to the pte to be checked
 * @ptent: the pte to be checked
 * @target: where the target page or swap entry will be stored (can be NULL)
 *
 * Returns
 *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
 *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
 *     move charge. If @target is not NULL, the page is stored in target->page
 *     with an extra reference held (callers must drop it).
 *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
 *     target for charge migration. If @target is not NULL, the entry is stored
 *     in target->ent.
 *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is
 *     MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the lru).
 *     For now such a page is charged like a regular page would be, as for all
 *     intents and purposes it is just special memory taking the place of a
 *     regular page.
 *
 *     See Documentation/vm/hmm.rst and include/linux/hmm.h
 *
 * Called with pte lock held.
 */

static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
                unsigned long addr, pte_t ptent, union mc_target *target)
{
        struct page *page = NULL;
        enum mc_target_type ret = MC_TARGET_NONE;
        swp_entry_t ent = { .val = 0 };

        if (pte_present(ptent))
                page = mc_handle_present_pte(vma, addr, ptent);
        else if (is_swap_pte(ptent))
                page = mc_handle_swap_pte(vma, ptent, &ent);
        else if (pte_none(ptent))
                page = mc_handle_file_pte(vma, addr, ptent);

        if (!page && !ent.val)
                return ret;
        if (page) {
                /*
                 * Do only loose check w/o serialization.
                 * mem_cgroup_move_account() checks the page is valid or
                 * not under LRU exclusion.
                 */
                if (page_memcg(page) == mc.from) {
                        ret = MC_TARGET_PAGE;
                        if (is_device_private_page(page))
                                ret = MC_TARGET_DEVICE;
                        if (target)
                                target->page = page;
                }
                /* Keep the reference only when the page is handed to the caller. */
                if (!ret || !target)
                        put_page(page);
        }
        /*
         * There is a swap entry and a page doesn't exist or isn't charged.
         * But we cannot move a tail-page in a THP.
         */
        if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
            mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                ret = MC_TARGET_SWAP;
                if (target)
                        target->ent = ent;
        }
        return ret;
}
5789
5790 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
5791 /*
5792  * We don't consider PMD mapped swapping or file mapped pages because THP does
5793  * not support them for now.
5794  * Caller should make sure that pmd_trans_huge(pmd) is true.
5795  */
5796 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5797                 unsigned long addr, pmd_t pmd, union mc_target *target)
5798 {
5799         struct page *page = NULL;
5800         enum mc_target_type ret = MC_TARGET_NONE;
5801
5802         if (unlikely(is_swap_pmd(pmd))) {
5803                 VM_BUG_ON(thp_migration_supported() &&
5804                                   !is_pmd_migration_entry(pmd));
5805                 return ret;
5806         }
5807         page = pmd_page(pmd);
5808         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
5809         if (!(mc.flags & MOVE_ANON))
5810                 return ret;
5811         if (page_memcg(page) == mc.from) {
5812                 ret = MC_TARGET_PAGE;
5813                 if (target) {
5814                         get_page(page);
5815                         target->page = page;
5816                 }
5817         }
5818         return ret;
5819 }
5820 #else
5821 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
5822                 unsigned long addr, pmd_t pmd, union mc_target *target)
5823 {
5824         return MC_TARGET_NONE;
5825 }
5826 #endif
5827
/*
 * pmd_entry callback of precharge_walk_ops: count move-charge targets
 * in [@addr, @end) into mc.precharge.
 *
 * A huge pmd classified as MC_TARGET_PAGE contributes HPAGE_PMD_NR;
 * otherwise every pte with a non-NONE target type contributes one.
 * Always returns 0.
 */
static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
                                        unsigned long addr, unsigned long end,
                                        struct mm_walk *walk)
{
        struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /*
                 * Note there can not be MC_TARGET_DEVICE for now as we do not
                 * support transparent huge page with MEMORY_DEVICE_PRIVATE but
                 * this might change.
                 */
                if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                        mc.precharge += HPAGE_PMD_NR;
                spin_unlock(ptl);
                return 0;
        }

        if (pmd_trans_unstable(pmd))
                return 0;
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; pte++, addr += PAGE_SIZE)
                if (get_mctgt_type(vma, addr, *pte, NULL))
                        mc.precharge++; /* increment precharge temporarily */
        /* pte was advanced one past the last entry visited by the loop. */
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        return 0;
}
5860
/* Page-walk operations used by mem_cgroup_count_precharge(). */
static const struct mm_walk_ops precharge_walk_ops = {
        .pmd_entry      = mem_cgroup_count_precharge_pte_range,
};
5864
5865 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
5866 {
5867         unsigned long precharge;
5868
5869         mmap_read_lock(mm);
5870         walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
5871         mmap_read_unlock(mm);
5872
5873         precharge = mc.precharge;
5874         mc.precharge = 0;
5875
5876         return precharge;
5877 }
5878
5879 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5880 {
5881         unsigned long precharge = mem_cgroup_count_precharge(mm);
5882
5883         VM_BUG_ON(mc.moving_task);
5884         mc.moving_task = current;
5885         return mem_cgroup_do_precharge(precharge);
5886 }
5887
5888 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5889 static void __mem_cgroup_clear_mc(void)
5890 {
5891         struct mem_cgroup *from = mc.from;
5892         struct mem_cgroup *to = mc.to;
5893
5894         /* we must uncharge all the leftover precharges from mc.to */
5895         if (mc.precharge) {
5896                 cancel_charge(mc.to, mc.precharge);
5897                 mc.precharge = 0;
5898         }
5899         /*
5900          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5901          * we must uncharge here.
5902          */
5903         if (mc.moved_charge) {
5904                 cancel_charge(mc.from, mc.moved_charge);
5905                 mc.moved_charge = 0;
5906         }
5907         /* we must fixup refcnts and charges */
5908         if (mc.moved_swap) {
5909                 /* uncharge swap account from the old cgroup */
5910                 if (!mem_cgroup_is_root(mc.from))
5911                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5912
5913                 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5914
5915                 /*
5916                  * we charged both to->memory and to->memsw, so we
5917                  * should uncharge to->memory.
5918                  */
5919                 if (!mem_cgroup_is_root(mc.to))
5920                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5921
5922                 mc.moved_swap = 0;
5923         }
5924         memcg_oom_recover(from);
5925         memcg_oom_recover(to);
5926         wake_up_all(&mc.waitq);
5927 }
5928
5929 static void mem_cgroup_clear_mc(void)
5930 {
5931         struct mm_struct *mm = mc.mm;
5932
5933         /*
5934          * we must clear moving_task before waking up waiters at the end of
5935          * task migration.
5936          */
5937         mc.moving_task = NULL;
5938         __mem_cgroup_clear_mc();
5939         spin_lock(&mc.lock);
5940         mc.from = NULL;
5941         mc.to = NULL;
5942         mc.mm = NULL;
5943         spin_unlock(&mc.lock);
5944
5945         mmput(mm);
5946 }
5947
5948 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5949 {
5950         struct cgroup_subsys_state *css;
5951         struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
5952         struct mem_cgroup *from;
5953         struct task_struct *leader, *p;
5954         struct mm_struct *mm;
5955         unsigned long move_flags;
5956         int ret = 0;
5957
5958         /* charge immigration isn't supported on the default hierarchy */
5959         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5960                 return 0;
5961
5962         /*
5963          * Multi-process migrations only happen on the default hierarchy
5964          * where charge immigration is not used.  Perform charge
5965          * immigration if @tset contains a leader and whine if there are
5966          * multiple.
5967          */
5968         p = NULL;
5969         cgroup_taskset_for_each_leader(leader, css, tset) {
5970                 WARN_ON_ONCE(p);
5971                 p = leader;
5972                 memcg = mem_cgroup_from_css(css);
5973         }
5974         if (!p)
5975                 return 0;
5976
5977         /*
5978          * We are now committed to this value whatever it is. Changes in this
5979          * tunable will only affect upcoming migrations, not the current one.
5980          * So we need to save it, and keep it going.
5981          */
5982         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5983         if (!move_flags)
5984                 return 0;
5985
5986         from = mem_cgroup_from_task(p);
5987
5988         VM_BUG_ON(from == memcg);
5989
5990         mm = get_task_mm(p);
5991         if (!mm)
5992                 return 0;
5993         /* We move charges only when we move a owner of the mm */
5994         if (mm->owner == p) {
5995                 VM_BUG_ON(mc.from);
5996                 VM_BUG_ON(mc.to);
5997                 VM_BUG_ON(mc.precharge);
5998                 VM_BUG_ON(mc.moved_charge);
5999                 VM_BUG_ON(mc.moved_swap);
6000
6001                 spin_lock(&mc.lock);
6002                 mc.mm = mm;
6003                 mc.from = from;
6004                 mc.to = memcg;
6005                 mc.flags = move_flags;
6006                 spin_unlock(&mc.lock);
6007                 /* We set mc.moving_task later */
6008
6009                 ret = mem_cgroup_precharge_mc(mm);
6010                 if (ret)
6011                         mem_cgroup_clear_mc();
6012         } else {
6013                 mmput(mm);
6014         }
6015         return ret;
6016 }
6017
6018 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
6019 {
6020         if (mc.to)
6021                 mem_cgroup_clear_mc();
6022 }
6023
/*
 * Page-walk callback (see charge_walk_ops): for each charge-move target
 * mapped under @pmd in [@addr, @end), try to move its charge from mc.from to
 * mc.to, consuming mc.precharge and accumulating mc.moved_charge /
 * mc.moved_swap.
 *
 * Returns 0, or the non-zero result of mem_cgroup_do_precharge() when the
 * precharge ran out before @end and one more page could not be charged.
 */
static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
                                unsigned long addr, unsigned long end,
                                struct mm_walk *walk)
{
        int ret = 0;
        struct vm_area_struct *vma = walk->vma;
        pte_t *pte;
        spinlock_t *ptl;
        enum mc_target_type target_type;
        union mc_target target;
        struct page *page;

        ptl = pmd_trans_huge_lock(pmd, vma);
        if (ptl) {
                /* A huge page needs HPAGE_PMD_NR precharged pages at once. */
                if (mc.precharge < HPAGE_PMD_NR) {
                        spin_unlock(ptl);
                        return 0;
                }
                target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
                if (target_type == MC_TARGET_PAGE) {
                        page = target.page;
                        /* isolate_lru_page() returns 0 on success */
                        if (!isolate_lru_page(page)) {
                                if (!mem_cgroup_move_account(page, true,
                                                             mc.from, mc.to)) {
                                        mc.precharge -= HPAGE_PMD_NR;
                                        mc.moved_charge += HPAGE_PMD_NR;
                                }
                                putback_lru_page(page);
                        }
                        /* drop the reference taken by get_mctgt_type_thp() */
                        put_page(page);
                } else if (target_type == MC_TARGET_DEVICE) {
                        /* same as above, but without LRU isolation */
                        page = target.page;
                        if (!mem_cgroup_move_account(page, true,
                                                     mc.from, mc.to)) {
                                mc.precharge -= HPAGE_PMD_NR;
                                mc.moved_charge += HPAGE_PMD_NR;
                        }
                        put_page(page);
                }
                spin_unlock(ptl);
                return 0;
        }

        if (pmd_trans_unstable(pmd))
                return 0;
retry:
        /* (re)map the PTE for the current @addr; @addr persists across retries */
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        for (; addr != end; addr += PAGE_SIZE) {
                /* pte is advanced before any break, so pte - 1 below is valid */
                pte_t ptent = *(pte++);
                bool device = false;
                swp_entry_t ent;

                /* out of precharge: leave the loop with addr != end */
                if (!mc.precharge)
                        break;

                switch (get_mctgt_type(vma, addr, ptent, &target)) {
                case MC_TARGET_DEVICE:
                        device = true;
                        fallthrough;
                case MC_TARGET_PAGE:
                        page = target.page;
                        /*
                         * We can have a part of the split pmd here. Moving it
                         * can be done but it would be too convoluted so simply
                         * ignore such a partial THP and keep it in original
                         * memcg. There should be somebody mapping the head.
                         */
                        if (PageTransCompound(page))
                                goto put;
                        /* device pages skip LRU isolation */
                        if (!device && isolate_lru_page(page))
                                goto put;
                        if (!mem_cgroup_move_account(page, false,
                                                mc.from, mc.to)) {
                                mc.precharge--;
                                /* we uncharge from mc.from later. */
                                mc.moved_charge++;
                        }
                        if (!device)
                                putback_lru_page(page);
put:                    /* get_mctgt_type() gets the page */
                        put_page(page);
                        break;
                case MC_TARGET_SWAP:
                        ent = target.ent;
                        if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
                                mc.precharge--;
                                mem_cgroup_id_get_many(mc.to, 1);
                                /* we fixup other refcnts and charges later. */
                                mc.moved_swap++;
                        }
                        break;
                default:
                        break;
                }
        }
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();

        if (addr != end) {
                /*
                 * We have consumed all precharges we got in can_attach().
                 * We try charge one by one, but don't do any additional
                 * charges to mc.to if we have failed in charge once in attach()
                 * phase.
                 */
                ret = mem_cgroup_do_precharge(1);
                if (!ret)
                        goto retry;
        }

        return ret;
}
6136
/* Page-walk operations used by mem_cgroup_move_charge(). */
static const struct mm_walk_ops charge_walk_ops = {
        .pmd_entry      = mem_cgroup_move_charge_pte_range,
};
6140
6141 static void mem_cgroup_move_charge(void)
6142 {
6143         lru_add_drain_all();
6144         /*
6145          * Signal lock_page_memcg() to take the memcg's move_lock
6146          * while we're moving its pages to another memcg. Then wait
6147          * for already started RCU-only updates to finish.
6148          */
6149         atomic_inc(&mc.from->moving_account);
6150         synchronize_rcu();
6151 retry:
6152         if (unlikely(!mmap_read_trylock(mc.mm))) {
6153                 /*
6154                  * Someone who are holding the mmap_lock might be waiting in
6155                  * waitq. So we cancel all extra charges, wake up all waiters,
6156                  * and retry. Because we cancel precharges, we might not be able
6157                  * to move enough charges, but moving charge is a best-effort
6158                  * feature anyway, so it wouldn't be a big problem.
6159                  */
6160                 __mem_cgroup_clear_mc();
6161                 cond_resched();
6162                 goto retry;
6163         }
6164         /*
6165          * When we have consumed all precharges and failed in doing
6166          * additional charge, the page walk just aborts.
6167          */
6168         walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
6169                         NULL);
6170
6171         mmap_read_unlock(mc.mm);
6172         atomic_dec(&mc.from->moving_account);
6173 }
6174
6175 static void mem_cgroup_move_task(void)
6176 {
6177         if (mc.to) {
6178                 mem_cgroup_move_charge();
6179                 mem_cgroup_clear_mc();
6180         }
6181 }
6182 #else   /* !CONFIG_MMU */
/* !CONFIG_MMU: no charge moving, so attaching is always allowed. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
        return 0;
}
/* !CONFIG_MMU: nothing was set up in can_attach, so nothing to cancel. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}
/* !CONFIG_MMU: no charges to move after the attach. */
static void mem_cgroup_move_task(void)
{
}
6193 #endif
6194
6195 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
6196 {
6197         if (value == PAGE_COUNTER_MAX)
6198                 seq_puts(m, "max\n");
6199         else
6200                 seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
6201
6202         return 0;
6203 }
6204
6205 static u64 memory_current_read(struct cgroup_subsys_state *css,
6206                                struct cftype *cft)
6207 {
6208         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6209
6210         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
6211 }
6212
6213 static int memory_min_show(struct seq_file *m, void *v)
6214 {
6215         return seq_puts_memcg_tunable(m,
6216                 READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
6217 }
6218
6219 static ssize_t memory_min_write(struct kernfs_open_file *of,
6220                                 char *buf, size_t nbytes, loff_t off)
6221 {
6222         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6223         unsigned long min;
6224         int err;
6225
6226         buf = strstrip(buf);
6227         err = page_counter_memparse(buf, "max", &min);
6228         if (err)
6229                 return err;
6230
6231         page_counter_set_min(&memcg->memory, min);
6232
6233         return nbytes;
6234 }
6235
6236 static int memory_low_show(struct seq_file *m, void *v)
6237 {
6238         return seq_puts_memcg_tunable(m,
6239                 READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
6240 }
6241
6242 static ssize_t memory_low_write(struct kernfs_open_file *of,
6243                                 char *buf, size_t nbytes, loff_t off)
6244 {
6245         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6246         unsigned long low;
6247         int err;
6248
6249         buf = strstrip(buf);
6250         err = page_counter_memparse(buf, "max", &low);
6251         if (err)
6252                 return err;
6253
6254         page_counter_set_low(&memcg->memory, low);
6255
6256         return nbytes;
6257 }
6258
6259 static int memory_high_show(struct seq_file *m, void *v)
6260 {
6261         return seq_puts_memcg_tunable(m,
6262                 READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
6263 }
6264
6265 static ssize_t memory_high_write(struct kernfs_open_file *of,
6266                                  char *buf, size_t nbytes, loff_t off)
6267 {
6268         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6269         unsigned int nr_retries = MAX_RECLAIM_RETRIES;
6270         bool drained = false;
6271         unsigned long high;
6272         int err;
6273
6274         buf = strstrip(buf);
6275         err = page_counter_memparse(buf, "max", &high);
6276         if (err)
6277                 return err;
6278
6279         page_counter_set_high(&memcg->memory, high);
6280
6281         for (;;) {
6282                 unsigned long nr_pages = page_counter_read(&memcg->memory);
6283                 unsigned long reclaimed;
6284
6285                 if (nr_pages <= high)
6286                         break;
6287
6288                 if (signal_pending(current))
6289                         break;
6290
6291                 if (!drained) {
6292                         drain_all_stock(memcg);
6293                         drained = true;
6294                         continue;
6295                 }
6296
6297                 reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
6298                                                          GFP_KERNEL, true);
6299
6300                 if (!reclaimed && !nr_retries--)
6301                         break;
6302         }
6303
6304         memcg_wb_domain_size_changed(memcg);
6305         return nbytes;
6306 }
6307
6308 static int memory_max_show(struct seq_file *m, void *v)
6309 {
6310         return seq_puts_memcg_tunable(m,
6311                 READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
6312 }
6313
6314 static ssize_t memory_max_write(struct kernfs_open_file *of,
6315                                 char *buf, size_t nbytes, loff_t off)
6316 {
6317         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6318         unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
6319         bool drained = false;
6320         unsigned long max;
6321         int err;
6322
6323         buf = strstrip(buf);
6324         err = page_counter_memparse(buf, "max", &max);
6325         if (err)
6326                 return err;
6327
6328         xchg(&memcg->memory.max, max);
6329
6330         for (;;) {
6331                 unsigned long nr_pages = page_counter_read(&memcg->memory);
6332
6333                 if (nr_pages <= max)
6334                         break;
6335
6336                 if (signal_pending(current))
6337                         break;
6338
6339                 if (!drained) {
6340                         drain_all_stock(memcg);
6341                         drained = true;
6342                         continue;
6343                 }
6344
6345                 if (nr_reclaims) {
6346                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
6347                                                           GFP_KERNEL, true))
6348                                 nr_reclaims--;
6349                         continue;
6350                 }
6351
6352                 memcg_memory_event(memcg, MEMCG_OOM);
6353                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
6354                         break;
6355         }
6356
6357         memcg_wb_domain_size_changed(memcg);
6358         return nbytes;
6359 }
6360
6361 static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
6362 {
6363         seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
6364         seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
6365         seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
6366         seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
6367         seq_printf(m, "oom_kill %lu\n",
6368                    atomic_long_read(&events[MEMCG_OOM_KILL]));
6369         seq_printf(m, "oom_group_kill %lu\n",
6370                    atomic_long_read(&events[MEMCG_OOM_GROUP_KILL]));
6371 }
6372
6373 static int memory_events_show(struct seq_file *m, void *v)
6374 {
6375         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6376
6377         __memory_events_show(m, memcg->memory_events);
6378         return 0;
6379 }
6380
6381 static int memory_events_local_show(struct seq_file *m, void *v)
6382 {
6383         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6384
6385         __memory_events_show(m, memcg->memory_events_local);
6386         return 0;
6387 }
6388
6389 static int memory_stat_show(struct seq_file *m, void *v)
6390 {
6391         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6392         char *buf;
6393
6394         buf = memory_stat_format(memcg);
6395         if (!buf)
6396                 return -ENOMEM;
6397         seq_puts(m, buf);
6398         kfree(buf);
6399         return 0;
6400 }
6401
6402 #ifdef CONFIG_NUMA
/* Value of stat @item for @lruvec, scaled by the item's output unit. */
static inline unsigned long lruvec_page_state_output(struct lruvec *lruvec,
                                                     int item)
{
        unsigned long count = lruvec_page_state(lruvec, item);

        return count * memcg_page_state_unit(item);
}
6408
6409 static int memory_numa_stat_show(struct seq_file *m, void *v)
6410 {
6411         int i;
6412         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6413
6414         mem_cgroup_flush_stats();
6415
6416         for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
6417                 int nid;
6418
6419                 if (memory_stats[i].idx >= NR_VM_NODE_STAT_ITEMS)
6420                         continue;
6421
6422                 seq_printf(m, "%s", memory_stats[i].name);
6423                 for_each_node_state(nid, N_MEMORY) {
6424                         u64 size;
6425                         struct lruvec *lruvec;
6426
6427                         lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
6428                         size = lruvec_page_state_output(lruvec,
6429                                                         memory_stats[i].idx);
6430                         seq_printf(m, " N%d=%llu", nid, size);
6431                 }
6432                 seq_putc(m, '\n');
6433         }
6434
6435         return 0;
6436 }
6437 #endif
6438
6439 static int memory_oom_group_show(struct seq_file *m, void *v)
6440 {
6441         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
6442
6443         seq_printf(m, "%d\n", memcg->oom_group);
6444
6445         return 0;
6446 }
6447
6448 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
6449                                       char *buf, size_t nbytes, loff_t off)
6450 {
6451         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6452         int ret, oom_group;
6453
6454         buf = strstrip(buf);
6455         if (!buf)
6456                 return -EINVAL;
6457
6458         ret = kstrtoint(buf, 0, &oom_group);
6459         if (ret)
6460                 return ret;
6461
6462         if (oom_group != 0 && oom_group != 1)
6463                 return -EINVAL;
6464
6465         memcg->oom_group = oom_group;
6466
6467         return nbytes;
6468 }
6469
/*
 * Control files of the memory controller, installed as .dfl_cftypes of
 * memory_cgrp_subsys. Each entry binds a file name to the show/read/write
 * handlers defined above.
 */
static struct cftype memory_files[] = {
        {
                .name = "current",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = memory_current_read,
        },
        {
                .name = "min",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_min_show,
                .write = memory_min_write,
        },
        {
                .name = "low",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_low_show,
                .write = memory_low_write,
        },
        {
                .name = "high",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_high_show,
                .write = memory_high_write,
        },
        {
                .name = "max",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = memory_max_show,
                .write = memory_max_write,
        },
        {
                .name = "events",
                .flags = CFTYPE_NOT_ON_ROOT,
                /* NOTE(review): presumably used to notify pollers of this file - confirm */
                .file_offset = offsetof(struct mem_cgroup, events_file),
                .seq_show = memory_events_show,
        },
        {
                .name = "events.local",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct mem_cgroup, events_local_file),
                .seq_show = memory_events_local_show,
        },
        {
                /* read-only, and not flagged CFTYPE_NOT_ON_ROOT */
                .name = "stat",
                .seq_show = memory_stat_show,
        },
#ifdef CONFIG_NUMA
        {
                /* the handler only exists when CONFIG_NUMA is set */
                .name = "numa_stat",
                .seq_show = memory_numa_stat_show,
        },
#endif
        {
                .name = "oom.group",
                .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
                .seq_show = memory_oom_group_show,
                .write = memory_oom_group_write,
        },
        { }     /* terminate */
};
6530
/*
 * cgroup subsystem descriptor for the memory controller: css lifecycle
 * callbacks, the task-migration hooks used for charge moving, and the
 * control file tables for the default and legacy hierarchies.
 */
struct cgroup_subsys memory_cgrp_subsys = {
        .css_alloc = mem_cgroup_css_alloc,
        .css_online = mem_cgroup_css_online,
        .css_offline = mem_cgroup_css_offline,
        .css_released = mem_cgroup_css_released,
        .css_free = mem_cgroup_css_free,
        .css_reset = mem_cgroup_css_reset,
        .css_rstat_flush = mem_cgroup_css_rstat_flush,
        /* charge moving: precharge, roll back on failure, move after attach */
        .can_attach = mem_cgroup_can_attach,
        .cancel_attach = mem_cgroup_cancel_attach,
        .post_attach = mem_cgroup_move_task,
        .dfl_cftypes = memory_files,
        .legacy_cftypes = mem_cgroup_legacy_files,
        .early_init = 0,
};
6546
6547 /*
6548  * This function calculates an individual cgroup's effective
6549  * protection which is derived from its own memory.min/low, its
6550  * parent's and siblings' settings, as well as the actual memory
6551  * distribution in the tree.
6552  *
6553  * The following rules apply to the effective protection values:
6554  *
6555  * 1. At the first level of reclaim, effective protection is equal to
6556  *    the declared protection in memory.min and memory.low.
6557  *
6558  * 2. To enable safe delegation of the protection configuration, at
6559  *    subsequent levels the effective protection is capped to the
6560  *    parent's effective protection.
6561  *
6562  * 3. To make complex and dynamic subtrees easier to configure, the
6563  *    user is allowed to overcommit the declared protection at a given
6564  *    level. If that is the case, the parent's effective protection is
6565  *    distributed to the children in proportion to how much protection
6566  *    they have declared and how much of it they are utilizing.
6567  *
6568  *    This makes distribution proportional, but also work-conserving:
6569  *    if one cgroup claims much more protection than it uses memory,
6570  *    the unused remainder is available to its siblings.
6571  *
6572  * 4. Conversely, when the declared protection is undercommitted at a
6573  *    given level, the distribution of the larger parental protection
6574  *    budget is NOT proportional. A cgroup's protection from a sibling
6575  *    is capped to its own memory.min/low setting.
6576  *
6577  * 5. However, to allow protecting recursive subtrees from each other
6578  *    without having to declare each individual cgroup's fixed share
6579  *    of the ancestor's claim to protection, any unutilized -
6580  *    "floating" - protection from up the tree is distributed in
6581  *    proportion to each cgroup's *usage*. This makes the protection
6582  *    neutral wrt sibling cgroups and lets them compete freely over
6583  *    the shared parental protection budget, but it protects the
6584  *    subtree as a whole from neighboring subtrees.
6585  *
6586  * Note that 4. and 5. are not in conflict: 4. is about protecting
6587  * against immediate siblings whereas 5. is about protecting against
6588  * neighboring subtrees.
6589  */
6590 static unsigned long effective_protection(unsigned long usage,
6591                                           unsigned long parent_usage,
6592                                           unsigned long setting,
6593                                           unsigned long parent_effective,
6594                                           unsigned long siblings_protected)
6595 {
6596         unsigned long protected;
6597         unsigned long ep;
6598
6599         protected = min(usage, setting);
6600         /*
6601          * If all cgroups at this level combined claim and use more
6602          * protection then what the parent affords them, distribute
6603          * shares in proportion to utilization.
6604          *
6605          * We are using actual utilization rather than the statically
6606          * claimed protection in order to be work-conserving: claimed
6607          * but unused protection is available to siblings that would
6608          * otherwise get a smaller chunk than what they claimed.
6609          */
6610         if (siblings_protected > parent_effective)
6611                 return protected * parent_effective / siblings_protected;
6612
6613         /*
6614          * Ok, utilized protection of all children is within what the
6615          * parent affords them, so we know whatever this child claims
6616          * and utilizes is effectively protected.
6617          *
6618          * If there is unprotected usage beyond this value, reclaim
6619          * will apply pressure in proportion to that amount.
6620          *
6621          * If there is unutilized protection, the cgroup will be fully
6622          * shielded from reclaim, but we do return a smaller value for
6623          * protection than what the group could enjoy in theory. This
6624          * is okay. With the overcommit distribution above, effective
6625          * protection is always dependent on how memory is actually
6626          * consumed among the siblings anyway.
6627          */
6628         ep = protected;
6629
6630         /*
6631          * If the children aren't claiming (all of) the protection
6632          * afforded to them by the parent, distribute the remainder in
6633          * proportion to the (unprotected) memory of each cgroup. That
6634          * way, cgroups that aren't explicitly prioritized wrt each
6635          * other compete freely over the allowance, but they are
6636          * collectively protected from neighboring trees.
6637          *
6638          * We're using unprotected memory for the weight so that if
6639          * some cgroups DO claim explicit protection, we don't protect
6640          * the same bytes twice.
6641          *
6642          * Check both usage and parent_usage against the respective
6643          * protected values. One should imply the other, but they
6644          * aren't read atomically - make sure the division is sane.
6645          */
6646         if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
6647                 return ep;
6648         if (parent_effective > siblings_protected &&
6649             parent_usage > siblings_protected &&
6650             usage > protected) {
6651                 unsigned long unclaimed;
6652
6653                 unclaimed = parent_effective - siblings_protected;
6654                 unclaimed *= usage - protected;
6655                 unclaimed /= parent_usage - siblings_protected;
6656
6657                 ep += unclaimed;
6658         }
6659
6660         return ep;
6661 }
6662
6663 /**
6664  * mem_cgroup_calculate_protection - check if memory consumption is in the normal range
6665  * @root: the top ancestor of the sub-tree being checked
6666  * @memcg: the memory cgroup to check
6667  *
6668  * WARNING: This function is not stateless! It can only be used as part
6669  *          of a top-down tree iteration, not for isolated queries.
6670  */
6671 void mem_cgroup_calculate_protection(struct mem_cgroup *root,
6672                                      struct mem_cgroup *memcg)
6673 {
6674         unsigned long usage, parent_usage;
6675         struct mem_cgroup *parent;
6676
6677         if (mem_cgroup_disabled())
6678                 return;
6679
6680         if (!root)
6681                 root = root_mem_cgroup;
6682
6683         /*
6684          * Effective values of the reclaim targets are ignored so they
6685          * can be stale. Have a look at mem_cgroup_protection for more
6686          * details.
6687          * TODO: calculation should be more robust so that we do not need
6688          * that special casing.
6689          */
6690         if (memcg == root)
6691                 return;
6692
6693         usage = page_counter_read(&memcg->memory);
6694         if (!usage)
6695                 return;
6696
6697         parent = parent_mem_cgroup(memcg);
6698         /* No parent means a non-hierarchical mode on v1 memcg */
6699         if (!parent)
6700                 return;
6701
6702         if (parent == root) {
6703                 memcg->memory.emin = READ_ONCE(memcg->memory.min);
6704                 memcg->memory.elow = READ_ONCE(memcg->memory.low);
6705                 return;
6706         }
6707
6708         parent_usage = page_counter_read(&parent->memory);
6709
6710         WRITE_ONCE(memcg->memory.emin, effective_protection(usage, parent_usage,
6711                         READ_ONCE(memcg->memory.min),
6712                         READ_ONCE(parent->memory.emin),
6713                         atomic_long_read(&parent->memory.children_min_usage)));
6714
6715         WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
6716                         READ_ONCE(memcg->memory.low),
6717                         READ_ONCE(parent->memory.elow),
6718                         atomic_long_read(&parent->memory.children_low_usage)));
6719 }
6720
6721 static int charge_memcg(struct folio *folio, struct mem_cgroup *memcg,
6722                         gfp_t gfp)
6723 {
6724         long nr_pages = folio_nr_pages(folio);
6725         int ret;
6726
6727         ret = try_charge(memcg, gfp, nr_pages);
6728         if (ret)
6729                 goto out;
6730
6731         css_get(&memcg->css);
6732         commit_charge(folio, memcg);
6733
6734         local_irq_disable();
6735         mem_cgroup_charge_statistics(memcg, nr_pages);
6736         memcg_check_events(memcg, folio_nid(folio));
6737         local_irq_enable();
6738 out:
6739         return ret;
6740 }
6741
6742 int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp)
6743 {
6744         struct mem_cgroup *memcg;
6745         int ret;
6746
6747         memcg = get_mem_cgroup_from_mm(mm);
6748         ret = charge_memcg(folio, memcg, gfp);
6749         css_put(&memcg->css);
6750
6751         return ret;
6752 }
6753
6754 /**
6755  * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
6756  * @page: page to charge
6757  * @mm: mm context of the victim
6758  * @gfp: reclaim mode
6759  * @entry: swap entry for which the page is allocated
6760  *
6761  * This function charges a page allocated for swapin. Please call this before
6762  * adding the page to the swapcache.
6763  *
6764  * Returns 0 on success. Otherwise, an error code is returned.
6765  */
6766 int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
6767                                   gfp_t gfp, swp_entry_t entry)
6768 {
6769         struct folio *folio = page_folio(page);
6770         struct mem_cgroup *memcg;
6771         unsigned short id;
6772         int ret;
6773
6774         if (mem_cgroup_disabled())
6775                 return 0;
6776
6777         id = lookup_swap_cgroup_id(entry);
6778         rcu_read_lock();
6779         memcg = mem_cgroup_from_id(id);
6780         if (!memcg || !css_tryget_online(&memcg->css))
6781                 memcg = get_mem_cgroup_from_mm(mm);
6782         rcu_read_unlock();
6783
6784         ret = charge_memcg(folio, memcg, gfp);
6785
6786         css_put(&memcg->css);
6787         return ret;
6788 }
6789
6790 /*
6791  * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
6792  * @entry: swap entry for which the page is charged
6793  *
6794  * Call this function after successfully adding the charged page to swapcache.
6795  *
6796  * Note: This function assumes the page for which swap slot is being uncharged
6797  * is order 0 page.
6798  */
6799 void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
6800 {
6801         /*
6802          * Cgroup1's unified memory+swap counter has been charged with the
6803          * new swapcache page, finish the transfer by uncharging the swap
6804          * slot. The swap slot would also get uncharged when it dies, but
6805          * it can stick around indefinitely and we'd count the page twice
6806          * the entire time.
6807          *
6808          * Cgroup2 has separate resource counters for memory and swap,
6809          * so this is a non-issue here. Memory and swap charge lifetimes
6810          * correspond 1:1 to page and swap slot lifetimes: we charge the
6811          * page to memory here, and uncharge swap when the slot is freed.
6812          */
6813         if (!mem_cgroup_disabled() && do_memsw_account()) {
6814                 /*
6815                  * The swap entry might not get freed for a long time,
6816                  * let's not wait for it.  The page already received a
6817                  * memory+swap charge, drop the swap entry duplicate.
6818                  */
6819                 mem_cgroup_uncharge_swap(entry, 1);
6820         }
6821 }
6822
/*
 * Accumulator for a run of folios that are uncharged from the same memcg.
 * Filled in by uncharge_folio() and applied by uncharge_batch().
 */
struct uncharge_gather {
        /* memcg the batch belongs to; NULL while the batch is empty */
        struct mem_cgroup *memcg;
        /* pages to uncharge from the memory (and memsw) counters */
        unsigned long nr_memory;
        /* number of PGPGOUT events to count */
        unsigned long pgpgout;
        /* pages that were kmem folios, handed to memcg_account_kmem() */
        unsigned long nr_kmem;
        /* node id of the folio that started the batch */
        int nid;
};
6830
/* Reset @ug to an empty batch: every field, including ->memcg, becomes zero. */
static inline void uncharge_gather_clear(struct uncharge_gather *ug)
{
        memset(ug, 0, sizeof(*ug));
}
6835
6836 static void uncharge_batch(const struct uncharge_gather *ug)
6837 {
6838         unsigned long flags;
6839
6840         if (ug->nr_memory) {
6841                 page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
6842                 if (do_memsw_account())
6843                         page_counter_uncharge(&ug->memcg->memsw, ug->nr_memory);
6844                 if (ug->nr_kmem)
6845                         memcg_account_kmem(ug->memcg, -ug->nr_kmem);
6846                 memcg_oom_recover(ug->memcg);
6847         }
6848
6849         local_irq_save(flags);
6850         __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6851         __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
6852         memcg_check_events(ug->memcg, ug->nid);
6853         local_irq_restore(flags);
6854
6855         /* drop reference from uncharge_folio */
6856         css_put(&ug->memcg->css);
6857 }
6858
/*
 * Add @folio to the uncharge batch in @ug and detach it from its memcg.
 *
 * If @folio belongs to a different memcg than the current batch, the
 * current batch is flushed with uncharge_batch() and a new one is started
 * for the folio's memcg. The folio's memcg_data is cleared. Folios with no
 * memcg are ignored.
 */
static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
{
        long nr_pages;
        struct mem_cgroup *memcg;
        /* Only assigned, and only used, on the kmem paths below. */
        struct obj_cgroup *objcg;

        VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);

        /*
         * Nobody should be changing or seriously looking at
         * folio memcg or objcg at this point, we have fully
         * exclusive access to the folio.
         */
        if (folio_memcg_kmem(folio)) {
                objcg = __folio_objcg(folio);
                /*
                 * This get matches the put at the end of the function and
                 * kmem pages do not hold memcg references anymore.
                 */
                memcg = get_mem_cgroup_from_objcg(objcg);
        } else {
                memcg = __folio_memcg(folio);
        }

        if (!memcg)
                return;

        /* Start a new batch whenever the owning memcg changes. */
        if (ug->memcg != memcg) {
                if (ug->memcg) {
                        uncharge_batch(ug);
                        uncharge_gather_clear(ug);
                }
                ug->memcg = memcg;
                ug->nid = folio_nid(folio);

                /* pairs with css_put in uncharge_batch */
                css_get(&memcg->css);
        }

        nr_pages = folio_nr_pages(folio);

        if (folio_memcg_kmem(folio)) {
                /* kmem folios count towards both the memory and kmem totals */
                ug->nr_memory += nr_pages;
                ug->nr_kmem += nr_pages;

                folio->memcg_data = 0;
                obj_cgroup_put(objcg);
        } else {
                /* LRU pages aren't accounted at the root level */
                if (!mem_cgroup_is_root(memcg))
                        ug->nr_memory += nr_pages;
                ug->pgpgout++;

                folio->memcg_data = 0;
        }

        /*
         * kmem: pairs with get_mem_cgroup_from_objcg() above.
         * non-kmem: presumably drops the reference the folio held since it
         * was charged - confirm against commit_charge() callers.
         */
        css_put(&memcg->css);
}
6917
6918 void __mem_cgroup_uncharge(struct folio *folio)
6919 {
6920         struct uncharge_gather ug;
6921
6922         /* Don't touch folio->lru of any random page, pre-check: */
6923         if (!folio_memcg(folio))
6924                 return;
6925
6926         uncharge_gather_clear(&ug);
6927         uncharge_folio(folio, &ug);
6928         uncharge_batch(&ug);
6929 }
6930
6931 /**
6932  * __mem_cgroup_uncharge_list - uncharge a list of page
6933  * @page_list: list of pages to uncharge
6934  *
6935  * Uncharge a list of pages previously charged with
6936  * __mem_cgroup_charge().
6937  */
6938 void __mem_cgroup_uncharge_list(struct list_head *page_list)
6939 {
6940         struct uncharge_gather ug;
6941         struct folio *folio;
6942
6943         uncharge_gather_clear(&ug);
6944         list_for_each_entry(folio, page_list, lru)
6945                 uncharge_folio(folio, &ug);
6946         if (ug.memcg)
6947                 uncharge_batch(&ug);
6948 }
6949
/**
 * mem_cgroup_migrate - Charge a folio's replacement.
 * @old: Currently circulating folio.
 * @new: Replacement folio.
 *
 * Charge @new as a replacement folio for @old. @old will
 * be uncharged upon free.
 *
 * Nothing is done when the memory controller is disabled, when @new
 * already has a memcg, or when @old has none (the latter also warns once).
 *
 * Both folios must be locked, @new->mapping must be set up.
 */
void mem_cgroup_migrate(struct folio *old, struct folio *new)
{
        struct mem_cgroup *memcg;
        long nr_pages = folio_nr_pages(new);
        unsigned long flags;

        /* Both folios locked, same anon-ness, same size. */
        VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
        VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
        VM_BUG_ON_FOLIO(folio_test_anon(old) != folio_test_anon(new), new);
        VM_BUG_ON_FOLIO(folio_nr_pages(old) != nr_pages, new);

        if (mem_cgroup_disabled())
                return;

        /* Page cache replacement: new folio already charged? */
        if (folio_memcg(new))
                return;

        memcg = folio_memcg(old);
        VM_WARN_ON_ONCE_FOLIO(!memcg, old);
        if (!memcg)
                return;

        /* Force-charge the new page. The old one will be freed soon */
        if (!mem_cgroup_is_root(memcg)) {
                page_counter_charge(&memcg->memory, nr_pages);
                if (do_memsw_account())
                        page_counter_charge(&memcg->memsw, nr_pages);
        }

        /* Take a css reference, then bind @new to @memcg. */
        css_get(&memcg->css);
        commit_charge(new, memcg);

        /* Statistics and event checks run with interrupts disabled. */
        local_irq_save(flags);
        mem_cgroup_charge_statistics(memcg, nr_pages);
        memcg_check_events(memcg, folio_nid(new));
        local_irq_restore(flags);
}
6998
/*
 * Static key, initially false, exported for users outside this file.
 * NOTE(review): presumably the key behind mem_cgroup_sockets_enabled -
 * confirm against the header that defines that macro.
 */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
7001
7002 void mem_cgroup_sk_alloc(struct sock *sk)
7003 {
7004         struct mem_cgroup *memcg;
7005
7006         if (!mem_cgroup_sockets_enabled)
7007                 return;
7008
7009         /* Do not associate the sock with unrelated interrupted task's memcg. */
7010         if (!in_task())
7011                 return;
7012
7013         rcu_read_lock();
7014         memcg = mem_cgroup_from_task(current);
7015         if (memcg == root_mem_cgroup)
7016                 goto out;
7017         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
7018                 goto out;
7019         if (css_tryget(&memcg->css))
7020                 sk->sk_memcg = memcg;
7021 out:
7022         rcu_read_unlock();
7023 }
7024
7025 void mem_cgroup_sk_free(struct sock *sk)
7026 {
7027         if (sk->sk_memcg)
7028                 css_put(&sk->sk_memcg->css);
7029 }
7030
7031 /**
7032  * mem_cgroup_charge_skmem - charge socket memory
7033  * @memcg: memcg to charge
7034  * @nr_pages: number of pages to charge
7035  * @gfp_mask: reclaim mode
7036  *
7037  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
7038  * @memcg's configured limit, %false if it doesn't.
7039  */
7040 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
7041                              gfp_t gfp_mask)
7042 {
7043         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7044                 struct page_counter *fail;
7045
7046                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
7047                         memcg->tcpmem_pressure = 0;
7048                         return true;
7049                 }
7050                 memcg->tcpmem_pressure = 1;
7051                 if (gfp_mask & __GFP_NOFAIL) {
7052                         page_counter_charge(&memcg->tcpmem, nr_pages);
7053                         return true;
7054                 }
7055                 return false;
7056         }
7057
7058         if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
7059                 mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
7060                 return true;
7061         }
7062
7063         return false;
7064 }
7065
7066 /**
7067  * mem_cgroup_uncharge_skmem - uncharge socket memory
7068  * @memcg: memcg to uncharge
7069  * @nr_pages: number of pages to uncharge
7070  */
7071 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
7072 {
7073         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
7074                 page_counter_uncharge(&memcg->tcpmem, nr_pages);
7075                 return;
7076         }
7077
7078         mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages);
7079
7080         refill_stock(memcg, nr_pages);
7081 }
7082
7083 static int __init cgroup_memory(char *s)
7084 {
7085         char *token;
7086
7087         while ((token = strsep(&s, ",")) != NULL) {
7088                 if (!*token)
7089                         continue;
7090                 if (!strcmp(token, "nosocket"))
7091                         cgroup_memory_nosocket = true;
7092                 if (!strcmp(token, "nokmem"))
7093                         cgroup_memory_nokmem = true;
7094         }
7095         return 1;
7096 }
7097 __setup("cgroup.memory=", cgroup_memory);
7098
7099 /*
7100  * subsys_initcall() for memory controller.
7101  *
7102  * Some parts like memcg_hotplug_cpu_dead() have to be initialized from this
7103  * context because of lock dependencies (cgroup_lock -> cpu hotplug) but
7104  * basically everything that doesn't depend on a specific mem_cgroup structure
7105  * should be initialized from here.
7106  */
7107 static int __init mem_cgroup_init(void)
7108 {
7109         int cpu, node;
7110
7111         /*
7112          * Currently s32 type (can refer to struct batched_lruvec_stat) is
7113          * used for per-memcg-per-cpu caching of per-node statistics. In order
7114          * to work fine, we should make sure that the overfill threshold can't
7115          * exceed S32_MAX / PAGE_SIZE.
7116          */
7117         BUILD_BUG_ON(MEMCG_CHARGE_BATCH > S32_MAX / PAGE_SIZE);
7118
7119         cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
7120                                   memcg_hotplug_cpu_dead);
7121
7122         for_each_possible_cpu(cpu)
7123                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
7124                           drain_local_stock);
7125
7126         for_each_node(node) {
7127                 struct mem_cgroup_tree_per_node *rtpn;
7128
7129                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
7130                                     node_online(node) ? node : NUMA_NO_NODE);
7131
7132                 rtpn->rb_root = RB_ROOT;
7133                 rtpn->rb_rightmost = NULL;
7134                 spin_lock_init(&rtpn->lock);
7135                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
7136         }
7137
7138         return 0;
7139 }
7140 subsys_initcall(mem_cgroup_init);
7141
7142 #ifdef CONFIG_MEMCG_SWAP
7143 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
7144 {
7145         while (!refcount_inc_not_zero(&memcg->id.ref)) {
7146                 /*
7147                  * The root cgroup cannot be destroyed, so it's refcount must
7148                  * always be >= 1.
7149                  */
7150                 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
7151                         VM_BUG_ON(1);
7152                         break;
7153                 }
7154                 memcg = parent_mem_cgroup(memcg);
7155                 if (!memcg)
7156                         memcg = root_mem_cgroup;
7157         }
7158         return memcg;
7159 }
7160
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 *
 * Does nothing when the memory controller is disabled, when the memory
 * controller is on the default hierarchy, or when @page has no memcg.
 * Otherwise the memcg id is recorded for @entry, @page is detached from
 * its memcg and the memory counter is uncharged.
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
        struct mem_cgroup *memcg, *swap_memcg;
        unsigned int nr_entries;
        unsigned short oldid;

        /* The page must be off the LRU and have a zero reference count. */
        VM_BUG_ON_PAGE(PageLRU(page), page);
        VM_BUG_ON_PAGE(page_count(page), page);

        if (mem_cgroup_disabled())
                return;

        if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
                return;

        memcg = page_memcg(page);

        VM_WARN_ON_ONCE_PAGE(!memcg, page);
        if (!memcg)
                return;

        /*
         * In case the memcg owning these pages has been offlined and doesn't
         * have an ID allocated to it anymore, charge the closest online
         * ancestor for the swap instead and transfer the memory+swap charge.
         */
        swap_memcg = mem_cgroup_id_get_online(memcg);
        nr_entries = thp_nr_pages(page);
        /* Get references for the tail pages, too */
        if (nr_entries > 1)
                mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
        /* The swap entries must not already carry a memcg id. */
        oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg),
                                   nr_entries);
        VM_BUG_ON_PAGE(oldid, page);
        mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);

        page->memcg_data = 0;

        if (!mem_cgroup_is_root(memcg))
                page_counter_uncharge(&memcg->memory, nr_entries);

        /*
         * When the swap is accounted to an ancestor, move the memsw charge
         * from the original memcg over to that ancestor.
         */
        if (!cgroup_memory_noswap && memcg != swap_memcg) {
                if (!mem_cgroup_is_root(swap_memcg))
                        page_counter_charge(&swap_memcg->memsw, nr_entries);
                page_counter_uncharge(&memcg->memsw, nr_entries);
        }

        /*
         * Interrupts should be disabled here because the caller holds the
         * i_pages lock which is taken with interrupts-off. It is
         * important here to have the interrupts disabled because it is the
         * only synchronisation we have for updating the per-CPU variables.
         */
        memcg_stats_lock();
        mem_cgroup_charge_statistics(memcg, -nr_entries);
        memcg_stats_unlock();
        memcg_check_events(memcg, page_to_nid(page));

        /* Presumably the reference the page held while charged - confirm. */
        css_put(&memcg->css);
}
7228
7229 /**
7230  * __mem_cgroup_try_charge_swap - try charging swap space for a page
7231  * @page: page being added to swap
7232  * @entry: swap entry to charge
7233  *
7234  * Try to charge @page's memcg for the swap space at @entry.
7235  *
7236  * Returns 0 on success, -ENOMEM on failure.
7237  */
7238 int __mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
7239 {
7240         unsigned int nr_pages = thp_nr_pages(page);
7241         struct page_counter *counter;
7242         struct mem_cgroup *memcg;
7243         unsigned short oldid;
7244
7245         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
7246                 return 0;
7247
7248         memcg = page_memcg(page);
7249
7250         VM_WARN_ON_ONCE_PAGE(!memcg, page);
7251         if (!memcg)
7252                 return 0;
7253
7254         if (!entry.val) {
7255                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7256                 return 0;
7257         }
7258
7259         memcg = mem_cgroup_id_get_online(memcg);
7260
7261         if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
7262             !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
7263                 memcg_memory_event(memcg, MEMCG_SWAP_MAX);
7264                 memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
7265                 mem_cgroup_id_put(memcg);
7266                 return -ENOMEM;
7267         }
7268
7269         /* Get references for the tail pages, too */
7270         if (nr_pages > 1)
7271                 mem_cgroup_id_get_many(memcg, nr_pages - 1);
7272         oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages);
7273         VM_BUG_ON_PAGE(oldid, page);
7274         mod_memcg_state(memcg, MEMCG_SWAP, nr_pages);
7275
7276         return 0;
7277 }
7278
7279 /**
7280  * __mem_cgroup_uncharge_swap - uncharge swap space
7281  * @entry: swap entry to uncharge
7282  * @nr_pages: the amount of swap space to uncharge
7283  */
7284 void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
7285 {
7286         struct mem_cgroup *memcg;
7287         unsigned short id;
7288
7289         id = swap_cgroup_record(entry, 0, nr_pages);
7290         rcu_read_lock();
7291         memcg = mem_cgroup_from_id(id);
7292         if (memcg) {
7293                 if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
7294                         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
7295                                 page_counter_uncharge(&memcg->swap, nr_pages);
7296                         else
7297                                 page_counter_uncharge(&memcg->memsw, nr_pages);
7298                 }
7299                 mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages);
7300                 mem_cgroup_id_put_many(memcg, nr_pages);
7301         }
7302         rcu_read_unlock();
7303 }
7304
7305 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
7306 {
7307         long nr_swap_pages = get_nr_swap_pages();
7308
7309         if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7310                 return nr_swap_pages;
7311         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
7312                 nr_swap_pages = min_t(long, nr_swap_pages,
7313                                       READ_ONCE(memcg->swap.max) -
7314                                       page_counter_read(&memcg->swap));
7315         return nr_swap_pages;
7316 }
7317
7318 bool mem_cgroup_swap_full(struct page *page)
7319 {
7320         struct mem_cgroup *memcg;
7321
7322         VM_BUG_ON_PAGE(!PageLocked(page), page);
7323
7324         if (vm_swap_full())
7325                 return true;
7326         if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
7327                 return false;
7328
7329         memcg = page_memcg(page);
7330         if (!memcg)
7331                 return false;
7332
7333         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
7334                 unsigned long usage = page_counter_read(&memcg->swap);
7335
7336                 if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
7337                     usage * 2 >= READ_ONCE(memcg->swap.max))
7338                         return true;
7339         }
7340
7341         return false;
7342 }
7343
7344 static int __init setup_swap_account(char *s)
7345 {
7346         if (!strcmp(s, "1"))
7347                 cgroup_memory_noswap = false;
7348         else if (!strcmp(s, "0"))
7349                 cgroup_memory_noswap = true;
7350         return 1;
7351 }
7352 __setup("swapaccount=", setup_swap_account);
7353
7354 static u64 swap_current_read(struct cgroup_subsys_state *css,
7355                              struct cftype *cft)
7356 {
7357         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
7358
7359         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
7360 }
7361
7362 static int swap_high_show(struct seq_file *m, void *v)
7363 {
7364         return seq_puts_memcg_tunable(m,
7365                 READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
7366 }
7367
7368 static ssize_t swap_high_write(struct kernfs_open_file *of,
7369                                char *buf, size_t nbytes, loff_t off)
7370 {
7371         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7372         unsigned long high;
7373         int err;
7374
7375         buf = strstrip(buf);
7376         err = page_counter_memparse(buf, "max", &high);
7377         if (err)
7378                 return err;
7379
7380         page_counter_set_high(&memcg->swap, high);
7381
7382         return nbytes;
7383 }
7384
7385 static int swap_max_show(struct seq_file *m, void *v)
7386 {
7387         return seq_puts_memcg_tunable(m,
7388                 READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
7389 }
7390
7391 static ssize_t swap_max_write(struct kernfs_open_file *of,
7392                               char *buf, size_t nbytes, loff_t off)
7393 {
7394         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
7395         unsigned long max;
7396         int err;
7397
7398         buf = strstrip(buf);
7399         err = page_counter_memparse(buf, "max", &max);
7400         if (err)
7401                 return err;
7402
7403         xchg(&memcg->swap.max, max);
7404
7405         return nbytes;
7406 }
7407
7408 static int swap_events_show(struct seq_file *m, void *v)
7409 {
7410         struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
7411
7412         seq_printf(m, "high %lu\n",
7413                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
7414         seq_printf(m, "max %lu\n",
7415                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
7416         seq_printf(m, "fail %lu\n",
7417                    atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL]));
7418
7419         return 0;
7420 }
7421
/*
 * Swap control files for the default hierarchy; registered by
 * mem_cgroup_swap_init() through cgroup_add_dfl_cftypes(). Every entry is
 * flagged CFTYPE_NOT_ON_ROOT.
 */
static struct cftype swap_files[] = {
        {
                /* read-only: current swap usage in bytes */
                .name = "swap.current",
                .flags = CFTYPE_NOT_ON_ROOT,
                .read_u64 = swap_current_read,
        },
        {
                /* read/write: swap.high of the swap counter */
                .name = "swap.high",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = swap_high_show,
                .write = swap_high_write,
        },
        {
                /* read/write: swap.max of the swap counter */
                .name = "swap.max",
                .flags = CFTYPE_NOT_ON_ROOT,
                .seq_show = swap_max_show,
                .write = swap_max_write,
        },
        {
                /* read-only: high/max/fail event counters */
                .name = "swap.events",
                .flags = CFTYPE_NOT_ON_ROOT,
                .file_offset = offsetof(struct mem_cgroup, swap_events_file),
                .seq_show = swap_events_show,
        },
        { }     /* terminate */
};
7448
/*
 * memory+swap control files for the legacy hierarchy; registered by
 * mem_cgroup_swap_init() through cgroup_add_legacy_cftypes(). Each entry
 * selects its attribute of the _MEMSWAP counter via MEMFILE_PRIVATE().
 */
static struct cftype memsw_files[] = {
        {
                /* read-only: RES_USAGE */
                .name = "memsw.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                /* RES_MAX_USAGE; writes go to mem_cgroup_reset() */
                .name = "memsw.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                /* RES_LIMIT; writes go to mem_cgroup_write() */
                .name = "memsw.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                /* RES_FAILCNT; writes go to mem_cgroup_reset() */
                .name = "memsw.failcnt",
                .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        { },    /* terminate */
};
7475
7476 /*
7477  * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
7478  * instead of a core_initcall(), this could mean cgroup_memory_noswap still
7479  * remains set to false even when memcg is disabled via "cgroup_disable=memory"
7480  * boot parameter. This may result in premature OOPS inside
7481  * mem_cgroup_get_nr_swap_pages() function in corner cases.
7482  */
7483 static int __init mem_cgroup_swap_init(void)
7484 {
7485         /* No memory control -> no swap control */
7486         if (mem_cgroup_disabled())
7487                 cgroup_memory_noswap = true;
7488
7489         if (cgroup_memory_noswap)
7490                 return 0;
7491
7492         WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
7493         WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
7494
7495         return 0;
7496 }
7497 core_initcall(mem_cgroup_swap_init);
7498
7499 #endif /* CONFIG_MEMCG_SWAP */